public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug tree-optimization/57169] New: fully unrolled matrix multiplication not vectorized
@ 2013-05-04 8:55 vincenzo.innocente at cern dot ch
2013-05-06 11:33 ` [Bug tree-optimization/57169] " rguenth at gcc dot gnu.org
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: vincenzo.innocente at cern dot ch @ 2013-05-04 8:55 UTC (permalink / raw)
To: gcc-bugs
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57169
Bug #: 57169
Summary: fully unrolled matrix multiplication not vectorized
Classification: Unclassified
Product: gcc
Version: 4.9.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
AssignedTo: unassigned@gcc.gnu.org
ReportedBy: vincenzo.innocente@cern.ch
a lot of legacy code still fully unrolls linear algebra for small dimensions.
As shown below, gcc fails to vectorize an unrolled 4x4 matrix multiplication,
while it vectorizes the corresponding loop expression well.
sample code
alignas(32) float a[4][4];
alignas(32) float b[4][4];
alignas(32) float c[4][4];
void matmul() {
for (int i=0;i!=4;++i)
for (int j=0;j!=4;++j) {
float sum=0;
for (int k=0;k!=4;++k)
sum += a[i][k]*b[k][j];
c[i][j]=sum;
}
}
alignas(32) float src1[4][4];
alignas(32) float src2[4][4];
alignas(32) float dest[4][4];
void matmulU(){
dest[0][0] = src1[0][0] * src2[0][0] + src1[0][1] * src2[1][0] + src1[0][2] *
src2[2][0] + src1[0][3] * src2[3][0];
dest[0][1] = src1[0][0] * src2[0][1] + src1[0][1] * src2[1][1] + src1[0][2] *
src2[2][1] + src1[0][3] * src2[3][1];
dest[0][2] = src1[0][0] * src2[0][2] + src1[0][1] * src2[1][2] + src1[0][2] *
src2[2][2] + src1[0][3] * src2[3][2];
dest[0][3] = src1[0][0] * src2[0][3] + src1[0][1] * src2[1][3] + src1[0][2] *
src2[2][3] + src1[0][3] * src2[3][3];
dest[1][0] = src1[1][0] * src2[0][0] + src1[1][1] * src2[1][0] + src1[1][2] *
src2[2][0] + src1[1][3] * src2[3][0];
dest[1][1] = src1[1][0] * src2[0][1] + src1[1][1] * src2[1][1] + src1[1][2] *
src2[2][1] + src1[1][3] * src2[3][1];
dest[1][2] = src1[1][0] * src2[0][2] + src1[1][1] * src2[1][2] + src1[1][2] *
src2[2][2] + src1[1][3] * src2[3][2];
dest[1][3] = src1[1][0] * src2[0][3] + src1[1][1] * src2[1][3] + src1[1][2] *
src2[2][3] + src1[1][3] * src2[3][3];
dest[2][0] = src1[2][0] * src2[0][0] + src1[2][1] * src2[1][0] + src1[2][2] *
src2[2][0] + src1[2][3] * src2[3][0];
dest[2][1] = src1[2][0] * src2[0][1] + src1[2][1] * src2[1][1] + src1[2][2] *
src2[2][1] + src1[2][3] * src2[3][1];
dest[2][2] = src1[2][0] * src2[0][2] + src1[2][1] * src2[1][2] + src1[2][2] *
src2[2][2] + src1[2][3] * src2[3][2];
dest[2][3] = src1[2][0] * src2[0][3] + src1[2][1] * src2[1][3] + src1[2][2] *
src2[2][3] + src1[2][3] * src2[3][3];
dest[3][0] = src1[3][0] * src2[0][0] + src1[3][1] * src2[1][0] + src1[3][2] *
src2[2][0] + src1[3][3] * src2[3][0];
dest[3][1] = src1[3][0] * src2[0][1] + src1[3][1] * src2[1][1] + src1[3][2] *
src2[2][1] + src1[3][3] * src2[3][1];
dest[3][2] = src1[3][0] * src2[0][2] + src1[3][1] * src2[1][2] + src1[3][2] *
src2[2][2] + src1[3][3] * src2[3][2];
dest[3][3] = src1[3][0] * src2[0][3] + src1[3][1] * src2[1][3] + src1[3][2] *
src2[2][3] + src1[3][3] * src2[3][3];
};
generated asm
c++ -v
Using built-in specs.
COLLECT_GCC=c++
COLLECT_LTO_WRAPPER=/usr/local/libexec/gcc/x86_64-apple-darwin12.3.0/4.9.0/lto-wrapper
Target: x86_64-apple-darwin12.3.0
Configured with: ./configure --disable-multilib --disable-bootstrap
--enable-lto -disable-libitm --enable-languages=c,c++,fortran,lto --no-create
--no-recursion
Thread model: posix
gcc version 4.9.0 20130428 (experimental) [trunk revision 198366] (GCC)
Vincenzos-MacBook-Pro:vectorize innocent$ c++ -O3 -march=corei7-avx -std=c++11
-S matmul.cc -mavx2 -mfma
Vincenzos-MacBook-Pro:vectorize innocent$ cat matmul.s
.text
.align 4,0x90
.globl __Z6matmulv
__Z6matmulv:
LFB0:
vmovss 8+_b(%rip), %xmm7
vmovss 24+_b(%rip), %xmm1
vinsertps $0x10, 12+_b(%rip), %xmm7, %xmm0
vmovss _b(%rip), %xmm7
vmovss 16+_b(%rip), %xmm2
vinsertps $0x10, 4+_b(%rip), %xmm7, %xmm8
vmovss 40+_b(%rip), %xmm3
vmovlhps %xmm0, %xmm8, %xmm8
vmovss 32+_b(%rip), %xmm4
vinsertf128 $1, %xmm8, %ymm8, %ymm8
vinsertps $0x10, 28+_b(%rip), %xmm1, %xmm0
vmovss 56+_b(%rip), %xmm7
vinsertps $0x10, 20+_b(%rip), %xmm2, %xmm6
vmovlhps %xmm0, %xmm6, %xmm6
vmovss 48+_b(%rip), %xmm1
vinsertf128 $1, %xmm6, %ymm6, %ymm6
vinsertps $0x10, 44+_b(%rip), %xmm3, %xmm0
vinsertps $0x10, 36+_b(%rip), %xmm4, %xmm5
vmovlhps %xmm0, %xmm5, %xmm5
vinsertps $0x10, 60+_b(%rip), %xmm7, %xmm0
vinsertps $0x10, 52+_b(%rip), %xmm1, %xmm4
vmovlhps %xmm0, %xmm4, %xmm4
vxorps %xmm7, %xmm7, %xmm7
vmovaps _a(%rip), %ymm0
vinsertf128 $1, %xmm5, %ymm5, %ymm5
vinsertf128 $1, %xmm4, %ymm4, %ymm4
vpermilps $255, %ymm0, %ymm1
vpermilps $170, %ymm0, %ymm2
vpermilps $85, %ymm0, %ymm3
vpermilps $0, %ymm0, %ymm0
vfmadd132ps %ymm8, %ymm7, %ymm0
vfmadd132ps %ymm6, %ymm0, %ymm3
vmovaps 32+_a(%rip), %ymm0
vfmadd132ps %ymm5, %ymm3, %ymm2
vfmadd132ps %ymm4, %ymm2, %ymm1
vmovaps %ymm1, _c(%rip)
vpermilps $170, %ymm0, %ymm2
vpermilps $255, %ymm0, %ymm1
vpermilps $85, %ymm0, %ymm3
vpermilps $0, %ymm0, %ymm0
vfmadd132ps %ymm8, %ymm7, %ymm0
vfmadd132ps %ymm6, %ymm0, %ymm3
vfmadd132ps %ymm5, %ymm3, %ymm2
vfmadd132ps %ymm4, %ymm2, %ymm1
vmovaps %ymm1, 32+_c(%rip)
vzeroupper
ret
LFE0:
.align 4,0x90
.globl __Z7matmulUv
__Z7matmulUv:
LFB1:
vmovss 4+_src1(%rip), %xmm5
vmovss 16+_src2(%rip), %xmm15
vmovss _src1(%rip), %xmm4
vmulss %xmm15, %xmm5, %xmm1
vmovss 8+_src1(%rip), %xmm2
vmovss 12+_src1(%rip), %xmm0
vmovss _src2(%rip), %xmm14
vmovss 32+_src2(%rip), %xmm13
vmovss 48+_src2(%rip), %xmm12
vfmadd231ss %xmm14, %xmm4, %xmm1
vmovss 20+_src2(%rip), %xmm11
vfmadd231ss %xmm13, %xmm2, %xmm1
vfmadd231ss %xmm12, %xmm0, %xmm1
vmovss %xmm1, _dest(%rip)
vmovss 4+_src2(%rip), %xmm10
vmulss %xmm11, %xmm5, %xmm1
vmovss 36+_src2(%rip), %xmm9
vmovss 52+_src2(%rip), %xmm8
vmovss 24+_src2(%rip), %xmm7
vmovss 28+_src2(%rip), %xmm6
vfmadd231ss %xmm10, %xmm4, %xmm1
vfmadd231ss %xmm9, %xmm2, %xmm1
vfmadd231ss %xmm8, %xmm0, %xmm1
vmovss %xmm1, 4+_dest(%rip)
vmulss %xmm7, %xmm5, %xmm1
vmovss 44+_src2(%rip), %xmm3
vmulss %xmm6, %xmm5, %xmm5
vfmadd231ss 8+_src2(%rip), %xmm4, %xmm1
vfmadd231ss 40+_src2(%rip), %xmm2, %xmm1
vfmadd231ss 56+_src2(%rip), %xmm0, %xmm1
vfmadd231ss 12+_src2(%rip), %xmm4, %xmm5
vfmadd231ss %xmm3, %xmm2, %xmm5
vfmadd231ss 60+_src2(%rip), %xmm0, %xmm5
vmovss %xmm5, 12+_dest(%rip)
vmovss 20+_src1(%rip), %xmm5
vmovss %xmm1, 8+_dest(%rip)
vmovss 16+_src1(%rip), %xmm4
vmulss %xmm5, %xmm15, %xmm1
vmovss 24+_src1(%rip), %xmm2
vmovss 28+_src1(%rip), %xmm0
vfmadd231ss %xmm4, %xmm14, %xmm1
vfmadd231ss %xmm2, %xmm13, %xmm1
vfmadd231ss %xmm0, %xmm12, %xmm1
vmovss %xmm1, 16+_dest(%rip)
vmulss %xmm5, %xmm11, %xmm1
vfmadd231ss %xmm4, %xmm10, %xmm1
vfmadd231ss %xmm2, %xmm9, %xmm1
vfmadd231ss %xmm0, %xmm8, %xmm1
vmovss %xmm1, 20+_dest(%rip)
vmulss %xmm5, %xmm7, %xmm1
vmulss %xmm5, %xmm6, %xmm5
vfmadd231ss 8+_src2(%rip), %xmm4, %xmm1
vfmadd231ss 40+_src2(%rip), %xmm2, %xmm1
vfmadd231ss 56+_src2(%rip), %xmm0, %xmm1
vmovss %xmm1, 24+_dest(%rip)
vfmadd231ss 12+_src2(%rip), %xmm4, %xmm5
vfmadd231ss %xmm2, %xmm3, %xmm5
vfmadd231ss 60+_src2(%rip), %xmm0, %xmm5
vmovss %xmm5, 28+_dest(%rip)
vmovss 36+_src1(%rip), %xmm5
vmovss 32+_src1(%rip), %xmm4
vmulss %xmm5, %xmm15, %xmm1
vmovss 40+_src1(%rip), %xmm2
vmovss 44+_src1(%rip), %xmm0
vfmadd231ss %xmm4, %xmm14, %xmm1
vfmadd231ss %xmm2, %xmm13, %xmm1
vfmadd231ss %xmm0, %xmm12, %xmm1
vmovss %xmm1, 32+_dest(%rip)
vmulss %xmm5, %xmm11, %xmm1
vfmadd231ss %xmm4, %xmm10, %xmm1
vfmadd231ss %xmm2, %xmm9, %xmm1
vfmadd231ss %xmm0, %xmm8, %xmm1
vmovss %xmm1, 36+_dest(%rip)
vmulss %xmm5, %xmm7, %xmm1
vmulss %xmm5, %xmm6, %xmm5
vfmadd231ss 8+_src2(%rip), %xmm4, %xmm1
vfmadd231ss 40+_src2(%rip), %xmm2, %xmm1
vfmadd231ss 56+_src2(%rip), %xmm0, %xmm1
vfmadd231ss 12+_src2(%rip), %xmm4, %xmm5
vfmadd231ss %xmm2, %xmm3, %xmm5
vfmadd231ss 60+_src2(%rip), %xmm0, %xmm5
vmovss %xmm5, 44+_dest(%rip)
vmovss 52+_src1(%rip), %xmm5
vmovss 48+_src1(%rip), %xmm4
vmovss %xmm1, 40+_dest(%rip)
vmulss %xmm5, %xmm15, %xmm15
vmovss 56+_src1(%rip), %xmm2
vmulss %xmm5, %xmm11, %xmm11
vmovss 60+_src1(%rip), %xmm0
vmulss %xmm5, %xmm7, %xmm7
vmulss %xmm5, %xmm6, %xmm5
vfmadd231ss %xmm4, %xmm14, %xmm15
vfmadd231ss %xmm2, %xmm13, %xmm15
vfmadd231ss %xmm0, %xmm12, %xmm15
vfmadd132ss %xmm4, %xmm11, %xmm10
vmovss %xmm15, 48+_dest(%rip)
vfmadd132ss %xmm2, %xmm10, %xmm9
vfmadd231ss 8+_src2(%rip), %xmm4, %xmm7
vfmadd231ss %xmm0, %xmm8, %xmm9
vfmadd231ss 40+_src2(%rip), %xmm2, %xmm7
vfmadd132ss 12+_src2(%rip), %xmm5, %xmm4
vfmadd132ss %xmm3, %xmm4, %xmm2
vfmadd231ss 56+_src2(%rip), %xmm0, %xmm7
vfmadd231ss 60+_src2(%rip), %xmm0, %xmm2
vmovss %xmm9, 52+_dest(%rip)
vmovss %xmm7, 56+_dest(%rip)
vmovss %xmm2, 60+_dest(%rip)
ret
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug tree-optimization/57169] fully unrolled matrix multiplication not vectorized
2013-05-04 8:55 [Bug tree-optimization/57169] New: fully unrolled matrix multiplication not vectorized vincenzo.innocente at cern dot ch
@ 2013-05-06 11:33 ` rguenth at gcc dot gnu.org
2014-03-27 10:08 ` iliyapalachev at gmail dot com
2021-12-28 6:23 ` pinskia at gcc dot gnu.org
2 siblings, 0 replies; 4+ messages in thread
From: rguenth at gcc dot gnu.org @ 2013-05-06 11:33 UTC (permalink / raw)
To: gcc-bugs
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57169
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Keywords| |missed-optimization
Status|UNCONFIRMED |NEW
Last reconfirmed| |2013-05-06
Blocks| |53947
Ever Confirmed|0 |1
--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> 2013-05-06 11:33:29 UTC ---
This is because basic-block SLP does not support vectorizing reductions.
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug tree-optimization/57169] fully unrolled matrix multiplication not vectorized
2013-05-04 8:55 [Bug tree-optimization/57169] New: fully unrolled matrix multiplication not vectorized vincenzo.innocente at cern dot ch
2013-05-06 11:33 ` [Bug tree-optimization/57169] " rguenth at gcc dot gnu.org
@ 2014-03-27 10:08 ` iliyapalachev at gmail dot com
2021-12-28 6:23 ` pinskia at gcc dot gnu.org
2 siblings, 0 replies; 4+ messages in thread
From: iliyapalachev at gmail dot com @ 2014-03-27 10:08 UTC (permalink / raw)
To: gcc-bugs
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57169
Ilya Palachev <iliyapalachev at gmail dot com> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |iliyapalachev at gmail dot com
--- Comment #2 from Ilya Palachev <iliyapalachev at gmail dot com> ---
(In reply to Richard Biener from comment #1)
> This is because basic-block SLP does not support vectorizing reductions.
At page http://gcc.gnu.org/wiki/VectorizationTasks
it is written that the generalization of reduction support
(http://gcc.gnu.org/ml/gcc-patches/2006-04/msg00172.html) can help to fix this
PR25621
Does this bug have the same cause as the one discussed in PR25621?
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug tree-optimization/57169] fully unrolled matrix multiplication not vectorized
2013-05-04 8:55 [Bug tree-optimization/57169] New: fully unrolled matrix multiplication not vectorized vincenzo.innocente at cern dot ch
2013-05-06 11:33 ` [Bug tree-optimization/57169] " rguenth at gcc dot gnu.org
2014-03-27 10:08 ` iliyapalachev at gmail dot com
@ 2021-12-28 6:23 ` pinskia at gcc dot gnu.org
2 siblings, 0 replies; 4+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-12-28 6:23 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=57169
Andrew Pinski <pinskia at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Target Milestone|--- |10.0
Known to work| |10.1.0, 12.0
Status|NEW |RESOLVED
Resolution|--- |FIXED
--- Comment #3 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
GCC 6-9 vectorizes both but matmul uses scalar loads until GCC 10.
So all fixed in GCC 10+.
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2021-12-28 6:23 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-05-04 8:55 [Bug tree-optimization/57169] New: fully unrolled matrix multiplication not vectorized vincenzo.innocente at cern dot ch
2013-05-06 11:33 ` [Bug tree-optimization/57169] " rguenth at gcc dot gnu.org
2014-03-27 10:08 ` iliyapalachev at gmail dot com
2021-12-28 6:23 ` pinskia at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).