public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug tree-optimization/57169] New: fully unrolled matrix multiplication not vectorized
@ 2013-05-04 8:55 vincenzo.innocente at cern dot ch
2013-05-06 11:33 ` [Bug tree-optimization/57169] " rguenth at gcc dot gnu.org
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: vincenzo.innocente at cern dot ch @ 2013-05-04 8:55 UTC (permalink / raw)
To: gcc-bugs
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57169
Bug #: 57169
Summary: fully unrolled matrix multiplication not vectorized
Classification: Unclassified
Product: gcc
Version: 4.9.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
AssignedTo: unassigned@gcc.gnu.org
ReportedBy: vincenzo.innocente@cern.ch
a lot of legacy code still fully unrolls linear algebra for small dimensions.
As shown below, gcc fails to vectorize an unrolled 4x4 matrix multiplication,
while it vectorizes the corresponding loop expression well.
sample code
alignas(32) float a[4][4];
alignas(32) float b[4][4];
alignas(32) float c[4][4];
void matmul() {
for (int i=0;i!=4;++i)
for (int j=0;j!=4;++j) {
float sum=0;
for (int k=0;k!=4;++k)
sum += a[i][k]*b[k][j];
c[i][j]=sum;
}
}
alignas(32) float src1[4][4];
alignas(32) float src2[4][4];
alignas(32) float dest[4][4];
void matmulU(){
dest[0][0] = src1[0][0] * src2[0][0] + src1[0][1] * src2[1][0] + src1[0][2] *
src2[2][0] + src1[0][3] * src2[3][0];
dest[0][1] = src1[0][0] * src2[0][1] + src1[0][1] * src2[1][1] + src1[0][2] *
src2[2][1] + src1[0][3] * src2[3][1];
dest[0][2] = src1[0][0] * src2[0][2] + src1[0][1] * src2[1][2] + src1[0][2] *
src2[2][2] + src1[0][3] * src2[3][2];
dest[0][3] = src1[0][0] * src2[0][3] + src1[0][1] * src2[1][3] + src1[0][2] *
src2[2][3] + src1[0][3] * src2[3][3];
dest[1][0] = src1[1][0] * src2[0][0] + src1[1][1] * src2[1][0] + src1[1][2] *
src2[2][0] + src1[1][3] * src2[3][0];
dest[1][1] = src1[1][0] * src2[0][1] + src1[1][1] * src2[1][1] + src1[1][2] *
src2[2][1] + src1[1][3] * src2[3][1];
dest[1][2] = src1[1][0] * src2[0][2] + src1[1][1] * src2[1][2] + src1[1][2] *
src2[2][2] + src1[1][3] * src2[3][2];
dest[1][3] = src1[1][0] * src2[0][3] + src1[1][1] * src2[1][3] + src1[1][2] *
src2[2][3] + src1[1][3] * src2[3][3];
dest[2][0] = src1[2][0] * src2[0][0] + src1[2][1] * src2[1][0] + src1[2][2] *
src2[2][0] + src1[2][3] * src2[3][0];
dest[2][1] = src1[2][0] * src2[0][1] + src1[2][1] * src2[1][1] + src1[2][2] *
src2[2][1] + src1[2][3] * src2[3][1];
dest[2][2] = src1[2][0] * src2[0][2] + src1[2][1] * src2[1][2] + src1[2][2] *
src2[2][2] + src1[2][3] * src2[3][2];
dest[2][3] = src1[2][0] * src2[0][3] + src1[2][1] * src2[1][3] + src1[2][2] *
src2[2][3] + src1[2][3] * src2[3][3];
dest[3][0] = src1[3][0] * src2[0][0] + src1[3][1] * src2[1][0] + src1[3][2] *
src2[2][0] + src1[3][3] * src2[3][0];
dest[3][1] = src1[3][0] * src2[0][1] + src1[3][1] * src2[1][1] + src1[3][2] *
src2[2][1] + src1[3][3] * src2[3][1];
dest[3][2] = src1[3][0] * src2[0][2] + src1[3][1] * src2[1][2] + src1[3][2] *
src2[2][2] + src1[3][3] * src2[3][2];
dest[3][3] = src1[3][0] * src2[0][3] + src1[3][1] * src2[1][3] + src1[3][2] *
src2[2][3] + src1[3][3] * src2[3][3];
};
generated asm
c++ -v
Using built-in specs.
COLLECT_GCC=c++
COLLECT_LTO_WRAPPER=/usr/local/libexec/gcc/x86_64-apple-darwin12.3.0/4.9.0/lto-wrapper
Target: x86_64-apple-darwin12.3.0
Configured with: ./configure --disable-multilib --disable-bootstrap
--enable-lto -disable-libitm --enable-languages=c,c++,fortran,lto --no-create
--no-recursion
Thread model: posix
gcc version 4.9.0 20130428 (experimental) [trunk revision 198366] (GCC)
Vincenzos-MacBook-Pro:vectorize innocent$ c++ -O3 -march=corei7-avx -std=c++11
-S matmul.cc -mavx2 -mfma
Vincenzos-MacBook-Pro:vectorize innocent$ cat matmul.s
.text
.align 4,0x90
.globl __Z6matmulv
__Z6matmulv:
LFB0:
vmovss 8+_b(%rip), %xmm7
vmovss 24+_b(%rip), %xmm1
vinsertps $0x10, 12+_b(%rip), %xmm7, %xmm0
vmovss _b(%rip), %xmm7
vmovss 16+_b(%rip), %xmm2
vinsertps $0x10, 4+_b(%rip), %xmm7, %xmm8
vmovss 40+_b(%rip), %xmm3
vmovlhps %xmm0, %xmm8, %xmm8
vmovss 32+_b(%rip), %xmm4
vinsertf128 $1, %xmm8, %ymm8, %ymm8
vinsertps $0x10, 28+_b(%rip), %xmm1, %xmm0
vmovss 56+_b(%rip), %xmm7
vinsertps $0x10, 20+_b(%rip), %xmm2, %xmm6
vmovlhps %xmm0, %xmm6, %xmm6
vmovss 48+_b(%rip), %xmm1
vinsertf128 $1, %xmm6, %ymm6, %ymm6
vinsertps $0x10, 44+_b(%rip), %xmm3, %xmm0
vinsertps $0x10, 36+_b(%rip), %xmm4, %xmm5
vmovlhps %xmm0, %xmm5, %xmm5
vinsertps $0x10, 60+_b(%rip), %xmm7, %xmm0
vinsertps $0x10, 52+_b(%rip), %xmm1, %xmm4
vmovlhps %xmm0, %xmm4, %xmm4
vxorps %xmm7, %xmm7, %xmm7
vmovaps _a(%rip), %ymm0
vinsertf128 $1, %xmm5, %ymm5, %ymm5
vinsertf128 $1, %xmm4, %ymm4, %ymm4
vpermilps $255, %ymm0, %ymm1
vpermilps $170, %ymm0, %ymm2
vpermilps $85, %ymm0, %ymm3
vpermilps $0, %ymm0, %ymm0
vfmadd132ps %ymm8, %ymm7, %ymm0
vfmadd132ps %ymm6, %ymm0, %ymm3
vmovaps 32+_a(%rip), %ymm0
vfmadd132ps %ymm5, %ymm3, %ymm2
vfmadd132ps %ymm4, %ymm2, %ymm1
vmovaps %ymm1, _c(%rip)
vpermilps $170, %ymm0, %ymm2
vpermilps $255, %ymm0, %ymm1
vpermilps $85, %ymm0, %ymm3
vpermilps $0, %ymm0, %ymm0
vfmadd132ps %ymm8, %ymm7, %ymm0
vfmadd132ps %ymm6, %ymm0, %ymm3
vfmadd132ps %ymm5, %ymm3, %ymm2
vfmadd132ps %ymm4, %ymm2, %ymm1
vmovaps %ymm1, 32+_c(%rip)
vzeroupper
ret
LFE0:
.align 4,0x90
.globl __Z7matmulUv
__Z7matmulUv:
LFB1:
vmovss 4+_src1(%rip), %xmm5
vmovss 16+_src2(%rip), %xmm15
vmovss _src1(%rip), %xmm4
vmulss %xmm15, %xmm5, %xmm1
vmovss 8+_src1(%rip), %xmm2
vmovss 12+_src1(%rip), %xmm0
vmovss _src2(%rip), %xmm14
vmovss 32+_src2(%rip), %xmm13
vmovss 48+_src2(%rip), %xmm12
vfmadd231ss %xmm14, %xmm4, %xmm1
vmovss 20+_src2(%rip), %xmm11
vfmadd231ss %xmm13, %xmm2, %xmm1
vfmadd231ss %xmm12, %xmm0, %xmm1
vmovss %xmm1, _dest(%rip)
vmovss 4+_src2(%rip), %xmm10
vmulss %xmm11, %xmm5, %xmm1
vmovss 36+_src2(%rip), %xmm9
vmovss 52+_src2(%rip), %xmm8
vmovss 24+_src2(%rip), %xmm7
vmovss 28+_src2(%rip), %xmm6
vfmadd231ss %xmm10, %xmm4, %xmm1
vfmadd231ss %xmm9, %xmm2, %xmm1
vfmadd231ss %xmm8, %xmm0, %xmm1
vmovss %xmm1, 4+_dest(%rip)
vmulss %xmm7, %xmm5, %xmm1
vmovss 44+_src2(%rip), %xmm3
vmulss %xmm6, %xmm5, %xmm5
vfmadd231ss 8+_src2(%rip), %xmm4, %xmm1
vfmadd231ss 40+_src2(%rip), %xmm2, %xmm1
vfmadd231ss 56+_src2(%rip), %xmm0, %xmm1
vfmadd231ss 12+_src2(%rip), %xmm4, %xmm5
vfmadd231ss %xmm3, %xmm2, %xmm5
vfmadd231ss 60+_src2(%rip), %xmm0, %xmm5
vmovss %xmm5, 12+_dest(%rip)
vmovss 20+_src1(%rip), %xmm5
vmovss %xmm1, 8+_dest(%rip)
vmovss 16+_src1(%rip), %xmm4
vmulss %xmm5, %xmm15, %xmm1
vmovss 24+_src1(%rip), %xmm2
vmovss 28+_src1(%rip), %xmm0
vfmadd231ss %xmm4, %xmm14, %xmm1
vfmadd231ss %xmm2, %xmm13, %xmm1
vfmadd231ss %xmm0, %xmm12, %xmm1
vmovss %xmm1, 16+_dest(%rip)
vmulss %xmm5, %xmm11, %xmm1
vfmadd231ss %xmm4, %xmm10, %xmm1
vfmadd231ss %xmm2, %xmm9, %xmm1
vfmadd231ss %xmm0, %xmm8, %xmm1
vmovss %xmm1, 20+_dest(%rip)
vmulss %xmm5, %xmm7, %xmm1
vmulss %xmm5, %xmm6, %xmm5
vfmadd231ss 8+_src2(%rip), %xmm4, %xmm1
vfmadd231ss 40+_src2(%rip), %xmm2, %xmm1
vfmadd231ss 56+_src2(%rip), %xmm0, %xmm1
vmovss %xmm1, 24+_dest(%rip)
vfmadd231ss 12+_src2(%rip), %xmm4, %xmm5
vfmadd231ss %xmm2, %xmm3, %xmm5
vfmadd231ss 60+_src2(%rip), %xmm0, %xmm5
vmovss %xmm5, 28+_dest(%rip)
vmovss 36+_src1(%rip), %xmm5
vmovss 32+_src1(%rip), %xmm4
vmulss %xmm5, %xmm15, %xmm1
vmovss 40+_src1(%rip), %xmm2
vmovss 44+_src1(%rip), %xmm0
vfmadd231ss %xmm4, %xmm14, %xmm1
vfmadd231ss %xmm2, %xmm13, %xmm1
vfmadd231ss %xmm0, %xmm12, %xmm1
vmovss %xmm1, 32+_dest(%rip)
vmulss %xmm5, %xmm11, %xmm1
vfmadd231ss %xmm4, %xmm10, %xmm1
vfmadd231ss %xmm2, %xmm9, %xmm1
vfmadd231ss %xmm0, %xmm8, %xmm1
vmovss %xmm1, 36+_dest(%rip)
vmulss %xmm5, %xmm7, %xmm1
vmulss %xmm5, %xmm6, %xmm5
vfmadd231ss 8+_src2(%rip), %xmm4, %xmm1
vfmadd231ss 40+_src2(%rip), %xmm2, %xmm1
vfmadd231ss 56+_src2(%rip), %xmm0, %xmm1
vfmadd231ss 12+_src2(%rip), %xmm4, %xmm5
vfmadd231ss %xmm2, %xmm3, %xmm5
vfmadd231ss 60+_src2(%rip), %xmm0, %xmm5
vmovss %xmm5, 44+_dest(%rip)
vmovss 52+_src1(%rip), %xmm5
vmovss 48+_src1(%rip), %xmm4
vmovss %xmm1, 40+_dest(%rip)
vmulss %xmm5, %xmm15, %xmm15
vmovss 56+_src1(%rip), %xmm2
vmulss %xmm5, %xmm11, %xmm11
vmovss 60+_src1(%rip), %xmm0
vmulss %xmm5, %xmm7, %xmm7
vmulss %xmm5, %xmm6, %xmm5
vfmadd231ss %xmm4, %xmm14, %xmm15
vfmadd231ss %xmm2, %xmm13, %xmm15
vfmadd231ss %xmm0, %xmm12, %xmm15
vfmadd132ss %xmm4, %xmm11, %xmm10
vmovss %xmm15, 48+_dest(%rip)
vfmadd132ss %xmm2, %xmm10, %xmm9
vfmadd231ss 8+_src2(%rip), %xmm4, %xmm7
vfmadd231ss %xmm0, %xmm8, %xmm9
vfmadd231ss 40+_src2(%rip), %xmm2, %xmm7
vfmadd132ss 12+_src2(%rip), %xmm5, %xmm4
vfmadd132ss %xmm3, %xmm4, %xmm2
vfmadd231ss 56+_src2(%rip), %xmm0, %xmm7
vfmadd231ss 60+_src2(%rip), %xmm0, %xmm2
vmovss %xmm9, 52+_dest(%rip)
vmovss %xmm7, 56+_dest(%rip)
vmovss %xmm2, 60+_dest(%rip)
ret
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug tree-optimization/57169] fully unrolled matrix multiplication not vectorized
2013-05-04 8:55 [Bug tree-optimization/57169] New: fully unrolled matrix multiplication not vectorized vincenzo.innocente at cern dot ch
@ 2013-05-06 11:33 ` rguenth at gcc dot gnu.org
2014-03-27 10:08 ` iliyapalachev at gmail dot com
2021-12-28 6:23 ` pinskia at gcc dot gnu.org
2 siblings, 0 replies; 4+ messages in thread
From: rguenth at gcc dot gnu.org @ 2013-05-06 11:33 UTC (permalink / raw)
To: gcc-bugs
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57169
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Keywords| |missed-optimization
Status|UNCONFIRMED |NEW
Last reconfirmed| |2013-05-06
Blocks| |53947
Ever Confirmed|0 |1
--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> 2013-05-06 11:33:29 UTC ---
This is because basic-block SLP does not support vectorizing reductions.
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug tree-optimization/57169] fully unrolled matrix multiplication not vectorized
2013-05-04 8:55 [Bug tree-optimization/57169] New: fully unrolled matrix multiplication not vectorized vincenzo.innocente at cern dot ch
2013-05-06 11:33 ` [Bug tree-optimization/57169] " rguenth at gcc dot gnu.org
@ 2014-03-27 10:08 ` iliyapalachev at gmail dot com
2021-12-28 6:23 ` pinskia at gcc dot gnu.org
2 siblings, 0 replies; 4+ messages in thread
From: iliyapalachev at gmail dot com @ 2014-03-27 10:08 UTC (permalink / raw)
To: gcc-bugs
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57169
Ilya Palachev <iliyapalachev at gmail dot com> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |iliyapalachev at gmail dot com
--- Comment #2 from Ilya Palachev <iliyapalachev at gmail dot com> ---
(In reply to Richard Biener from comment #1)
> This is because basic-block SLP does not support vectorizing reductions.
At page http://gcc.gnu.org/wiki/VectorizationTasks
it is written that the generalization of reduction support
(http://gcc.gnu.org/ml/gcc-patches/2006-04/msg00172.html) can help to fix this
PR25621
Does this bug have the same cause as the one discussed in PR25621?
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug tree-optimization/57169] fully unrolled matrix multiplication not vectorized
2013-05-04 8:55 [Bug tree-optimization/57169] New: fully unrolled matrix multiplication not vectorized vincenzo.innocente at cern dot ch
2013-05-06 11:33 ` [Bug tree-optimization/57169] " rguenth at gcc dot gnu.org
2014-03-27 10:08 ` iliyapalachev at gmail dot com
@ 2021-12-28 6:23 ` pinskia at gcc dot gnu.org
2 siblings, 0 replies; 4+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-12-28 6:23 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=57169
Andrew Pinski <pinskia at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Target Milestone|--- |10.0
Known to work| |10.1.0, 12.0
Status|NEW |RESOLVED
Resolution|--- |FIXED
--- Comment #3 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
GCC 6-9 vectorizes both but matmul uses scalar loads until GCC 10.
So all fixed in GCC 10+.
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2021-12-28 6:23 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-05-04 8:55 [Bug tree-optimization/57169] New: fully unrolled matrix multiplication not vectorized vincenzo.innocente at cern dot ch
2013-05-06 11:33 ` [Bug tree-optimization/57169] " rguenth at gcc dot gnu.org
2014-03-27 10:08 ` iliyapalachev at gmail dot com
2021-12-28 6:23 ` pinskia at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).