public inbox for gcc-bugs@sourceware.org
* [Bug target/51179] New: poor vectorization on interlagos.
@ 2011-11-16 19:25 Joost.VandeVondele at mat dot ethz.ch
  2011-11-22 11:03 ` [Bug target/51179] " ubizjak at gmail dot com
                   ` (11 more replies)
  0 siblings, 12 replies; 13+ messages in thread
From: Joost.VandeVondele at mat dot ethz.ch @ 2011-11-16 19:25 UTC (permalink / raw)
  To: gcc-bugs

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=51179

             Bug #: 51179
           Summary: poor vectorization on interlagos.
    Classification: Unclassified
           Product: gcc
           Version: 4.6.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
        AssignedTo: unassigned@gcc.gnu.org
        ReportedBy: Joost.VandeVondele@mat.ethz.ch


The following code executes significantly faster when compiled with the Cray
compiler than with gfortran (gcc: 43.4 s, Cray: 7.7 s for 100000000 calls):

SUBROUTINE smm_dnn_4_10_10_4_1_2_1(A,B,C)
   ! Small matrix-multiply kernel: C(4,10) = C + A(4,10)*B(10,10),
   ! generated with the j loop unrolled by 2.
   REAL(KIND=KIND(0.0D0)) :: C(4,10), B(10,10), A(4,10)
   INTEGER :: i, j, l
   DO j = 1, 10, 2
      DO l = 1, 10, 1
         DO i = 1, 4, 1
            C(i+0,j+0) = C(i+0,j+0) + A(i+0,l+0)*B(l+0,j+0)
            C(i+0,j+1) = C(i+0,j+1) + A(i+0,l+0)*B(l+0,j+1)
         ENDDO
      ENDDO
   ENDDO
END SUBROUTINE
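
A minimal timing driver along these lines (not part of the original report;
the program name and initialization are illustrative assumptions) reproduces
the "100000000 calls" measurement:

PROGRAM bench_smm
   ! Hypothetical driver: call the kernel 1e8 times on fixed operands so
   ! that the per-call cost of the subroutine dominates the runtime.
   IMPLICIT NONE
   REAL(KIND=KIND(0.0D0)) :: A(4,10), B(10,10), C(4,10)
   INTEGER(KIND=8) :: k
   REAL :: t0, t1
   CALL RANDOM_NUMBER(A)
   CALL RANDOM_NUMBER(B)
   C = 0.0D0
   CALL CPU_TIME(t0)
   DO k = 1, 100000000_8
      CALL smm_dnn_4_10_10_4_1_2_1(A, B, C)
   END DO
   CALL CPU_TIME(t1)
   ! Printing a checksum keeps the calls from being optimized away.
   PRINT *, 'time (s):', t1 - t0, ' checksum:', SUM(C)
END PROGRAM bench_smm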

Cray options: -h noomp -e m -F -ra -O2 -Oipa1 -v tst.f90
gfortran options: -O3 -march=native -ffast-math

On this machine, -march=native expands to:

-march=bdver1 -mcx16 -msahf -mno-movbe -maes -mpclmul -mpopcnt -mabm -mlwp
-mno-fma -mfma4 -mxop -mno-bmi -mno-tbm -mavx -msse4.2 -msse4.1

The Cray-generated code looks clean:
0000000000000000 <smm_dnn_4_10_10_4_1_2_1_>:
   0:   48 89 7c 24 f8          mov    %rdi,-0x8(%rsp)
   5:   48 89 74 24 f0          mov    %rsi,-0x10(%rsp)
   a:   48 89 54 24 e8          mov    %rdx,-0x18(%rsp)
   f:   c5 fc 10 02             vmovups (%rdx),%ymm0
  13:   c5 fc 10 4a 20          vmovups 0x20(%rdx),%ymm1
  18:   c5 fc 10 52 40          vmovups 0x40(%rdx),%ymm2
  1d:   c5 fc 10 5a 60          vmovups 0x60(%rdx),%ymm3
  22:   c5 fc 10 a2 80 00 00    vmovups 0x80(%rdx),%ymm4
  29:   00 
  2a:   c5 fc 10 aa a0 00 00    vmovups 0xa0(%rdx),%ymm5
  31:   00 
  32:   c5 fc 10 b2 c0 00 00    vmovups 0xc0(%rdx),%ymm6
  39:   00 
  3a:   c5 fc 10 ba e0 00 00    vmovups 0xe0(%rdx),%ymm7
  41:   00 
  42:   c5 7c 10 82 00 01 00    vmovups 0x100(%rdx),%ymm8
  49:   00 
  4a:   c5 7c 10 8a 20 01 00    vmovups 0x120(%rdx),%ymm9
  51:   00 
  52:   31 c0                   xor    %eax,%eax
  54:   48 89 c1                mov    %rax,%rcx
  57:   66 0f 1f 84 00 00 00    nopw   0x0(%rax,%rax,1)
  5e:   00 00 
  60:   c4 62 7d 19 94 c6 d0    vbroadcastsd 0x2d0(%rsi,%rax,8),%ymm10
  67:   02 00 00 
  6a:   c5 7c 10 1c 0f          vmovups (%rdi,%rcx,1),%ymm11
  6f:   c4 43 a5 69 c9 a0       vfmaddpd %ymm9,%ymm10,%ymm11,%ymm9
  75:   c4 62 7d 19 94 c6 80    vbroadcastsd 0x280(%rsi,%rax,8),%ymm10
  7c:   02 00 00 
  7f:   c4 43 a5 69 c0 a0       vfmaddpd %ymm8,%ymm10,%ymm11,%ymm8
  85:   c4 62 7d 19 94 c6 30    vbroadcastsd 0x230(%rsi,%rax,8),%ymm10
  8c:   02 00 00 
  8f:   c4 e3 a5 69 ff a0       vfmaddpd %ymm7,%ymm10,%ymm11,%ymm7
  95:   c4 62 7d 19 94 c6 e0    vbroadcastsd 0x1e0(%rsi,%rax,8),%ymm10
  9c:   01 00 00 
  9f:   c4 e3 a5 69 f6 a0       vfmaddpd %ymm6,%ymm10,%ymm11,%ymm6
  a5:   c4 62 7d 19 94 c6 90    vbroadcastsd 0x190(%rsi,%rax,8),%ymm10
  ac:   01 00 00 
  af:   c4 e3 a5 69 ed a0       vfmaddpd %ymm5,%ymm10,%ymm11,%ymm5
  b5:   c4 62 7d 19 94 c6 40    vbroadcastsd 0x140(%rsi,%rax,8),%ymm10
  bc:   01 00 00 
  bf:   c4 e3 a5 69 e4 a0       vfmaddpd %ymm4,%ymm10,%ymm11,%ymm4
  c5:   c4 62 7d 19 94 c6 f0    vbroadcastsd 0xf0(%rsi,%rax,8),%ymm10
  cc:   00 00 00 
  cf:   c4 e3 a5 69 db a0       vfmaddpd %ymm3,%ymm10,%ymm11,%ymm3
  d5:   c4 62 7d 19 94 c6 a0    vbroadcastsd 0xa0(%rsi,%rax,8),%ymm10
  dc:   00 00 00 
  df:   c4 e3 a5 69 d2 a0       vfmaddpd %ymm2,%ymm10,%ymm11,%ymm2
  e5:   c4 62 7d 19 54 c6 50    vbroadcastsd 0x50(%rsi,%rax,8),%ymm10
  ec:   c4 e3 a5 69 c9 a0       vfmaddpd %ymm1,%ymm10,%ymm11,%ymm1
  f2:   c4 62 7d 19 14 c6       vbroadcastsd (%rsi,%rax,8),%ymm10
  f8:   c4 e3 a5 69 c0 a0       vfmaddpd %ymm0,%ymm10,%ymm11,%ymm0
  fe:   48 83 c1 20             add    $0x20,%rcx
 102:   48 ff c0                inc    %rax
 105:   48 83 f8 0a             cmp    $0xa,%rax
 109:   0f 8c 51 ff ff ff       jl     60 <smm_dnn_4_10_10_4_1_2_1_+0x60>
 10f:   c5 78 11 8a 20 01 00    vmovups %xmm9,0x120(%rdx)
 116:   00 
 117:   c4 63 7d 19 8a 30 01    vextractf128 $0x1,%ymm9,0x130(%rdx)
 11e:   00 00 01 
 121:   c5 78 11 82 00 01 00    vmovups %xmm8,0x100(%rdx)
 128:   00 
 129:   c4 63 7d 19 82 10 01    vextractf128 $0x1,%ymm8,0x110(%rdx)
 130:   00 00 01 
 133:   c5 f8 11 ba e0 00 00    vmovups %xmm7,0xe0(%rdx)
 13a:   00 
 13b:   c4 e3 7d 19 ba f0 00    vextractf128 $0x1,%ymm7,0xf0(%rdx)
 142:   00 00 01 
 145:   c5 f8 11 b2 c0 00 00    vmovups %xmm6,0xc0(%rdx)
 14c:   00 
 14d:   c4 e3 7d 19 b2 d0 00    vextractf128 $0x1,%ymm6,0xd0(%rdx)
 154:   00 00 01 
 157:   c5 f8 11 aa a0 00 00    vmovups %xmm5,0xa0(%rdx)
 15e:   00 
 15f:   c4 e3 7d 19 aa b0 00    vextractf128 $0x1,%ymm5,0xb0(%rdx)
 166:   00 00 01 
 169:   c5 f8 11 a2 80 00 00    vmovups %xmm4,0x80(%rdx)
 170:   00 
 171:   c4 e3 7d 19 a2 90 00    vextractf128 $0x1,%ymm4,0x90(%rdx)
 178:   00 00 01 
 17b:   c5 f8 11 5a 60          vmovups %xmm3,0x60(%rdx)
 180:   c4 e3 7d 19 5a 70 01    vextractf128 $0x1,%ymm3,0x70(%rdx)
 187:   c5 f8 11 52 40          vmovups %xmm2,0x40(%rdx)
 18c:   c4 e3 7d 19 52 50 01    vextractf128 $0x1,%ymm2,0x50(%rdx)
 193:   c5 f8 11 4a 20          vmovups %xmm1,0x20(%rdx)
 198:   c4 e3 7d 19 4a 30 01    vextractf128 $0x1,%ymm1,0x30(%rdx)
 19f:   c5 f8 11 02             vmovups %xmm0,(%rdx)
 1a3:   c4 e3 7d 19 42 10 01    vextractf128 $0x1,%ymm0,0x10(%rdx)
 1aa:   c5 f8 77                vzeroupper 
 1ad:   c3                      retq   
 1ae:   66 90                   xchg   %ax,%ax
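
Read at source level, the Cray loop amounts to something like the following
(an interpretation of the disassembly above, not actual compiler output):

! Each column C(1:4,j) stays resident in one 256-bit register (ymm0-ymm9)
! for the whole l loop; B(l,j) is broadcast and a single FMA4 vfmaddpd
! updates the column. The columns are written back only after the loop.
DO l = 1, 10
   DO j = 1, 10
      C(1:4,j) = C(1:4,j) + A(1:4,l)*B(l,j)   ! vbroadcastsd + vfmaddpd
   END DO
END DO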

gcc's code is much more involved, with cross-lane permutes, stack spills, and
per-element horizontal adds inside the loop:

smm_dnn_4_10_10_4_1_2_1_:
.LFB0:
        pushq   %rbp
.LCFI0:
        movl    $1, %eax
        movq    %rsp, %rbp
.LCFI1:
        andq    $-32, %rsp
        subq    $616, %rsp
.LCFI2:
        vmovupd 96(%rdi), %ymm0
        vmovupd (%rdi), %ymm3
        vmovupd 32(%rdi), %ymm1
        vmovsd  280(%rdi), %xmm13
        vmovupd 64(%rdi), %ymm2
        vmovsd  288(%rdi), %xmm15
        vmovsd  256(%rdi), %xmm4
        vmovsd  264(%rdi), %xmm6
        vmovsd  272(%rdi), %xmm7
        vmovupd 128(%rdi), %ymm12
        vmovsd  %xmm13, 296(%rsp)
        vmovupd 160(%rdi), %ymm11
        vperm2f128      $32, %ymm1, %ymm3, %ymm13
        vmovsd  %xmm15, 288(%rsp)
        vperm2f128      $49, %ymm1, %ymm3, %ymm1
        vmovsd  %xmm4, 320(%rsp)
        vperm2f128      $32, %ymm0, %ymm2, %ymm15
        vmovsd  296(%rdi), %xmm4
        vperm2f128      $49, %ymm0, %ymm2, %ymm2
        vmovsd  %xmm6, 312(%rsp)
        vmovaps %ymm1, 40(%rsp)
        vunpcklpd       %ymm1, %ymm13, %ymm1
        vmovsd  304(%rdi), %xmm6
        vunpcklpd       %ymm2, %ymm15, %ymm0
        vmovsd  %xmm7, 304(%rsp)
        vmovsd  312(%rdi), %xmm7
        vmovaps %ymm2, -24(%rsp)
        vperm2f128      $32, %ymm0, %ymm1, %ymm2
        vmovupd 192(%rdi), %ymm10
        vperm2f128      $49, %ymm0, %ymm1, %ymm0
        vmovsd  %xmm4, 280(%rsp)
        vmovsd  %xmm6, 336(%rsp)
        vmovaps %ymm13, %ymm4
        vmovsd  %xmm7, 328(%rsp)
        vmovaps %ymm15, %ymm6
        vmovaps %ymm2, %ymm7
        vunpcklpd       %ymm0, %ymm2, %ymm8
        vmovupd 224(%rdi), %ymm9
        vmovaps %ymm13, 72(%rsp)
        vmovaps %ymm15, 8(%rsp)
        vmovaps %ymm2, -56(%rsp)
        vmovaps %ymm0, -88(%rsp)
        vxorps  %xmm0, %xmm0, %xmm0
.L3:
        vunpckhpd       40(%rsp), %ymm4, %ymm3
        vmovupd (%rsi), %ymm4
        vunpckhpd       -24(%rsp), %ymm6, %ymm1
        vunpckhpd       -88(%rsp), %ymm7, %ymm5
        vperm2f128      $32, %ymm1, %ymm3, %ymm2
        vperm2f128      $49, %ymm1, %ymm3, %ymm1
        vfmaddpd        %ymm0, %ymm5, %ymm4, %ymm15
        vfmaddpd        %ymm0, %ymm8, %ymm4, %ymm3
        vunpcklpd       %ymm1, %ymm2, %ymm6
        vunpckhpd       %ymm1, %ymm2, %ymm2
        vmovupd 80(%rsi), %ymm1
        vfmaddpd        %ymm0, %ymm6, %ymm4, %ymm13
        vfmaddpd        %ymm0, %ymm2, %ymm4, %ymm4
        vmovaps %ymm15, 200(%rsp)
        vmovsd  320(%rsp), %xmm15
        vfmaddpd        %ymm0, %ymm8, %ymm1, %ymm14
        vfmaddpd        %ymm0, %ymm6, %ymm1, %ymm6
        vfmaddpd        %ymm0, %ymm5, %ymm1, %ymm5
        vfmaddpd        %ymm0, %ymm2, %ymm1, %ymm1
        vperm2f128      $32, %ymm11, %ymm12, %ymm2
        vmovaps %ymm13, -120(%rsp)
        vmovsd  64(%rsi), %xmm13
        vmovaps %ymm4, 136(%rsp)
        vmovaps %ymm6, 232(%rsp)
        vfmaddsd        (%rdx), %xmm15, %xmm13, %xmm15
        vmovaps %ymm1, 104(%rsp)
        vperm2f128      $49, %ymm11, %ymm12, %ymm1
        vmovaps %ymm5, 168(%rsp)
        vperm2f128      $32, %ymm9, %ymm10, %ymm5
        vunpcklpd       %ymm1, %ymm2, %ymm6
        vmovsd  %xmm13, 344(%rsp)
        vunpckhpd       %ymm1, %ymm2, %ymm2
        vperm2f128      $49, %ymm9, %ymm10, %ymm1
        vunpcklpd       %ymm1, %ymm5, %ymm4
        vmovsd  %xmm15, 352(%rsp)
        vunpckhpd       %ymm1, %ymm5, %ymm1
        vperm2f128      $32, %ymm4, %ymm6, %ymm5
        vperm2f128      $49, %ymm4, %ymm6, %ymm4
        vunpcklpd       %ymm4, %ymm5, %ymm7
        vunpckhpd       %ymm4, %ymm5, %ymm5
        vperm2f128      $32, %ymm1, %ymm2, %ymm4
        vperm2f128      $49, %ymm1, %ymm2, %ymm1
        vmovupd 32(%rsi), %ymm2
        vunpcklpd       %ymm1, %ymm4, %ymm6
        vunpckhpd       %ymm1, %ymm4, %ymm4
        vmovupd 112(%rsi), %ymm1
        vfmaddpd        %ymm3, %ymm7, %ymm2, %ymm3
        vfmaddpd        %ymm14, %ymm7, %ymm1, %ymm7
        vhaddpd %ymm3, %ymm3, %ymm3
        vhaddpd %ymm7, %ymm7, %ymm7
        vperm2f128      $1, %ymm3, %ymm3, %ymm15
        vaddpd  %ymm15, %ymm3, %ymm3
        vmovaps %ymm3, 584(%rsp)
        vmovsd  352(%rsp), %xmm3
        vaddsd  584(%rsp), %xmm3, %xmm3
        vmovsd  144(%rsi), %xmm15
        vmovsd  %xmm3, 264(%rsp)
        vmovsd  320(%rsp), %xmm3
        vfmaddsd        32(%rdx), %xmm3, %xmm15, %xmm13
        vperm2f128      $1, %ymm7, %ymm7, %ymm3
        vaddpd  %ymm3, %ymm7, %ymm3
        vmovaps %ymm3, 552(%rsp)
        vmovsd  312(%rsp), %xmm3
        vaddsd  552(%rsp), %xmm13, %xmm13
        vmovsd  %xmm13, 272(%rsp)
        vmovsd  344(%rsp), %xmm13
        vfmaddsd        8(%rdx), %xmm3, %xmm13, %xmm7
        vfmaddpd        -120(%rsp), %ymm6, %ymm2, %ymm13
        vhaddpd %ymm13, %ymm13, %ymm13
        vperm2f128      $1, %ymm13, %ymm13, %ymm3
        vaddpd  %ymm3, %ymm13, %ymm3
        vmovaps %ymm3, 520(%rsp)
        vaddsd  520(%rsp), %xmm7, %xmm7
        vmovsd  %xmm7, 352(%rsp)
        vfmaddpd        232(%rsp), %ymm6, %ymm1, %ymm6
        vmovsd  312(%rsp), %xmm13
        vfmaddsd        40(%rdx), %xmm13, %xmm15, %xmm7
        vhaddpd %ymm6, %ymm6, %ymm6
        vperm2f128      $1, %ymm6, %ymm6, %ymm3
        vaddpd  %ymm3, %ymm6, %ymm3
        vmovsd  304(%rsp), %xmm6
        vmovaps %ymm3, 488(%rsp)
        vmovsd  344(%rsp), %xmm3
        vaddsd  488(%rsp), %xmm7, %xmm13
        vfmaddsd        16(%rdx), %xmm6, %xmm3, %xmm7
        vfmaddpd        200(%rsp), %ymm5, %ymm2, %ymm3
        vfmaddpd        168(%rsp), %ymm5, %ymm1, %ymm5
        vfmaddpd        136(%rsp), %ymm4, %ymm2, %ymm2
        vfmaddpd        104(%rsp), %ymm4, %ymm1, %ymm1
        vmovsd  288(%rsp), %xmm4
        vhaddpd %ymm3, %ymm3, %ymm3
        vhaddpd %ymm5, %ymm5, %ymm5
        vhaddpd %ymm2, %ymm2, %ymm2
        vhaddpd %ymm1, %ymm1, %ymm1
        vperm2f128      $1, %ymm3, %ymm3, %ymm6
        vaddpd  %ymm6, %ymm3, %ymm3
        vmovaps %ymm3, 456(%rsp)
        vperm2f128      $1, %ymm5, %ymm5, %ymm3
        vaddpd  %ymm3, %ymm5, %ymm3
        vaddsd  456(%rsp), %xmm7, %xmm14
        vmovsd  304(%rsp), %xmm7
        vfmaddsd        48(%rdx), %xmm7, %xmm15, %xmm6
        vmovsd  296(%rsp), %xmm7
        vmovaps %ymm3, 424(%rsp)
        vmovsd  344(%rsp), %xmm3
        vfmaddsd        24(%rdx), %xmm7, %xmm3, %xmm5
        vperm2f128      $1, %ymm2, %ymm2, %ymm3
        vaddpd  %ymm3, %ymm2, %ymm2
        vaddsd  424(%rsp), %xmm6, %xmm6
        vmovaps %ymm2, 392(%rsp)
        vperm2f128      $1, %ymm1, %ymm1, %ymm2
        vaddpd  %ymm2, %ymm1, %ymm1
        vfmaddsd        56(%rdx), %xmm7, %xmm15, %xmm15
        vaddsd  392(%rsp), %xmm5, %xmm5
        vmovaps %ymm1, 360(%rsp)
        vmovsd  72(%rsi), %xmm2
        vmovsd  152(%rsi), %xmm1
        addq    $160, %rsi
        vaddsd  360(%rsp), %xmm15, %xmm15
        vfmaddsd        264(%rsp), %xmm4, %xmm2, %xmm3
        vmovsd  %xmm3, (%rdx)
        vfmaddsd        272(%rsp), %xmm4, %xmm1, %xmm3
        vmovsd  %xmm3, 32(%rdx)
        vmovsd  280(%rsp), %xmm3
        vfmaddsd        352(%rsp), %xmm3, %xmm2, %xmm7
        vmovsd  %xmm7, 8(%rdx)
        vfmaddsd        %xmm13, %xmm3, %xmm1, %xmm7
        vfmaddsd        %xmm6, 336(%rsp), %xmm1, %xmm6
        vfmaddsd        %xmm5, 328(%rsp), %xmm2, %xmm5
        vfmaddsd        %xmm15, 328(%rsp), %xmm1, %xmm1
        vmovsd  %xmm7, 40(%rdx)
        vfmaddsd        %xmm14, 336(%rsp), %xmm2, %xmm7
        vmovsd  %xmm6, 48(%rdx)
        vmovsd  %xmm5, 24(%rdx)
        vmovsd  %xmm1, 56(%rdx)
        vmovsd  %xmm7, 16(%rdx)
        addq    $64, %rdx
        cmpl    $9, %eax
        je      .L1
        addl    $2, %eax
        vmovaps 72(%rsp), %ymm4
        vmovaps 8(%rsp), %ymm6
        vmovaps -56(%rsp), %ymm7
        jmp     .L3
        .p2align 5,,7
        .p2align 3
.L1:
        leave
.LCFI3:
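
By contrast, the GCC schedule above appears to vectorize along the reduction
dimension l: pieces of A are transposed with vperm2f128/vunpckhpd, multiplied
by columns of B, and each C element is finished with a horizontal reduction.
Roughly (again an interpretation of the assembly, not compiler output):

! Each C(i,j) is computed as a dot product over l, so every element ends in
! a vhaddpd plus a cross-lane add before a scalar store, and the transposed
! pieces of A are spilled to and reloaded from the stack.
DO j = 1, 10, 2
   DO i = 1, 4
      C(i,j)   = C(i,j)   + SUM(A(i,1:10)*B(1:10,j))
      C(i,j+1) = C(i,j+1) + SUM(A(i,1:10)*B(1:10,j+1))
   END DO
END DO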



Thread overview (13 messages):
2011-11-16 19:25 [Bug target/51179] New: poor vectorization on interlagos Joost.VandeVondele at mat dot ethz.ch
2011-11-22 11:03 ` [Bug target/51179] " ubizjak at gmail dot com
2011-11-22 12:31 ` [Bug tree-optimization/51179] " ubizjak at gmail dot com
2011-11-22 17:57 ` jakub at gcc dot gnu.org
2011-11-22 18:53 ` Joost.VandeVondele at mat dot ethz.ch
2011-11-22 18:55 ` Joost.VandeVondele at mat dot ethz.ch
2011-11-22 21:38 ` dominiq at lps dot ens.fr
2011-11-22 23:13 ` ubizjak at gmail dot com
2011-11-23  9:28 ` Joost.VandeVondele at mat dot ethz.ch
2011-11-23 17:50 ` Joost.VandeVondele at mat dot ethz.ch
2011-11-23 20:30 ` Joost.VandeVondele at mat dot ethz.ch
2012-06-30 11:27 ` Joost.VandeVondele at mat dot ethz.ch
2012-07-19 10:36 ` rguenth at gcc dot gnu.org
