public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug middle-end/99408] New: s3251 benchmark of TSVC vectorized by clang runs about 7 times faster compared to gcc
@ 2021-03-05 14:11 hubicka at gcc dot gnu.org
  2021-03-08  8:17 ` [Bug middle-end/99408] " rguenth at gcc dot gnu.org
                   ` (4 more replies)
  0 siblings, 5 replies; 6+ messages in thread
From: hubicka at gcc dot gnu.org @ 2021-03-05 14:11 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99408

            Bug ID: 99408
           Summary: s3251 benchmark of TSVC vectorized by clang runs about
                    7 times faster compared to gcc
           Product: gcc
           Version: 11.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hubicka at gcc dot gnu.org
  Target Milestone: ---

typedef float real_t;
#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];
void
main(void)
{
    for (int nl = 0; nl < iterations; nl++) {
        for (int i = 0; i < LEN_1D-1; i++){
            a[i+1] = b[i]+c[i];
            b[i]   = c[i]*e[i];
            d[i]   = a[i]*e[i];
        }
    }
}

Built with -march=znver2 -Ofast I get:
main:
.LFB0:
        .cfi_startproc
        vmovaps c+127968(%rip), %xmm5
        vmovaps e+127968(%rip), %xmm4
        movl    $100000, %edx
        vmovq   c+127984(%rip), %xmm9
        vmovq   e+127984(%rip), %xmm10
        vmovss  c+127992(%rip), %xmm7
        vmovss  e+127992(%rip), %xmm3
        vmovss  c+127984(%rip), %xmm13
        vmulps  %xmm4, %xmm5, %xmm6
        vmulps  %xmm9, %xmm10, %xmm12
        vmulss  %xmm3, %xmm7, %xmm11
        .p2align 4
        .p2align 3
.L2:
        xorl    %eax, %eax
        .p2align 4
        .p2align 3
.L4:
        vmovaps c(%rax), %ymm2
        addq    $32, %rax
        vaddps  b-32(%rax), %ymm2, %ymm0
        vmovups %ymm0, a-28(%rax)
        vmulps  e-32(%rax), %ymm2, %ymm0
        vmovaps e-32(%rax), %ymm2
        vmovaps %ymm0, b-32(%rax)
        vmulps  a-32(%rax), %ymm2, %ymm0
        vmovaps %ymm0, d-32(%rax)
        cmpq    $127968, %rax
        jne     .L4
        vaddps  b+127968(%rip), %xmm5, %xmm1
        vaddss  b+127984(%rip), %xmm13, %xmm2
        decl    %edx
        vmovaps %xmm6, b+127968(%rip)
        vmovq   b+127984(%rip), %xmm0
        vmovlps %xmm12, b+127984(%rip)
        vaddps  %xmm0, %xmm9, %xmm0
        vmovups %xmm1, a+127972(%rip)
        vshufps $255, %xmm1, %xmm1, %xmm1
        vmulps  a+127968(%rip), %xmm4, %xmm8
        vunpcklps       %xmm2, %xmm1, %xmm1
        vaddss  b+127992(%rip), %xmm7, %xmm2
        vmovss  %xmm11, b+127992(%rip)
        vmulps  %xmm10, %xmm1, %xmm1
        vmovlps %xmm0, a+127988(%rip)
        vmovshdup       %xmm0, %xmm0
        vmulss  %xmm3, %xmm0, %xmm0
        vmovss  %xmm2, a+127996(%rip)
        jne     .L2
        vmovaps %xmm8, d+127968(%rip)
        vmovlps %xmm1, d+127984(%rip)
        vmovss  %xmm0, d+127992(%rip)
        vzeroupper
        ret


Clang does:

main:                                   # @main
        .cfi_startproc
# %bb.0:
        vbroadcastss    a(%rip), %ymm0
        vmovss  e+127968(%rip), %xmm1           # xmm1 = mem[0],zero,zero,zero
        vmovss  e+127980(%rip), %xmm2           # xmm2 = mem[0],zero,zero,zero
        vmovss  c+127984(%rip), %xmm4           # xmm4 = mem[0],zero,zero,zero
        vmovss  e+127984(%rip), %xmm5           # xmm5 = mem[0],zero,zero,zero
        vmovss  c+127988(%rip), %xmm8           # xmm8 = mem[0],zero,zero,zero
        vmovss  e+127988(%rip), %xmm9           # xmm9 = mem[0],zero,zero,zero
        vmovss  c+127992(%rip), %xmm11          # xmm11 = mem[0],zero,zero,zero
        vmovss  e+127992(%rip), %xmm12          # xmm12 = mem[0],zero,zero,zero
        xorl    %eax, %eax
        vmovups %ymm0, -56(%rsp)                # 32-byte Spill
        vmovss  c+127968(%rip), %xmm0           # xmm0 = mem[0],zero,zero,zero
        vmovss  %xmm1, -64(%rsp)                # 4-byte Spill
        vmulss  %xmm4, %xmm5, %xmm3
        vmulss  %xmm8, %xmm9, %xmm10
        vmulss  %xmm11, %xmm12, %xmm13
        vmovss  %xmm0, -60(%rsp)                # 4-byte Spill
        vmulss  %xmm0, %xmm1, %xmm0
        vmovss  e+127972(%rip), %xmm1           # xmm1 = mem[0],zero,zero,zero
        vmovss  %xmm0, -68(%rsp)                # 4-byte Spill
        vmovss  c+127972(%rip), %xmm0           # xmm0 = mem[0],zero,zero,zero
        vmovss  %xmm1, -76(%rsp)                # 4-byte Spill
        vmovss  %xmm0, -72(%rsp)                # 4-byte Spill
        vmulss  %xmm0, %xmm1, %xmm0
        vmovss  e+127976(%rip), %xmm1           # xmm1 = mem[0],zero,zero,zero
        vmovss  %xmm0, -80(%rsp)                # 4-byte Spill
        vmovss  c+127976(%rip), %xmm0           # xmm0 = mem[0],zero,zero,zero
        vmovss  %xmm1, -88(%rsp)                # 4-byte Spill
        vmovss  %xmm0, -84(%rsp)                # 4-byte Spill
        vmulss  %xmm0, %xmm1, %xmm0
        vmovss  c+127980(%rip), %xmm1           # xmm1 = mem[0],zero,zero,zero
        vmovss  %xmm0, -92(%rsp)                # 4-byte Spill
        vmulss  %xmm1, %xmm2, %xmm0
       vmovss  %xmm0, -96(%rsp)                # 4-byte Spill
        .p2align        4, 0x90
.LBB0_1:                                # =>This Loop Header: Depth=1
                                        #     Child Loop BB0_2 Depth 2
        vmovups -56(%rsp), %ymm14               # 32-byte Reload
        xorl    %ecx, %ecx
        .p2align        4, 0x90
.LBB0_2:                                #   Parent Loop BB0_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
        vmovups c(%rcx), %ymm7
        vmovaps %ymm14, %ymm15
        vmovups e(%rcx), %ymm0
        vaddps  b(%rcx), %ymm7, %ymm14
        vmulps  %ymm7, %ymm0, %ymm7
        vperm2f128      $33, %ymm14, %ymm15, %ymm15 # ymm15 =
ymm15[2,3],ymm14[0,1]
        vmovups %ymm14, a+4(%rcx)
        vmovups %ymm7, b(%rcx)
        vshufps $3, %ymm14, %ymm15, %ymm15      # ymm15 =
ymm15[3,0],ymm14[0,0],ymm15[7,4],ymm14[4,4]
        vshufps $152, %ymm14, %ymm15, %ymm15    # ymm15 =
ymm15[0,2],ymm14[1,2],ymm15[4,6],ymm14[5,6]
        vmulps  %ymm0, %ymm15, %ymm0
        vmovups %ymm0, d(%rcx)
        addq    $32, %rcx
        cmpq    $127968, %rcx                   # imm = 0x1F3E0
        jne     .LBB0_2
# %bb.3:                                #   in Loop: Header=BB0_1 Depth=1
        vextractf128    $1, %ymm14, %xmm0
        vmovss  -60(%rsp), %xmm7                # 4-byte Reload
                                        # xmm7 = mem[0],zero,zero,zero
        vmovss  -68(%rsp), %xmm6                # 4-byte Reload
                                        # xmm6 = mem[0],zero,zero,zero
        incl    %eax
        vpermilps       $231, %xmm0, %xmm0      # xmm0 = xmm0[3,1,2,3]
        vmulss  -64(%rsp), %xmm0, %xmm0         # 4-byte Folded Reload
        vaddss  b+127968(%rip), %xmm7, %xmm7
        vmovss  %xmm6, b+127968(%rip)
        vmovss  -80(%rsp), %xmm6                # 4-byte Reload
                                        # xmm6 = mem[0],zero,zero,zero
        vmovss  %xmm0, d+127968(%rip)
        vmovss  -72(%rsp), %xmm0                # 4-byte Reload
                                        # xmm0 = mem[0],zero,zero,zero
        vmovss  %xmm7, a+127972(%rip)
       vmulss  -76(%rsp), %xmm7, %xmm7         # 4-byte Folded Reload
        vaddss  b+127972(%rip), %xmm0, %xmm0
        vmovss  %xmm6, b+127972(%rip)
        vmovss  -84(%rsp), %xmm6                # 4-byte Reload
                                        # xmm6 = mem[0],zero,zero,zero
        vmovss  %xmm7, d+127972(%rip)
        vaddss  b+127976(%rip), %xmm6, %xmm7
        vmovss  -92(%rsp), %xmm6                # 4-byte Reload
                                        # xmm6 = mem[0],zero,zero,zero
        vmovss  %xmm0, a+127976(%rip)
        vmulss  -88(%rsp), %xmm0, %xmm0         # 4-byte Folded Reload
        vmovss  %xmm6, b+127976(%rip)
        vmovss  -96(%rsp), %xmm6                # 4-byte Reload
                                        # xmm6 = mem[0],zero,zero,zero
        vmovss  %xmm7, a+127980(%rip)
        vmulss  %xmm2, %xmm7, %xmm7
        vmovss  %xmm0, d+127976(%rip)
        vaddss  b+127980(%rip), %xmm1, %xmm0
        vmovss  %xmm7, d+127980(%rip)
        vmovss  %xmm6, b+127980(%rip)
        vaddss  b+127984(%rip), %xmm4, %xmm7
        vmovss  %xmm3, b+127984(%rip)
        vmovss  %xmm0, a+127984(%rip)
        vmulss  %xmm5, %xmm0, %xmm0
        vmovss  %xmm0, d+127984(%rip)
        vaddss  b+127988(%rip), %xmm8, %xmm0
        vmovss  %xmm10, b+127988(%rip)
        vaddss  b+127992(%rip), %xmm11, %xmm6
        vmovss  %xmm13, b+127992(%rip)
        vmovss  %xmm7, a+127988(%rip)
        vmulss  %xmm7, %xmm9, %xmm7
        vmovss  %xmm7, d+127988(%rip)
        vmovss  %xmm0, a+127992(%rip)
        vmulss  %xmm0, %xmm12, %xmm0
        vmovss  %xmm6, a+127996(%rip)
        vmovss  %xmm0, d+127992(%rip)
        cmpl    $100000, %eax                   # imm = 0x186A0
        jne     .LBB0_1
# %bb.4:
        vzeroupper
        retq

Runtime with clang is 0.443s and GCC 2.317s. With -fno-tree-vectorize I get
2.153s

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Bug middle-end/99408] s3251 benchmark of TSVC vectorized by clang runs about 7 times faster compared to gcc
  2021-03-05 14:11 [Bug middle-end/99408] New: s3251 benchmark of TSVC vectorized by clang runs about 7 times faster compared to gcc hubicka at gcc dot gnu.org
@ 2021-03-08  8:17 ` rguenth at gcc dot gnu.org
  2021-12-22 10:42 ` [Bug tree-optimization/99408] " pinskia at gcc dot gnu.org
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: rguenth at gcc dot gnu.org @ 2021-03-08  8:17 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99408

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Blocks|                            |53947
           Keywords|                            |missed-optimization

--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
Hum, GCC's code _looks_ faster.  Maybe it's our tendency to duplicate memory
accesses in vector instructions (there's a PR about this somewhere).  A
load uop on every stmt is likely the bottleneck here.


Referenced Bugs:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53947
[Bug 53947] [meta-bug] vectorizer missed-optimizations

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Bug tree-optimization/99408] s3251 benchmark of TSVC vectorized by clang runs about 7 times faster compared to gcc
  2021-03-05 14:11 [Bug middle-end/99408] New: s3251 benchmark of TSVC vectorized by clang runs about 7 times faster compared to gcc hubicka at gcc dot gnu.org
  2021-03-08  8:17 ` [Bug middle-end/99408] " rguenth at gcc dot gnu.org
@ 2021-12-22 10:42 ` pinskia at gcc dot gnu.org
  2022-11-16 17:03 ` hubicka at gcc dot gnu.org
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-12-22 10:42 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99408

Andrew Pinski <pinskia at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
           Severity|normal                      |enhancement
          Component|middle-end                  |tree-optimization

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Bug tree-optimization/99408] s3251 benchmark of TSVC vectorized by clang runs about 7 times faster compared to gcc
  2021-03-05 14:11 [Bug middle-end/99408] New: s3251 benchmark of TSVC vectorized by clang runs about 7 times faster compared to gcc hubicka at gcc dot gnu.org
  2021-03-08  8:17 ` [Bug middle-end/99408] " rguenth at gcc dot gnu.org
  2021-12-22 10:42 ` [Bug tree-optimization/99408] " pinskia at gcc dot gnu.org
@ 2022-11-16 17:03 ` hubicka at gcc dot gnu.org
  2023-01-11 19:03 ` hubicka at gcc dot gnu.org
  2023-01-14 22:33 ` hubicka at gcc dot gnu.org
  4 siblings, 0 replies; 6+ messages in thread
From: hubicka at gcc dot gnu.org @ 2022-11-16 17:03 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99408

--- Comment #2 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
This also reproduces with zen4 and double.

jh@alberti:~/tsvc/bin> cat tt.c
typedef double real_t;
#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];
void
main(void)
{
    for (int nl = 0; nl < iterations; nl++) {
        for (int i = 0; i < LEN_1D-1; i++){
            a[i+1] = b[i]+c[i];
            b[i]   = c[i]*e[i];
            d[i]   = a[i]*e[i];
        }
    }
}
jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=native tt.c
jh@alberti:~/tsvc/bin> time ./a.out

real    0m3.590s
user    0m3.585s
sys     0m0.004s
jh@alberti:~/tsvc/bin> clang -Ofast -march=native tt.c
tt.c:6:1: warning: return type of 'main' is not 'int' [-Wmain-return-type]
void
^
tt.c:6:1: note: change return type to 'int'
void
^~~~
int
1 warning generated.
jh@alberti:~/tsvc/bin> time ./a.out

real    0m1.538s
user    0m1.538s
sys     0m0.000s

gcc generates:

       │ 60:   vmovapd    0x67e080(%rax),%zmm7                                  
  0.15 │       vmovapd    0x601080(%rax),%zmm2                                  
  1.07 │       add        $0x40,%rax                                            
       │       vaddpd     0x6bc840(%rax),%zmm7,%zmm0                            
  0.00 │       vmovupd    %zmm0,0x6fb048(%rax)                                  
 11.10 │       vmulpd     0x601040(%rax),%zmm7,%zmm0                            
  9.46 │       vmovapd    %zmm0,0x6bc840(%rax)                                  
  0.01 │       vmulpd     0x6fb040(%rax),%zmm2,%zmm0                            
 78.20 │       vmovapd    %zmm0,0x63f840(%rax)                                  
       │       cmp        $0x3e7c0,%rax                                         
       │     ↑ jne        60                                                    

clang generates:
       │       nop                                                              
       │ a0:   vmovupd       (%r9,%rdx,1),%zmm15                                
  0.46 │       vmovupd       (%r9,%rdi,1),%zmm19                                
  0.22 │       vmovupd       0x40(%r9,%rdx,1),%zmm16                            
  0.56 │       vmovupd       0x40(%r9,%rdi,1),%zmm22                            
  0.92 │       vmovupd       0x80(%r9,%rdx,1),%zmm17                            
  1.85 │       vmovupd       0x80(%r9,%rdi,1),%zmm21                            
  1.51 │       vaddpd        (%r9,%rcx,1),%zmm15,%zmm18                         
  0.84 │       vmulpd        %zmm15,%zmm19,%zmm15                               
  0.47 │       vmovupd       %zmm15,(%r9,%rcx,1)                                
  3.37 │       vaddpd        0x40(%r9,%rcx,1),%zmm16,%zmm15                     
  0.56 │       vmulpd        %zmm16,%zmm22,%zmm16                               
  0.69 │       vmovupd       %zmm16,0x40(%r9,%rcx,1)                            
  3.82 │       vmovupd       %zmm18,0x8(%r9,%rsi,1)                             
  3.27 │       vmovapd       %zmm15,%zmm20                                      
       │       vmovupd       %zmm15,0x48(%r9,%rsi,1)                            
  3.60 │       vpermt2pd     %zmm18,%zmm13,%zmm20                               
  0.47 │       vpermt2pd     %zmm14,%zmm13,%zmm18                               
  0.36 │       vmulpd        %zmm19,%zmm18,%zmm18                               
  1.07 │       vmulpd        %zmm22,%zmm20,%zmm14                               
  1.33 │       vmovupd       %zmm18,(%r9,%r8,1)                                 
  6.31 │       vmovupd       %zmm14,0x40(%r9,%r8,1)                             
  8.02 │       vaddpd        0x80(%r9,%rcx,1),%zmm17,%zmm14                     
  0.53 │       vmovapd       %zmm14,%zmm16                                      
  0.05 │       vmovupd       %zmm14,0x88(%r9,%rsi,1)                            
  3.08 │       vpermt2pd     %zmm15,%zmm13,%zmm16                               
  0.41 │       vmulpd        %zmm17,%zmm21,%zmm15                               
  0.20 │       vmovupd       %zmm15,0x80(%r9,%rcx,1)                            
  1.60 │       vmulpd        %zmm21,%zmm16,%zmm15                               
  1.16 │       vmovupd       %zmm15,0x80(%r9,%r8,1)                             
  3.13 │       add           $0xc0,%r9                                          
       │       cmp           $0x3e7c0,%r9                                       
  0.03 │     ↑ jne           a0       

So a forward-dependency here?

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Bug tree-optimization/99408] s3251 benchmark of TSVC vectorized by clang runs about 7 times faster compared to gcc
  2021-03-05 14:11 [Bug middle-end/99408] New: s3251 benchmark of TSVC vectorized by clang runs about 7 times faster compared to gcc hubicka at gcc dot gnu.org
                   ` (2 preceding siblings ...)
  2022-11-16 17:03 ` hubicka at gcc dot gnu.org
@ 2023-01-11 19:03 ` hubicka at gcc dot gnu.org
  2023-01-14 22:33 ` hubicka at gcc dot gnu.org
  4 siblings, 0 replies; 6+ messages in thread
From: hubicka at gcc dot gnu.org @ 2023-01-11 19:03 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99408

--- Comment #3 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
With zen4 the gcc-built loop takes 19s, while aocc takes 6.6s.

aocc:

.LBB0_1:                                # %for.cond22.preheader
                                        # =>This Loop Header: Depth=1
                                        #     Child Loop BB0_2 Depth 2
        vbroadcastss    a(%rip), %zmm20
        xorl    %ecx, %ecx
        .p2align        4, 0x90
.LBB0_2:                                # %vector.body
                                        #   Parent Loop BB0_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
        vmovups c(%rcx), %zmm13
        vmovaps %zmm20, %zmm12
        vmovups e(%rcx), %zmm0
        vaddps  b(%rcx), %zmm13, %zmm20
        vmulps  %zmm13, %zmm0, %zmm13
        vmovaps %zmm20, %zmm15
        vpermt2ps       %zmm12, %zmm29, %zmm15
        vmovups %zmm20, a+4(%rcx)
        vmovups %zmm13, b(%rcx)
        vmulps  %zmm0, %zmm15, %zmm12
        vmovups %zmm12, d(%rcx)
        addq    $64, %rcx
        cmpq    $127936, %rcx                   # imm = 0x1F3C0
        jne     .LBB0_2
# %bb.3:                                # %middle.block

        vextractf32x4   $3, %zmm20, %xmm5
        vmovss  -4(%rsp), %xmm2                 # 4-byte Reload
                                        # xmm2 = mem[0],zero,zero,zero
        vmovss  -12(%rsp), %xmm0                # 4-byte Reload
                                        # xmm0 = mem[0],zero,zero,zero
        incl    %eax
        vaddss  b+127936(%rip), %xmm2, %xmm2
        vpermilps       $231, %xmm5, %xmm5      # xmm5 = xmm5[3,1,2,3]
        vmulss  -8(%rsp), %xmm5, %xmm5          # 4-byte Folded Reload
        vmovss  %xmm0, b+127936(%rip)
        vmovss  -16(%rsp), %xmm0                # 4-byte Reload
                                        # xmm0 = mem[0],zero,zero,zero
        vmovss  %xmm2, a+127940(%rip)
        vmulss  -20(%rsp), %xmm2, %xmm2         # 4-byte Folded Reload
        vmovss  %xmm5, d+127936(%rip)
        vaddss  b+127940(%rip), %xmm0, %xmm5
        vmovss  -24(%rsp), %xmm0                # 4-byte Reload
                                        # xmm0 = mem[0],zero,zero,zero
        vmovss  %xmm0, b+127940(%rip)
        vmovss  -28(%rsp), %xmm0                # 4-byte Reload
                                        # xmm0 = mem[0],zero,zero,zero
        vmovss  %xmm2, d+127940(%rip)
        vaddss  b+127944(%rip), %xmm0, %xmm2
        vmovss  -36(%rsp), %xmm0                # 4-byte Reload
                                        # xmm0 = mem[0],zero,zero,zero
        vmovss  %xmm5, a+127944(%rip)
        vmulss  -32(%rsp), %xmm5, %xmm5         # 4-byte Folded Reload
        vmovss  %xmm0, b+127944(%rip)
        vmovss  -40(%rsp), %xmm0                # 4-byte Reload
                                        # xmm0 = mem[0],zero,zero,zero
        vmovss  %xmm2, a+127948(%rip)
        vmulss  %xmm22, %xmm2, %xmm2
        vmovss  %xmm5, d+127944(%rip)
        vaddss  b+127948(%rip), %xmm21, %xmm5
        vmovss  %xmm2, d+127948(%rip)
        vmovss  %xmm0, b+127948(%rip)
        vmovss  -44(%rsp), %xmm0                # 4-byte Reload
                                        # xmm0 = mem[0],zero,zero,zero
        vaddss  b+127952(%rip), %xmm24, %xmm2
        vmovss  %xmm5, a+127952(%rip)
        vmulss  %xmm25, %xmm5, %xmm5
        vmovss  %xmm0, b+127952(%rip)
        vmovss  -48(%rsp), %xmm0                # 4-byte Reload
                                        # xmm0 = mem[0],zero,zero,zero
        vmovss  %xmm5, d+127952(%rip)
        vaddss  b+127956(%rip), %xmm27, %xmm5
        vmovss  %xmm2, a+127956(%rip)
        vmulss  %xmm28, %xmm2, %xmm2
        vmovss  %xmm2, d+127956(%rip)
        vmovss  %xmm0, b+127956(%rip)
        vmovss  -52(%rsp), %xmm0                # 4-byte Reload
                                        # xmm0 = mem[0],zero,zero,zero
        vaddss  b+127960(%rip), %xmm30, %xmm2
        vmovss  %xmm5, a+127960(%rip)
        vmulss  %xmm31, %xmm5, %xmm5
        vmovss  %xmm5, d+127960(%rip)
        vmovss  %xmm0, b+127960(%rip)
        vmovss  -56(%rsp), %xmm0                # 4-byte Reload
                                        # xmm0 = mem[0],zero,zero,zero
        vaddss  b+127964(%rip), %xmm16, %xmm5
        vmovss  %xmm2, a+127964(%rip)
        vmulss  %xmm18, %xmm2, %xmm2
        vmovss  %xmm2, d+127964(%rip)
        vmovss  %xmm0, b+127964(%rip)
        vmovss  -60(%rsp), %xmm0                # 4-byte Reload
                                        # xmm0 = mem[0],zero,zero,zero
        vaddss  b+127968(%rip), %xmm19, %xmm2
        vmovss  %xmm5, a+127968(%rip)
        vmulss  %xmm1, %xmm5, %xmm5
        vmovss  %xmm5, d+127968(%rip)
        vmovss  %xmm0, b+127968(%rip)
        vmovss  -64(%rsp), %xmm0                # 4-byte Reload
                                        # xmm0 = mem[0],zero,zero,zero
        vaddss  b+127972(%rip), %xmm3, %xmm5
        vmovss  %xmm2, a+127972(%rip)
        vmulss  %xmm4, %xmm2, %xmm2
        vmovss  %xmm2, d+127972(%rip)
        vmovss  %xmm0, b+127972(%rip)
        vmovss  -68(%rsp), %xmm0                # 4-byte Reload
                                        # xmm0 = mem[0],zero,zero,zero
        vaddss  b+127976(%rip), %xmm6, %xmm2
        vmovss  %xmm5, a+127976(%rip)
        vmulss  %xmm7, %xmm5, %xmm5
        vmovss  %xmm5, d+127976(%rip)
        vmovss  %xmm0, b+127976(%rip)
        vmovss  -72(%rsp), %xmm0                # 4-byte Reload
                                        # xmm0 = mem[0],zero,zero,zero
        vaddss  b+127980(%rip), %xmm9, %xmm5
        vmovss  %xmm2, a+127980(%rip)
        vmulss  %xmm2, %xmm10, %xmm2
        vmovss  %xmm2, d+127980(%rip)
        vmovss  %xmm0, b+127980(%rip)
        vmovss  -76(%rsp), %xmm0                # 4-byte Reload
                                        # xmm0 = mem[0],zero,zero,zero
        vaddss  b+127984(%rip), %xmm14, %xmm2
        vmovss  %xmm5, a+127984(%rip)
        vmulss  %xmm17, %xmm5, %xmm5
        vmovss  %xmm5, d+127984(%rip)
        vmovss  %xmm0, b+127984(%rip)
        vmovss  -80(%rsp), %xmm0                # 4-byte Reload
                                        # xmm0 = mem[0],zero,zero,zero
        vaddss  b+127988(%rip), %xmm23, %xmm5
        vmovss  %xmm2, a+127988(%rip)
        vmulss  %xmm26, %xmm2, %xmm2
        vmovss  %xmm2, d+127988(%rip)
        vmovss  %xmm0, b+127988(%rip)
        vaddss  b+127992(%rip), %xmm8, %xmm2
        vmulss  %xmm5, %xmm11, %xmm0
        vmovss  %xmm5, a+127992(%rip)
        vmovss  %xmm0, d+127992(%rip)
        vmovss  -84(%rsp), %xmm0                # 4-byte Reload
                                        # xmm0 = mem[0],zero,zero,zero
        vmovss  %xmm2, a+127996(%rip)
        vmovss  %xmm0, b+127992(%rip)
        cmpl    $1000000, %eax                  # imm = 0xF4240
        jne     .LBB0_1

gcc:
.L2:
        vmovdqa32       %zmm5, %zmm1
        addq    $320, %rax
        vpaddd  %zmm2, %zmm5, %zmm5
        vmovdqa32       %zmm6, %zmm0
        vpaddd  %zmm2, %zmm6, %zmm6
        vpaddd  %zmm24, %zmm1, %zmm25
        vpaddd  %zmm23, %zmm1, %zmm1
        valignq $3, %ymm25, %ymm25, %ymm26
        vmovq   %xmm25, -320(%rax)
        vpextrq $1, %xmm25, -300(%rax)
        vmovq   %xmm1, -160(%rax)
        vpextrq $1, %xmm1, -140(%rax)
        vextracti64x2   $1, %ymm25, %xmm27
        vextracti64x4   $0x1, %zmm25, %ymm25
        vmovq   %xmm26, -260(%rax)
        vmovq   %xmm25, -240(%rax)
        vpextrq $1, %xmm25, -220(%rax)
        vextracti64x2   $1, %ymm25, %xmm26
        vmovq   %xmm27, -280(%rax)
        valignq $3, %ymm25, %ymm25, %ymm25
        vmovq   %xmm26, -200(%rax)
        vmovq   %xmm25, -180(%rax)
        valignq $3, %ymm1, %ymm1, %ymm25
        vextracti64x2   $1, %ymm1, %xmm26
        vextracti64x4   $0x1, %zmm1, %ymm1
        vmovq   %xmm25, -100(%rax)
        vmovq   %xmm1, -80(%rax)
        vpextrq $1, %xmm1, -60(%rax)
        vextracti64x2   $1, %ymm1, %xmm25
        vmovq   %xmm26, -120(%rax)
        vmovdqa32       %zmm0, %zmm26
        valignq $3, %ymm1, %ymm1, %ymm1
        vmovq   %xmm25, -40(%rax)
        vpaddd  %zmm3, %zmm0, %zmm25
        vmovq   %xmm1, -20(%rax)
        vpaddd  %zmm4, %zmm0, %zmm1
        vpermt2d        %zmm1, %zmm22, %zmm26
        vmovq   %xmm26, -312(%rax)
        vmovdqa32       %zmm0, %zmm26
        vpermt2d        %zmm1, %zmm21, %zmm26
        vmovq   %xmm26, %rdx
        vmovdqa32       %zmm0, %zmm26
        movq    %rdx, -292(%rax)
        vpermt2d        %zmm1, %zmm20, %zmm26
        vmovq   %xmm26, -272(%rax)
        vmovdqa32       %zmm0, %zmm26
        vpermt2d        %zmm1, %zmm19, %zmm26
        vmovq   %xmm26, %rdx
        vmovdqa32       %zmm0, %zmm26
        movq    %rdx, -252(%rax)
        vpermt2d        %zmm1, %zmm18, %zmm26
        vmovq   %xmm26, -232(%rax)
        vmovdqa32       %zmm0, %zmm26
        vpermt2d        %zmm1, %zmm17, %zmm26
        vmovq   %xmm26, %rdx
        vmovdqa32       %zmm0, %zmm26
        movq    %rdx, -212(%rax)
        vpermt2d        %zmm1, %zmm16, %zmm26
        vmovq   %xmm26, -192(%rax)
        vmovdqa32       %zmm0, %zmm26
        vpermt2d        %zmm1, %zmm15, %zmm26
        vmovq   %xmm26, %rdx
        vmovdqa32       %zmm0, %zmm26
        movq    %rdx, -172(%rax)
        vpermt2d        %zmm1, %zmm14, %zmm26
        vmovq   %xmm26, -152(%rax)
        vmovdqa32       %zmm0, %zmm26
        vpermt2d        %zmm1, %zmm13, %zmm26
        vmovq   %xmm26, %rdx
        vmovdqa32       %zmm0, %zmm26
        movq    %rdx, -132(%rax)
        vpermt2d        %zmm1, %zmm12, %zmm26
        vmovq   %xmm26, -112(%rax)
        vmovdqa32       %zmm0, %zmm26
        vpermt2d        %zmm1, %zmm11, %zmm26
        vmovq   %xmm26, %rdx
        vmovdqa32       %zmm0, %zmm26
        movq    %rdx, -92(%rax)
        vpermt2d        %zmm1, %zmm10, %zmm26
        vmovq   %xmm26, -72(%rax)
        vmovdqa32       %zmm0, %zmm26
        vpermt2d        %zmm1, %zmm9, %zmm26
        vmovq   %xmm26, %rdx
        vmovdqa32       %zmm0, %zmm26
        vpermt2d        %zmm1, %zmm7, %zmm0
        vmovq   %xmm0, -12(%rax)
        movq    %rdx, -52(%rax)
        vmovdqa32       %ymm25, %ymm0
        vpermt2d        %zmm1, %zmm8, %zmm26
        vextracti32x4   $1, %ymm25, %xmm1
        vmovq   %xmm26, -32(%rax)
        vmovd   %xmm25, -304(%rax)
        vpextrd $1, %xmm0, -284(%rax)
        vpextrd $2, %xmm0, -264(%rax)
        vmovd   %xmm1, -224(%rax)
        valignd $5, %ymm25, %ymm25, %ymm1
        vpextrd $3, %xmm0, -244(%rax)
        valignd $7, %ymm25, %ymm25, %ymm0
        vmovd   %xmm1, -204(%rax)
        valignd $6, %ymm25, %ymm25, %ymm1
        vmovd   %xmm0, -164(%rax)
        vextracti32x8   $0x1, %zmm25, %ymm0
        vmovd   %xmm0, -144(%rax)
        vpextrd $1, %xmm0, -124(%rax)
        vmovd   %xmm1, -184(%rax)
        vextracti32x4   $1, %ymm0, %xmm1
        vpextrd $2, %xmm0, -104(%rax)
        vpextrd $3, %xmm0, -84(%rax)
        vmovd   %xmm1, -64(%rax)
        valignd $5, %ymm0, %ymm0, %ymm1
        vmovd   %xmm1, -44(%rax)
        valignd $6, %ymm0, %ymm0, %ymm1
        valignd $7, %ymm0, %ymm0, %ymm0
        vmovd   %xmm1, -24(%rax)
        vmovd   %xmm0, -4(%rax)
        cmpq    %rcx, %rax
        jne     .L2

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Bug tree-optimization/99408] s3251 benchmark of TSVC vectorized by clang runs about 7 times faster compared to gcc
  2021-03-05 14:11 [Bug middle-end/99408] New: s3251 benchmark of TSVC vectorized by clang runs about 7 times faster compared to gcc hubicka at gcc dot gnu.org
                   ` (3 preceding siblings ...)
  2023-01-11 19:03 ` hubicka at gcc dot gnu.org
@ 2023-01-14 22:33 ` hubicka at gcc dot gnu.org
  4 siblings, 0 replies; 6+ messages in thread
From: hubicka at gcc dot gnu.org @ 2023-01-14 22:33 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99408

--- Comment #4 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
On Zen4 it is 20s for gcc and 6.9s for aocc, so still a problem.

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2023-01-14 22:33 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-05 14:11 [Bug middle-end/99408] New: s3251 benchmark of TSVC vectorized by clang runs about 7 times faster compared to gcc hubicka at gcc dot gnu.org
2021-03-08  8:17 ` [Bug middle-end/99408] " rguenth at gcc dot gnu.org
2021-12-22 10:42 ` [Bug tree-optimization/99408] " pinskia at gcc dot gnu.org
2022-11-16 17:03 ` hubicka at gcc dot gnu.org
2023-01-11 19:03 ` hubicka at gcc dot gnu.org
2023-01-14 22:33 ` hubicka at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).