public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug middle-end/108376] New: TSVC s1279 runs 40% faster with aocc than gcc at zen4
@ 2023-01-11 18:56 hubicka at gcc dot gnu.org
  2023-01-11 20:21 ` [Bug middle-end/108376] " amonakov at gcc dot gnu.org
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: hubicka at gcc dot gnu.org @ 2023-01-11 18:56 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108376

            Bug ID: 108376
           Summary: TSVC s1279 runs 40% faster with aocc than gcc at zen4
           Product: gcc
           Version: 13.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hubicka at gcc dot gnu.org
  Target Milestone: ---

jh@alberti:~/tsvc/bin> more s1279.c
#include <math.h>
#include <malloc.h>

// Element type used by every array in this testcase.
typedef float real_t;
// Number of times main() repeats the inner kernel loop.
#define iterations 1000000
// Element count of the one-dimensional arrays a..e.
#define LEN_1D 32000
// Extent of each dimension of the square arrays aa, bb, cc.
#define LEN_2D 256
// Operands of the kernel loop in main(). No initializers are given, so as
// file-scope objects they start out zero-filled.
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];
// aa, bb and cc appear in main() only inside a commented-out dummy() call;
// qq is not referenced anywhere in this listing.
real_t aa[LEN_2D][LEN_2D];
real_t bb[LEN_2D][LEN_2D];
real_t cc[LEN_2D][LEN_2D];
real_t qq;
// Reproducer for the s1279 kernel: a doubly-guarded multiply-add over the
// global arrays, repeated `iterations` times.
// NOTE(review): the exact shape of these loops is what the compilers are
// being compared on, so the code is left untouched; only comments are added.
int
main(void)
{
//    reductions
//    if to max reduction

    // NOTE(review): x is never assigned anywhere in this function, yet it is
    // the value returned at the end, so the exit status is indeterminate.
    real_t x;
    // Index buffer with room for LEN_1D elements. It is sized with
    // sizeof(real_t) although it holds ints (presumably both are 4 bytes on
    // the target -- confirm). The malloc result is not checked, and the
    // buffer is never read back or freed within this function.
    int * __restrict__ ip = (int *) malloc(LEN_1D*sizeof(real_t));

    // Fill ip five entries at a time with a fixed shuffle of each group's own
    // indices: i+4, i+2, i, i+3, i+1. LEN_1D (32000) is a multiple of 5, so
    // the last group ends exactly at index LEN_1D-1.
    for (int i = 0; i < LEN_1D; i = i+5){
        (ip)[i]   = (i+4);
        (ip)[i+1] = (i+2);
        (ip)[i+2] = (i);
        (ip)[i+3] = (i+3);
        (ip)[i+4] = (i+1);
    }
    // Repeat the same pass over the arrays `iterations` times.
    for (int nl = 0; nl < iterations; nl++) {
        for (int i = 0; i < LEN_1D; i++) {
            // c[i] is updated only when a[i] is negative and b[i] exceeds
            // a[i]. Nothing in this function writes a[], so with its
            // zero-filled initial contents the outer test is false for
            // every element.
            if (a[i] < (real_t)0.) {
                if (b[i] > a[i]) {
                    c[i] += d[i] * e[i];
                }
            }
        }
        // The call below is disabled; no dummy() is declared in this listing.
        //dummy(a, b, c, d, e, aa, bb, cc, 0.);
    }

    // Returns the never-assigned x (see the note at its declaration).
    return x;
}
jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=native s1279.c   
jh@alberti:~/tsvc/bin> perf stat ./a.out

 Performance counter stats for './a.out':

           2762.85 msec task-clock:u              #    0.999 CPUs utilized      
                 0      context-switches:u        #    0.000 /sec               
                 0      cpu-migrations:u          #    0.000 /sec               
               265      page-faults:u             #   95.915 /sec               
       10155904052      cycles:u                  #    3.676 GHz               
      (83.34%)
             20767      stalled-cycles-frontend:u #    0.00% frontend cycles
idle     (83.36%)
             36970      stalled-cycles-backend:u  #    0.00% backend cycles
idle      (83.36%)
       27985795691      instructions:u            #    2.76  insn per cycle     
                                                  #    0.00  stalled cycles per
insn  (83.36%)
        1999265642      branches:u                #  723.624 M/sec             
      (83.36%)
            502031      branch-misses:u           #    0.03% of all branches   
      (83.23%)

       2.764553907 seconds time elapsed

       2.763249000 seconds user
       0.000000000 seconds sys


jh@alberti:~/tsvc/bin> ~/aocc-compiler-4.0.0/bin/clang -Ofast -march=native
s1279.c 
jh@alberti:~/tsvc/bin> perf stat ./a.out

 Performance counter stats for './a.out':

           1980.94 msec task-clock:u              #    0.999 CPUs utilized      
                 0      context-switches:u        #    0.000 /sec               
                 0      cpu-migrations:u          #    0.000 /sec               
                77      page-faults:u             #   38.871 /sec               
        7261166980      cycles:u                  #    3.666 GHz               
      (83.25%)
             16796      stalled-cycles-frontend:u #    0.00% frontend cycles
idle     (83.25%)
             34506      stalled-cycles-backend:u  #    0.00% backend cycles
idle      (83.25%)
       10498254812      instructions:u            #    1.45  insn per cycle     
                                                  #    0.00  stalled cycles per
insn  (83.40%)
        1500160478      branches:u                #  757.299 M/sec             
      (83.45%)
           1000905      branch-misses:u           #    0.07% of all branches   
      (83.40%)

       1.982364055 seconds time elapsed

       1.981460000 seconds user
       0.000000000 seconds sys


aocc does:
.LBB0_6:                                # %for.inc43.vec.bb
                                        #   in Loop: Header=BB0_2 Depth=2
        addq    $256, %rcx                      # imm = 0x100
        cmpq    $128000, %rcx                   # imm = 0x1F400
        je      .LBB0_7
.LBB0_2:                                # %vector.body
                                        #   Parent Loop BB0_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
        vmovups a(%rcx), %zmm1
        vmovups a+64(%rcx), %zmm2
        vmovups a+128(%rcx), %zmm3
        vmovups a+192(%rcx), %zmm4
                                        # implicit-def: $k4
        vcmpltps        %zmm0, %zmm1, %k0
        vcmpltps        %zmm0, %zmm2, %k1
        vcmpltps        %zmm0, %zmm3, %k2
        vcmpltps        %zmm0, %zmm4, %k3
        kunpckwd        %k0, %k1, %k0
        kunpckwd        %k2, %k3, %k1   
                                        # implicit-def: $k2
                                        # implicit-def: $k3
        kunpckdq        %k0, %k1, %k0   
                                        # implicit-def: $k1
        kortestq        %k0, %k0
        je      .LBB0_4
# %bb.3:                                # %if.then.vec.bb
                                        #   in Loop: Header=BB0_2 Depth=2
        vcmpltps        b(%rcx), %zmm1, %k1
        vcmpltps        b+64(%rcx), %zmm2, %k2
        vcmpltps        b+128(%rcx), %zmm3, %k3
        vcmpltps        b+192(%rcx), %zmm4, %k4
.LBB0_4:                                # %if.then.vec.join.bb
                                        #   in Loop: Header=BB0_2 Depth=2
        kunpckwd        %k1, %k2, %k5
        kunpckwd        %k3, %k4, %k6
        kunpckdq        %k5, %k6, %k5
        ktestq  %k0, %k5
        je      .LBB0_6

So mask registers do the conditionals,
and GCC with 256-bit vectors does:

.L2:
        vmovdqa %ymm7, %ymm1
        vmovdqa %ymm8, %ymm0
        addq    $160, %rax
        vpaddd  %ymm4, %ymm8, %ymm8
        vpaddd  %ymm18, %ymm1, %ymm2
        vpaddd  %ymm17, %ymm1, %ymm1
        vpaddd  %ymm4, %ymm7, %ymm7
        vextracti64x2   $1, %ymm2, %xmm3
        vmovq   %xmm2, -160(%rax)
        vpextrq $1, %xmm2, -140(%rax)
        vmovq   %xmm1, -80(%rax)
        vpextrq $1, %xmm1, -60(%rax)
        valignq $3, %ymm2, %ymm2, %ymm2
        vmovq   %xmm3, -120(%rax)
        vmovdqa %ymm0, %ymm3
        vmovq   %xmm2, -100(%rax)
        vextracti64x2   $1, %ymm1, %xmm2
        valignq $3, %ymm1, %ymm1, %ymm1
        vmovq   %xmm2, -40(%rax)
        vpaddd  %ymm5, %ymm0, %ymm2
        vmovd   %xmm2, -144(%rax)
        vpextrd $1, %xmm2, -124(%rax)
        vpextrd $2, %xmm2, -104(%rax)
        vmovq   %xmm1, -20(%rax)
        vpaddd  %ymm6, %ymm0, %ymm1
        vpermt2d        %ymm1, %ymm16, %ymm3
        vpextrd $3, %xmm2, -84(%rax)
        vmovq   %xmm3, -152(%rax)
        vmovdqa %ymm0, %ymm3
        vpermt2d        %ymm1, %ymm15, %ymm3
        vmovq   %xmm3, -132(%rax)
        vmovdqa %ymm0, %ymm3
        vpermt2d        %ymm1, %ymm14, %ymm3
        vmovq   %xmm3, -112(%rax)
        vmovdqa %ymm0, %ymm3
        vpermt2d        %ymm1, %ymm13, %ymm3
        vmovq   %xmm3, -92(%rax)
        vmovdqa %ymm0, %ymm3
        vpermt2d        %ymm1, %ymm12, %ymm3
        vmovq   %xmm3, -72(%rax)
        vmovdqa %ymm0, %ymm3
        vpermt2d        %ymm1, %ymm11, %ymm3
        vmovq   %xmm3, -52(%rax)
        vmovdqa %ymm0, %ymm3
        vpermt2d        %ymm1, %ymm9, %ymm0
        vmovq   %xmm0, -12(%rax)
        vpermt2d        %ymm1, %ymm10, %ymm3
        vextracti32x4   $1, %ymm2, %xmm0
        vmovq   %xmm3, -32(%rax)
        vmovd   %xmm0, -64(%rax)
        valignd $5, %ymm2, %ymm2, %ymm0
        vmovd   %xmm0, -44(%rax)
        valignd $6, %ymm2, %ymm2, %ymm0
        valignd $7, %ymm2, %ymm2, %ymm2
        vmovd   %xmm0, -24(%rax)
        vmovd   %xmm2, -4(%rax)
        cmpq    %rax, %rcx
        jne     .L2

with 512bit vectors:

.L2:
        vmovdqa32       %zmm5, %zmm1
        addq    $320, %rax
        vpaddd  %zmm2, %zmm5, %zmm5
        vmovdqa32       %zmm6, %zmm0
        vpaddd  %zmm2, %zmm6, %zmm6
        vpaddd  %zmm24, %zmm1, %zmm25
        vpaddd  %zmm23, %zmm1, %zmm1
        valignq $3, %ymm25, %ymm25, %ymm26
        vmovq   %xmm25, -320(%rax)
        vpextrq $1, %xmm25, -300(%rax)
        vmovq   %xmm1, -160(%rax)
        vpextrq $1, %xmm1, -140(%rax)
        vextracti64x2   $1, %ymm25, %xmm27
        vextracti64x4   $0x1, %zmm25, %ymm25
        vmovq   %xmm26, -260(%rax)
        vmovq   %xmm25, -240(%rax)
        vpextrq $1, %xmm25, -220(%rax)
        vextracti64x2   $1, %ymm25, %xmm26
        vmovq   %xmm27, -280(%rax)
        valignq $3, %ymm25, %ymm25, %ymm25
        vmovq   %xmm26, -200(%rax)
        vmovq   %xmm25, -180(%rax)
        valignq $3, %ymm1, %ymm1, %ymm25
        vextracti64x2   $1, %ymm1, %xmm26
        vextracti64x4   $0x1, %zmm1, %ymm1
        vmovq   %xmm25, -100(%rax)
        vmovq   %xmm1, -80(%rax)
        vpextrq $1, %xmm1, -60(%rax)
        vextracti64x2   $1, %ymm1, %xmm25
        vmovq   %xmm26, -120(%rax)
        vmovdqa32       %zmm0, %zmm26
        valignq $3, %ymm1, %ymm1, %ymm1
        vmovq   %xmm25, -40(%rax)
        vpaddd  %zmm3, %zmm0, %zmm25
        vmovq   %xmm1, -20(%rax)
        vpaddd  %zmm4, %zmm0, %zmm1
        vpermt2d        %zmm1, %zmm22, %zmm26
        vmovq   %xmm26, -312(%rax)
        vmovdqa32       %zmm0, %zmm26
        vpermt2d        %zmm1, %zmm21, %zmm26
        vmovq   %xmm26, %rdx
        vmovdqa32       %zmm0, %zmm26
        movq    %rdx, -292(%rax)
        vpermt2d        %zmm1, %zmm20, %zmm26
        vmovq   %xmm26, -272(%rax)
        vmovdqa32       %zmm0, %zmm26
        vpermt2d        %zmm1, %zmm19, %zmm26
        vmovq   %xmm26, %rdx
        vmovdqa32       %zmm0, %zmm26
        movq    %rdx, -252(%rax)
        vpermt2d        %zmm1, %zmm18, %zmm26
        vmovq   %xmm26, -232(%rax)
        vmovdqa32       %zmm0, %zmm26
        vpermt2d        %zmm1, %zmm17, %zmm26
        vmovq   %xmm26, %rdx
        vmovdqa32       %zmm0, %zmm26
        movq    %rdx, -212(%rax)
        vpermt2d        %zmm1, %zmm16, %zmm26
        vmovq   %xmm26, -192(%rax)
        vmovdqa32       %zmm0, %zmm26
        vmovq   %xmm26, %rdx
        vmovdqa32       %zmm0, %zmm26
        movq    %rdx, -172(%rax)
        vpermt2d        %zmm1, %zmm14, %zmm26
        vmovq   %xmm26, -152(%rax)
        vmovdqa32       %zmm0, %zmm26
        vpermt2d        %zmm1, %zmm13, %zmm26
        vmovq   %xmm26, %rdx
        vmovdqa32       %zmm0, %zmm26
        movq    %rdx, -132(%rax)
        vpermt2d        %zmm1, %zmm12, %zmm26
        vmovq   %xmm26, -112(%rax)
        vmovdqa32       %zmm0, %zmm26
        vpermt2d        %zmm1, %zmm11, %zmm26
        vmovq   %xmm26, %rdx
        vmovdqa32       %zmm0, %zmm26
        movq    %rdx, -92(%rax)
        vpermt2d        %zmm1, %zmm10, %zmm26
        vmovq   %xmm26, -72(%rax)
        vmovdqa32       %zmm0, %zmm26
        vpermt2d        %zmm1, %zmm9, %zmm26
        vmovq   %xmm26, %rdx
        vmovdqa32       %zmm0, %zmm26
        vpermt2d        %zmm1, %zmm7, %zmm0
        vmovq   %xmm0, -12(%rax)
        movq    %rdx, -52(%rax)
        vmovdqa32       %ymm25, %ymm0
        vpermt2d        %zmm1, %zmm8, %zmm26
        vextracti32x4   $1, %ymm25, %xmm1
        vmovq   %xmm26, -32(%rax)
        vmovd   %xmm25, -304(%rax)
        vpextrd $1, %xmm0, -284(%rax)
        vpextrd $2, %xmm0, -264(%rax)
        vmovd   %xmm1, -224(%rax)
        valignd $5, %ymm25, %ymm25, %ymm1
        vpextrd $3, %xmm0, -244(%rax)
        valignd $7, %ymm25, %ymm25, %ymm0
        vmovd   %xmm1, -204(%rax)
        valignd $6, %ymm25, %ymm25, %ymm1
        vmovd   %xmm0, -164(%rax)
        vextracti32x8   $0x1, %zmm25, %ymm0
        vmovd   %xmm0, -144(%rax)
        vpextrd $1, %xmm0, -124(%rax)
        vmovd   %xmm1, -184(%rax)
        vextracti32x4   $1, %ymm0, %xmm1
        vpextrd $2, %xmm0, -104(%rax)
        vpextrd $3, %xmm0, -84(%rax)
        vmovd   %xmm1, -64(%rax)
        valignd $5, %ymm0, %ymm0, %ymm1
        vmovd   %xmm1, -44(%rax)
        valignd $6, %ymm0, %ymm0, %ymm1
        valignd $7, %ymm0, %ymm0, %ymm0
        vmovd   %xmm1, -24(%rax)
        vmovd   %xmm0, -4(%rax)
        cmpq    %rax, %rcx
        jne     .L2

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug middle-end/108376] TSVC s1279 runs 40% faster with aocc than gcc at zen4
  2023-01-11 18:56 [Bug middle-end/108376] New: TSVC s1279 runs 40% faster with aocc than gcc at zen4 hubicka at gcc dot gnu.org
@ 2023-01-11 20:21 ` amonakov at gcc dot gnu.org
  2023-01-12 10:34 ` rguenth at gcc dot gnu.org
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: amonakov at gcc dot gnu.org @ 2023-01-11 20:21 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108376

Alexander Monakov <amonakov at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |amonakov at gcc dot gnu.org

--- Comment #1 from Alexander Monakov <amonakov at gcc dot gnu.org> ---
I think your GCC dumps are for the wrong loop.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug middle-end/108376] TSVC s1279 runs 40% faster with aocc than gcc at zen4
  2023-01-11 18:56 [Bug middle-end/108376] New: TSVC s1279 runs 40% faster with aocc than gcc at zen4 hubicka at gcc dot gnu.org
  2023-01-11 20:21 ` [Bug middle-end/108376] " amonakov at gcc dot gnu.org
@ 2023-01-12 10:34 ` rguenth at gcc dot gnu.org
  2023-01-14 22:30 ` hubicka at gcc dot gnu.org
  2024-02-09 13:47 ` rguenth at gcc dot gnu.org
  3 siblings, 0 replies; 5+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-01-12 10:34 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108376

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
   Last reconfirmed|                            |2023-01-12
             Status|UNCONFIRMED                 |NEW
     Ever confirmed|0                           |1

--- Comment #2 from Richard Biener <rguenth at gcc dot gnu.org> ---
As far as I can see a[] is all zeros.  AOCC basically preserves the
loop control flow, skipping the guarded code when if (a[i] < 0.) is false
for all elements processed in the iteration, likewise for
if (b[i] > a[i]), but GCC if-converts this all down to combined masking
of the guarded code.

I think the testcase as-is is too artificial to be relevant.  GCC
has code to do such a thing to convert masked stores, but in this case
we are not using masked stores or masked loads:

.L3:
        vmovaps a(%rax), %ymm3
        vmovaps b(%rax), %ymm4
        vmovaps c(%rax), %ymm7
        addq    $32, %rax
        vmovaps c-32(%rax), %ymm0
        vmovaps e-32(%rax), %ymm5
        vcmpps  $1, %ymm1, %ymm3, %k1
        vcmpps  $14, %ymm3, %ymm4, %k1{%k1}
        vfmadd231ps     d-32(%rax), %ymm5, %ymm0{%k1}
        vfmadd231ps     d-32(%rax), %ymm5, %ymm0
        vblendmps       %ymm0, %ymm7, %ymm0{%k1}
        vmovaps %ymm0, c-32(%rax)
        cmpq    $128000, %rax
        jne     .L3

I suspect if you do a less optimal initialization of a/b then the AOCC
code will be slower.

Note GCC applies unroll-and-jam to the loop (the outer iteration is
visibly redundant, so we are eventually doing half of the work as AOCC ;))

Confirmed for us not vectorizing control flow but if-converting.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug middle-end/108376] TSVC s1279 runs 40% faster with aocc than gcc at zen4
  2023-01-11 18:56 [Bug middle-end/108376] New: TSVC s1279 runs 40% faster with aocc than gcc at zen4 hubicka at gcc dot gnu.org
  2023-01-11 20:21 ` [Bug middle-end/108376] " amonakov at gcc dot gnu.org
  2023-01-12 10:34 ` rguenth at gcc dot gnu.org
@ 2023-01-14 22:30 ` hubicka at gcc dot gnu.org
  2024-02-09 13:47 ` rguenth at gcc dot gnu.org
  3 siblings, 0 replies; 5+ messages in thread
From: hubicka at gcc dot gnu.org @ 2023-01-14 22:30 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108376

--- Comment #3 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
If I make the arrays random then GCC code is indeed faster:
#include <math.h>
#include <malloc.h>

// Element type used by every array in this testcase.
typedef float real_t;
// Number of times main() repeats the inner kernel loop.
#define iterations 1000000
// Element count of the one-dimensional arrays a..e.
#define LEN_1D 32000
// Extent of each dimension of the square arrays aa, bb, cc.
#define LEN_2D 256
// Operands of the kernel loop in main(). No initializers are given, so as
// file-scope objects they start out zero-filled; main() then overwrites
// a[] and b[] before the kernel runs.
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];
// aa, bb and cc appear in main() only inside a commented-out dummy() call;
// qq is not referenced anywhere in this listing.
real_t aa[LEN_2D][LEN_2D];
real_t bb[LEN_2D][LEN_2D];
real_t cc[LEN_2D][LEN_2D];
real_t qq;
// Variant of the s1279 reproducer that fills a[] and b[] with pseudo-random
// values before running the doubly-guarded multiply-add `iterations` times.
// NOTE(review): the exact shape of these loops is what the compilers are
// being compared on, so the code is left untouched; only comments are added.
int
main(void)
{
//    reductions
//    if to max reduction

    // NOTE(review): x is never assigned anywhere in this function, yet it is
    // the value returned at the end, so the exit status is indeterminate.
    real_t x;
    // Give the two arrays tested by the kernel mixed-sign contents:
    // a[i] falls in -3..1 and b[i] in -3..2. rand() is called without any
    // srand() in this function.
    // NOTE(review): the listing includes only <math.h> and <malloc.h>, so
    // rand() is used without a declaration from <stdlib.h>; both compilers
    // emit an implicit-declaration warning for it.
    for (int i = 0; i < LEN_1D; i++)
    {
       a[i]=(rand() %5) - 3;
       b[i]=(rand() %6) - 3;
    }
    // Repeat the same pass over the arrays `iterations` times.
    for (int nl = 0; nl < iterations; nl++) {
        for (int i = 0; i < LEN_1D; i++) {
            // c[i] is updated only when a[i] is negative and b[i] exceeds
            // a[i].
            if (a[i] < (real_t)0.) {
                if (b[i] > a[i]) {
                    c[i] += d[i] * e[i];
                }
            }
        }
        // The call below is disabled; no dummy() is declared in this listing.
        //dummy(a, b, c, d, e, aa, bb, cc, 0.);
    }

    // Returns the never-assigned x (see the note at its declaration).
    return x;
}

jh@alberti:~/tsvc/bin> ~/aocc-compiler-4.0.0/bin/clang -Ofast s1279.c
-march=native
s1279.c:23:14: warning: implicit declaration of function 'rand' is invalid in
C99 [-Wimplicit-function-declaration]
       a[i]=(rand() %5) - 3;
             ^
1 warning generated.
jh@alberti:~/tsvc/bin> time ./a.out

real    0m5.638s
user    0m5.636s
sys     0m0.000s
jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast s1279.c -march=native
s1279.c: In function 'main':
s1279.c:23:14: warning: implicit declaration of function 'rand'
[-Wimplicit-function-declaration]
   23 |        a[i]=(rand() %5) - 3;
      |              ^~~~
jh@alberti:~/tsvc/bin> time ./a.out

real    0m2.791s
user    0m2.790s
sys     0m0.000s


Sorry for the wrong code; just for reference, the loop compiles as:
.L4:
        xorl    %eax, %eax
        .p2align 4
        .p2align 3
.L3:
        vmovaps a(%rax), %ymm2
        vmovaps b(%rax), %ymm3
        vmovaps c(%rax), %ymm6
        addq    $32, %rax
        vmovaps c-32(%rax), %ymm0
        vmovaps e-32(%rax), %ymm4
        vcmpps  $1, %ymm1, %ymm2, %k1
        vcmpps  $14, %ymm2, %ymm3, %k1{%k1}
        vfmadd231ps     d-32(%rax), %ymm4, %ymm0{%k1}
        vfmadd231ps     d-32(%rax), %ymm4, %ymm0
        vblendmps       %ymm0, %ymm6, %ymm0{%k1}
        vmovaps %ymm0, c-32(%rax)
        cmpq    $128000, %rax
        jne     .L3
        decl    %edx
        jne     .L4

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug middle-end/108376] TSVC s1279 runs 40% faster with aocc than gcc at zen4
  2023-01-11 18:56 [Bug middle-end/108376] New: TSVC s1279 runs 40% faster with aocc than gcc at zen4 hubicka at gcc dot gnu.org
                   ` (2 preceding siblings ...)
  2023-01-14 22:30 ` hubicka at gcc dot gnu.org
@ 2024-02-09 13:47 ` rguenth at gcc dot gnu.org
  3 siblings, 0 replies; 5+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-02-09 13:47 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108376

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
         Resolution|---                         |WONTFIX
             Status|NEW                         |RESOLVED

--- Comment #4 from Richard Biener <rguenth at gcc dot gnu.org> ---
So I'd say INVALID or WONTFIX.

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2024-02-09 13:47 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-01-11 18:56 [Bug middle-end/108376] New: TSVC s1279 runs 40% faster with aocc than gcc at zen4 hubicka at gcc dot gnu.org
2023-01-11 20:21 ` [Bug middle-end/108376] " amonakov at gcc dot gnu.org
2023-01-12 10:34 ` rguenth at gcc dot gnu.org
2023-01-14 22:30 ` hubicka at gcc dot gnu.org
2024-02-09 13:47 ` rguenth at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).