[Bug middle-end/99411] New: s311 benchmark of TSVC is vectorized by clang better than by gcc

public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed

* [Bug middle-end/99411] New: s311 benchmark of TSVC is vectorized by clang better than by gcc
@ 2021-03-05 14:30 hubicka at gcc dot gnu.org
  2021-03-05 14:41 ` [Bug middle-end/99411] s311 and s31111 " hubicka at gcc dot gnu.org
                   ` (7 more replies)
  0 siblings, 8 replies; 9+ messages in thread
From: hubicka at gcc dot gnu.org @ 2021-03-05 14:30 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99411

            Bug ID: 99411
           Summary: s311 benchmark of TSVC is vectorized by clang better
                    than by gcc
           Product: gcc
           Version: 11.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hubicka at gcc dot gnu.org
  Target Milestone: ---

// Scalar element type used by the reproducer.
typedef float real_t;

// Repetition base count, 1-D array length, and 2-D array dimension.
// LEN_2D is not referenced in this snippet.
#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
// File-scope array with no initializer, so every element starts at zero.
real_t a[LEN_1D];

// Repeats a float sum reduction over a[] iterations*10 times and returns
// 1 if the final sum is greater than 4, 0 otherwise.
int main()
{

//    reductions
//    sum reduction

    real_t sum;
    // The outer loop only repeats the same work (presumably for timing);
    // sum is reset on every pass, so only the last pass decides the result.
    for (int nl = 0; nl < iterations*10; nl++) {
        sum = (real_t)0.;
        // Inner loop: the sum reduction this report is about.
        for (int i = 0; i < LEN_1D; i++) {
            sum += a[i];
        }
    }
  // The return value depends on sum, so the reduction result is used.
  return sum > 4;
}

We produce with -O2 -march=znver2

.L2:
        movl    $a, %eax
        vxorps  %xmm0, %xmm0, %xmm0
        .p2align 4
        .p2align 3
.L3:
        vaddps  (%rax), %ymm0, %ymm0
        addq    $32, %rax
        cmpq    $a+128000, %rax
        jne     .L3
        vextractf128    $0x1, %ymm0, %xmm1
        decl    %edx
        vaddps  %xmm0, %xmm1, %xmm1
        vmovhlps        %xmm1, %xmm1, %xmm0
        vaddps  %xmm1, %xmm0, %xmm0
        vshufps $85, %xmm0, %xmm0, %xmm1
        vaddps  %xmm0, %xmm1, %xmm0
        jne     .L2
        xorl    %eax, %eax
        vcomiss .LC0(%rip), %xmm0
        seta    %al
        vzeroupper
        ret
        .cfi_endproc


clang does:
# clang output for the sum-reduction reproducer: the inner loop keeps four
# independent 256-bit accumulators (%ymm0-%ymm3) and consumes 128 bytes of
# a[] per iteration.
main:                                   # @main
        .cfi_startproc
# %bb.0:
        # %eax is the outer-loop counter, starting at 0.
        xorl    %eax, %eax
        .p2align        4, 0x90
.LBB0_1:                                # =>This Loop Header: Depth=1
                                        #     Child Loop BB0_2 Depth 2
        # Clear the four accumulators and set %rcx to -128000; the loads
        # below add 128000 back, so %rcx counts up towards zero.
        vxorps  %xmm0, %xmm0, %xmm0
        movq    $-128000, %rcx                  # imm = 0xFFFE0C00
        vxorps  %xmm1, %xmm1, %xmm1
        vxorps  %xmm2, %xmm2, %xmm2
        vxorps  %xmm3, %xmm3, %xmm3
        .p2align        4, 0x90
.LBB0_2:                                #   Parent Loop BB0_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
        # Four vector adds from consecutive 32-byte chunks, one per
        # accumulator.
        vaddps  a+128000(%rcx), %ymm0, %ymm0
        vaddps  a+128032(%rcx), %ymm1, %ymm1
        vaddps  a+128064(%rcx), %ymm2, %ymm2
        vaddps  a+128096(%rcx), %ymm3, %ymm3
        # Subtracting -128 advances %rcx by 128; the loop ends when it
        # reaches zero.
        subq    $-128, %rcx
        jne     .LBB0_2
# %bb.3:                                #   in Loop: Header=BB0_1 Depth=1
        # Outer loop: repeat until %eax reaches 1000000.
        incl    %eax
        cmpl    $1000000, %eax                  # imm = 0xF4240
        jne     .LBB0_1
# %bb.4:
        # Merge the four accumulators into %ymm0, then reduce it
        # horizontally: upper 128-bit half onto lower, then the upper pair
        # of lanes onto the lower pair, then lane 1 onto lane 0.
        vaddps  %ymm0, %ymm1, %ymm0
        xorl    %eax, %eax
        vaddps  %ymm0, %ymm2, %ymm0
        vaddps  %ymm0, %ymm3, %ymm0
        vextractf128    $1, %ymm0, %xmm1
        vaddps  %xmm1, %xmm0, %xmm0
        vpermilpd       $1, %xmm0, %xmm1        # xmm1 = xmm0[1,0]
        vaddps  %xmm1, %xmm0, %xmm0
        vmovshdup       %xmm0, %xmm1            # xmm1 = xmm0[1,1,3,3]
        vaddss  %xmm1, %xmm0, %xmm0
        # Compare the scalar sum with the constant at .LCPI0_0 and set the
        # return value to 1 when the sum is above it.
        vucomiss        .LCPI0_0(%rip), %xmm0
        seta    %al
        vzeroupper
        retq

On zen3 hardware the gcc version runs in 2.4s, while clang's runs in 0.8s.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Bug middle-end/99411] s311 and s31111 benchmark of TSVC is vectorized by clang better than by gcc
  2021-03-05 14:30 [Bug middle-end/99411] New: s311 benchmark of TSVC is vectorized by clang better than by gcc hubicka at gcc dot gnu.org
@ 2021-03-05 14:41 ` hubicka at gcc dot gnu.org
  2021-03-05 14:43 ` [Bug middle-end/99411] s311, s312 " hubicka at gcc dot gnu.org
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: hubicka at gcc dot gnu.org @ 2021-03-05 14:41 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99411

Jan Hubicka <hubicka at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
            Summary|s311 benchmark of TSVC is   |s311 and s31111 benchmark
                   |vectorized by clang better  |of TSVC is vectorized by
                   |than by gcc                 |clang better than by gcc

--- Comment #1 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
I think this is the same case:

// Scalar element type used by the reproducer.
typedef float real_t;

// Repetition base count, 1-D array length, and 2-D array dimension.
// LEN_2D is not referenced in this snippet.
#define iterations 1000000
#define LEN_1D 32000
#define LEN_2D 256
// File-scope array with no initializer, so every element starts at zero.
real_t a[LEN_1D];
// Returns the sum of the four consecutive elements A[0]..A[3].
// A must point to at least four readable real_t values.
real_t test(real_t* A){
  real_t s = (real_t)0.0;
  for (int i = 0; i < 4; i++)
    s += A[i];
  return s;
}

// Sums a[0]..a[31] as eight 4-element partial sums computed by test(),
// repeats that 2000*iterations times, and returns 1 if the final sum is
// greater than 4, 0 otherwise.
int main()
{

//    reductions
//    sum reduction
    real_t sum;
    // sum is reset on every pass, so only the last pass decides the result.
    for (int nl = 0; nl < 2000*iterations; nl++) {
        sum = (real_t)0.;
        // Eight calls covering consecutive 4-element groups of a[].
        sum += test(a);
        sum += test(&a[4]);
        sum += test(&a[8]);
        sum += test(&a[12]);
        sum += test(&a[16]);
        sum += test(&a[20]);
        sum += test(&a[24]);
        sum += test(&a[28]);
    }
  return sum>4;
}

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Bug middle-end/99411] s311, s312 and s31111 benchmark of TSVC is vectorized by clang better than by gcc
  2021-03-05 14:30 [Bug middle-end/99411] New: s311 benchmark of TSVC is vectorized by clang better than by gcc hubicka at gcc dot gnu.org
  2021-03-05 14:41 ` [Bug middle-end/99411] s311 and s31111 " hubicka at gcc dot gnu.org
@ 2021-03-05 14:43 ` hubicka at gcc dot gnu.org
  2021-03-05 14:46 ` [Bug middle-end/99411] s311, s312, s31111 " hubicka at gcc dot gnu.org
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: hubicka at gcc dot gnu.org @ 2021-03-05 14:43 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99411

Jan Hubicka <hubicka at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
            Summary|s311 and s31111 benchmark   |s311, s312 and s31111
                   |of TSVC is vectorized by    |benchmark of TSVC is
                   |clang better than by gcc    |vectorized by clang better
                   |                            |than by gcc

--- Comment #2 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
another one:
// %3.1
// Scalar element type used by the reproducer.
typedef float real_t;

// Repetition base count, 1-D array length, and 2-D array dimension.
// LEN_2D is not referenced in this snippet.
#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
// File-scope array with no initializer, so every element starts at zero.
real_t a[LEN_1D];

// Repeats a float product reduction over a[] 10*iterations times and
// returns 1 if the final product is greater than 0, 0 otherwise.
int main ()
{

//    reductions
//    product reduction

    real_t prod;
    // prod is reset to 1 on every pass, so only the last pass decides the
    // result.
    for (int nl = 0; nl < 10*iterations; nl++) {
        prod = (real_t)1.;
        // Inner loop: multiply every element of a[] into prod.
        for (int i = 0; i < LEN_1D; i++) {
            prod *= a[i];
        }
    }
    return prod > 0;
}

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Bug middle-end/99411] s311, s312, s31111 and s31111 benchmark of TSVC is vectorized by clang better than by gcc
  2021-03-05 14:30 [Bug middle-end/99411] New: s311 benchmark of TSVC is vectorized by clang better than by gcc hubicka at gcc dot gnu.org
  2021-03-05 14:41 ` [Bug middle-end/99411] s311 and s31111 " hubicka at gcc dot gnu.org
  2021-03-05 14:43 ` [Bug middle-end/99411] s311, s312 " hubicka at gcc dot gnu.org
@ 2021-03-05 14:46 ` hubicka at gcc dot gnu.org
  2021-03-05 14:49 ` [Bug middle-end/99411] s311, s312, s31111 and s31111, s3110 " hubicka at gcc dot gnu.org
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: hubicka at gcc dot gnu.org @ 2021-03-05 14:46 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99411

Jan Hubicka <hubicka at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
            Summary|s311, s312 and s31111       |s311, s312, s31111 and
                   |benchmark of TSVC is        |s31111 benchmark of TSVC is
                   |vectorized by clang better  |vectorized by clang better
                   |than by gcc                 |than by gcc

--- Comment #3 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
and yet another one
// Scalar element type used by the reproducer.
typedef float real_t;

// Repetition base count, 1-D array length, and 2-D array dimension.
// LEN_2D is not referenced in this snippet.
#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
// File-scope array with no initializer, so every element starts at zero.
real_t a[LEN_1D];
// Repeats a conditional sum (only elements greater than zero are added)
// iterations/2 times and returns 1 if the final sum is greater than 4,
// 0 otherwise.
int main()
{

//    reductions
//    conditional sum reduction

    real_t sum;
    // sum is reset on every pass, so only the last pass decides the result.
    for (int nl = 0; nl < iterations/2; nl++) {
        sum = 0.;
        for (int i = 0; i < LEN_1D; i++) {
            // Only strictly positive elements contribute to the sum.
            if (a[i] > (real_t)0.) {
                sum += a[i];
            }
        }
    }
   return sum > 4;
}

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Bug middle-end/99411] s311, s312, s31111 and s31111, s3110 benchmark of TSVC is vectorized by clang better than by gcc
  2021-03-05 14:30 [Bug middle-end/99411] New: s311 benchmark of TSVC is vectorized by clang better than by gcc hubicka at gcc dot gnu.org
                   ` (2 preceding siblings ...)
  2021-03-05 14:46 ` [Bug middle-end/99411] s311, s312, s31111 " hubicka at gcc dot gnu.org
@ 2021-03-05 14:49 ` hubicka at gcc dot gnu.org
  2021-03-05 15:03 ` [Bug middle-end/99411] s311, s312, s31111, s31111, s3110, vsumr " hubicka at gcc dot gnu.org
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: hubicka at gcc dot gnu.org @ 2021-03-05 14:49 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99411

Jan Hubicka <hubicka at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
            Summary|s311, s312, s31111 and      |s311, s312, s31111 and
                   |s31111 benchmark of TSVC is |s31111, s3110 benchmark of
                   |vectorized by clang better  |TSVC is vectorized by clang
                   |than by gcc                 |better than by gcc

--- Comment #4 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
// Scalar element type used by the reproducer.
typedef float real_t;

// Repetition base count, 1-D array length, and 2-D array dimension.
#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
// File-scope arrays with no initializer, so every element starts at zero.
// a[] is declared but not referenced by main() in this snippet.
real_t a[LEN_1D];
real_t aa[LEN_2D][LEN_2D];
// Repeatedly searches aa[][] for its maximum element and the row/column
// index where that maximum was first seen, then returns a value derived
// from the maximum and both indices.
int main()
{

//    reductions
//    if to max with index reduction, 2 dimensions
//    similar to S315

    int xindex, yindex;
    real_t max, chksum;
    // With the macros above the bound is 100*(100000/256) = 39000 passes.
    for (int nl = 0; nl < 100*(iterations/(LEN_2D)); nl++) {
        // Restart the search from element [0][0] on every pass.
        max = aa[(0)][0];
        xindex = 0;
        yindex = 0;
        for (int i = 0; i < LEN_2D; i++) {
            for (int j = 0; j < LEN_2D; j++) {
                // Strict '>' keeps the first occurrence of the maximum.
                if (aa[i][j] > max) {
                    max = aa[i][j];
                    xindex = i;
                    yindex = j;
                }
            }
        }
        // chksum is written here but never read in this snippet.
        chksum = max + (real_t) xindex + (real_t) yindex;
    }
    // The float expression is converted to int for the return value.
    return max + xindex+1 + yindex+1;
}

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Bug middle-end/99411] s311, s312, s31111, s31111, s3110, vsumr benchmark of TSVC is vectorized by clang better than by gcc
  2021-03-05 14:30 [Bug middle-end/99411] New: s311 benchmark of TSVC is vectorized by clang better than by gcc hubicka at gcc dot gnu.org
                   ` (3 preceding siblings ...)
  2021-03-05 14:49 ` [Bug middle-end/99411] s311, s312, s31111 and s31111, s3110 " hubicka at gcc dot gnu.org
@ 2021-03-05 15:03 ` hubicka at gcc dot gnu.org
  2021-03-08  8:24 ` [Bug tree-optimization/99411] " rguenth at gcc dot gnu.org
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: hubicka at gcc dot gnu.org @ 2021-03-05 15:03 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99411

Jan Hubicka <hubicka at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
            Summary|s311, s312, s31111 and      |s311, s312, s31111, s31111,
                   |s31111, s3110 benchmark of  |s3110, vsumr benchmark of
                   |TSVC is vectorized by clang |TSVC is vectorized by clang
                   |better than by gcc          |better than by gcc

--- Comment #5 from Jan Hubicka <hubicka at gcc dot gnu.org> ---

// Scalar element type used by the reproducer.
typedef float real_t;

// Repetition base count, 1-D array length, and 2-D array dimension.
// LEN_2D is not referenced in this snippet.
#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
// File-scope array with no initializer, so every element starts at zero.
real_t a[LEN_1D];
// Repeats a float sum reduction over a[] iterations*10 times and returns
// the final sum converted to int.
int main()
{

//    control loops
//    vector sum reduction

    real_t sum;
    // sum is reset on every pass, so only the last pass decides the result.
    for (int nl = 0; nl < iterations*10; nl++) {
        sum = 0.;
        for (int i = 0; i < LEN_1D; i++) {
            sum += a[i];
        }
    }

    // Implicit float-to-int conversion of the reduction result.
    return sum;
}

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Bug tree-optimization/99411] s311, s312, s31111, s31111, s3110, vsumr benchmark of TSVC is vectorized by clang better than by gcc
  2021-03-05 14:30 [Bug middle-end/99411] New: s311 benchmark of TSVC is vectorized by clang better than by gcc hubicka at gcc dot gnu.org
                   ` (4 preceding siblings ...)
  2021-03-05 15:03 ` [Bug middle-end/99411] s311, s312, s31111, s31111, s3110, vsumr " hubicka at gcc dot gnu.org
@ 2021-03-08  8:24 ` rguenth at gcc dot gnu.org
  2022-11-16 17:16 ` hubicka at gcc dot gnu.org
  2023-01-11 22:36 ` hubicka at gcc dot gnu.org
  7 siblings, 0 replies; 9+ messages in thread
From: rguenth at gcc dot gnu.org @ 2021-03-08  8:24 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99411

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Blocks|                            |53947
           Keywords|                            |missed-optimization
          Component|middle-end                  |tree-optimization

--- Comment #6 from Richard Biener <rguenth at gcc dot gnu.org> ---
So clang uses a larger VF (unroll of the vectorized loop) here.  I think we
have another PR about this.


Referenced Bugs:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53947
[Bug 53947] [meta-bug] vectorizer missed-optimizations

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Bug tree-optimization/99411] s311, s312, s31111, s31111, s3110, vsumr benchmark of TSVC is vectorized by clang better than by gcc
  2021-03-05 14:30 [Bug middle-end/99411] New: s311 benchmark of TSVC is vectorized by clang better than by gcc hubicka at gcc dot gnu.org
                   ` (5 preceding siblings ...)
  2021-03-08  8:24 ` [Bug tree-optimization/99411] " rguenth at gcc dot gnu.org
@ 2022-11-16 17:16 ` hubicka at gcc dot gnu.org
  2023-01-11 22:36 ` hubicka at gcc dot gnu.org
  7 siblings, 0 replies; 9+ messages in thread
From: hubicka at gcc dot gnu.org @ 2022-11-16 17:16 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99411

--- Comment #7 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
On znver4, with current trunk and clang15, I still see this problem (clang's
code is about 60% faster) for s311, s312 and s3111.
Curiously, s31111 and s3110 no longer show a regression.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Bug tree-optimization/99411] s311, s312, s31111, s31111, s3110, vsumr benchmark of TSVC is vectorized by clang better than by gcc
  2021-03-05 14:30 [Bug middle-end/99411] New: s311 benchmark of TSVC is vectorized by clang better than by gcc hubicka at gcc dot gnu.org
                   ` (6 preceding siblings ...)
  2022-11-16 17:16 ` hubicka at gcc dot gnu.org
@ 2023-01-11 22:36 ` hubicka at gcc dot gnu.org
  7 siblings, 0 replies; 9+ messages in thread
From: hubicka at gcc dot gnu.org @ 2023-01-11 22:36 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99411

--- Comment #8 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
Compared to aocc we also do worse on zen4:
jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=native s311.c      
jh@alberti:~/tsvc/bin> time ./a.out

real    0m3.207s
user    0m3.206s
sys     0m0.000s
jh@alberti:~/tsvc/bin> ~/aocc-compiler-4.0.0/bin/clang -Ofast -march=native
s311.c 
jh@alberti:~/tsvc/bin> time ./a.out

real    0m1.221s
user    0m1.221s
sys     0m0.000s

The aocc code seems similar to clang's from two years ago, except for the
additional use of AVX-512.

# aocc output for the sum-reduction reproducer: the inner loop keeps four
# independent 512-bit accumulators (%zmm0-%zmm3) and consumes 256 bytes of
# a[] per iteration.
main:                                   # @main
        .cfi_startproc
# %bb.0:                                # %entry
        # %eax is the outer-loop counter, starting at 0.
        xorl    %eax, %eax
        .p2align        4, 0x90
.LBB0_1:                                # %vector.ph
                                        # =>This Loop Header: Depth=1
                                        #     Child Loop BB0_2 Depth 2
        # Clear the four accumulators and set %rcx to -128000; the loads
        # below add 128000 back, so %rcx counts up towards zero.
        vxorps  %xmm0, %xmm0, %xmm0
        movq    $-128000, %rcx                  # imm = 0xFFFE0C00
        vxorps  %xmm1, %xmm1, %xmm1
        vxorps  %xmm2, %xmm2, %xmm2
        vxorps  %xmm3, %xmm3, %xmm3
        .p2align        4, 0x90
.LBB0_2:                                # %vector.body
                                        #   Parent Loop BB0_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
        # Four vector adds from consecutive 64-byte chunks, one per
        # accumulator.
        vaddps  a+128000(%rcx), %zmm0, %zmm0
        vaddps  a+128064(%rcx), %zmm1, %zmm1
        vaddps  a+128128(%rcx), %zmm2, %zmm2
        vaddps  a+128192(%rcx), %zmm3, %zmm3
        # Advance by 256 bytes; the loop ends when %rcx reaches zero.
        addq    $256, %rcx                      # imm = 0x100
        jne     .LBB0_2
# %bb.3:                                # %middle.block
                                        #   in Loop: Header=BB0_1 Depth=1
        # Outer loop: repeat until %eax reaches 1000000.
        incl    %eax
        cmpl    $1000000, %eax                  # imm = 0xF4240
        jne     .LBB0_1
# %bb.4:                                # %for.cond.cleanup
        # Merge the four accumulators into %zmm0, then reduce it
        # horizontally: upper 256 bits onto lower, upper 128 onto lower,
        # the upper pair of lanes onto the lower pair, then lane 1 onto
        # lane 0.
        vaddps  %zmm0, %zmm1, %zmm0
        xorl    %eax, %eax
        vaddps  %zmm0, %zmm2, %zmm0
        vaddps  %zmm0, %zmm3, %zmm0
        vextractf64x4   $1, %zmm0, %ymm1
        vaddps  %zmm1, %zmm0, %zmm0
        vextractf128    $1, %ymm0, %xmm1
        vaddps  %xmm1, %xmm0, %xmm0
        vpermilpd       $1, %xmm0, %xmm1        # xmm1 = xmm0[1,0]
        vaddps  %xmm1, %xmm0, %xmm0
        vmovshdup       %xmm0, %xmm1            # xmm1 = xmm0[1,1,3,3]
        vaddss  %xmm1, %xmm0, %xmm0
        # Compare the scalar sum with the constant at .LCPI0_0 and set the
        # return value to 1 when the sum is above it.
        vucomiss        .LCPI0_0(%rip), %xmm0
        seta    %al
        vzeroupper
        retq

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2023-01-11 22:36 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-05 14:30 [Bug middle-end/99411] New: s311 benchmark of TSVC is vectorized by clang better than by gcc hubicka at gcc dot gnu.org
2021-03-05 14:41 ` [Bug middle-end/99411] s311 and s31111 " hubicka at gcc dot gnu.org
2021-03-05 14:43 ` [Bug middle-end/99411] s311, s312 " hubicka at gcc dot gnu.org
2021-03-05 14:46 ` [Bug middle-end/99411] s311, s312, s31111 " hubicka at gcc dot gnu.org
2021-03-05 14:49 ` [Bug middle-end/99411] s311, s312, s31111 and s31111, s3110 " hubicka at gcc dot gnu.org
2021-03-05 15:03 ` [Bug middle-end/99411] s311, s312, s31111, s31111, s3110, vsumr " hubicka at gcc dot gnu.org
2021-03-08  8:24 ` [Bug tree-optimization/99411] " rguenth at gcc dot gnu.org
2022-11-16 17:16 ` hubicka at gcc dot gnu.org
2023-01-11 22:36 ` hubicka at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).