public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug middle-end/107719] New: 14% regression on TSVC s3113 on znve4 compared to GCC 7.5
@ 2022-11-16 17:30 hubicka at gcc dot gnu.org
  2022-11-16 17:42 ` [Bug middle-end/107719] " amonakov at gcc dot gnu.org
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: hubicka at gcc dot gnu.org @ 2022-11-16 17:30 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107719

            Bug ID: 107719
           Summary: 14% regression on TSVC s3113 on znve4 compared to GCC
                    7.5
           Product: gcc
           Version: 13.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hubicka at gcc dot gnu.org
  Target Milestone: ---

jh@alberti:~/tsvc/bin> cat tt5.c
#include <math.h>

// Element type of the benchmark arrays.
typedef double real_t;
// Base repetition count; main() runs iterations*4 outer passes.
#define iterations 100000
// Number of elements in each one-dimensional array.
#define LEN_1D 32000
// Not referenced by the code shown in this test case.
#define LEN_2D 256
// File-scope arrays with no initializer, hence zero-initialized.
// Only a[] is read by main(); b, c, d and e are unused in the code shown.
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];
// Running sum of the per-pass maxima computed in main(); being a global, the
// store to it is visible outside main() and so is kept in both listings.
real_t qq;
/*
 * Test case for the TSVC s3113 kernel named in the report title: a max
 * reduction over the absolute values of a[0..LEN_1D-1], repeated
 * iterations*4 times so that the run time is measurable.
 *
 * Returns the last computed maximum, implicitly converted from real_t to int.
 */
int
main(void)
{
//    reductions
//    maximum of absolute value


    real_t max;
    // Outer repetition loop: iterations*4 = 400000 passes over the array.
    for (int nl = 0; nl < iterations*4; nl++) {
        // Seed the reduction with |a[0]|.
        max = fabs(a[0]);
        // Inner reduction loop: keep the largest |a[i]| seen so far.
        for (int i = 0; i < LEN_1D; i++) {
            if ((fabs(a[i])) > max) {
                max = fabs(a[i]);
            }
        }
        // Accumulate each pass's result into the global qq.
        qq += max;
    }

    // Implicit double -> int conversion of the final maximum.
    return max;
}
jh@alberti:~/tsvc/bin> /home/jh/trunk-install/bin/gcc -Ofast -march=native tt5.c  ; perf stat ./a.out

 Performance counter stats for './a.out':

            913.92 msec task-clock:u              #    0.999 CPUs utilized      
                 0      context-switches:u        #    0.000 /sec               
                 0      cpu-migrations:u          #    0.000 /sec               
               108      page-faults:u             #  118.172 /sec               
     3,342,731,634      cycles:u                  #    3.658 GHz                      (83.37%)
            15,353      stalled-cycles-frontend:u #    0.00% frontend cycles idle     (83.37%)
            12,484      stalled-cycles-backend:u  #    0.00% backend cycles idle      (83.38%)
     7,989,930,772      instructions:u            #    2.39  insn per cycle
                                                  #    0.00  stalled cycles per insn  (83.37%)
     1,597,552,117      branches:u                #    1.748 G/sec                    (83.37%)
           401,094      branch-misses:u           #    0.03% of all branches          (83.13%)

       0.914933333 seconds time elapsed

       0.914630000 seconds user
       0.000000000 seconds sys


jh@alberti:~/tsvc/bin> gcc -Ofast -march=native tt5.c  ; perf stat ./a.out

 Performance counter stats for './a.out':

            880.97 msec task-clock:u              #    0.999 CPUs utilized      
                 0      context-switches:u        #    0.000 /sec               
                 0      cpu-migrations:u          #    0.000 /sec               
               110      page-faults:u             #  124.862 /sec               
     3,218,698,288      cycles:u                  #    3.654 GHz                      (83.21%)
            11,566      stalled-cycles-frontend:u #    0.00% frontend cycles idle     (83.21%)
            12,185      stalled-cycles-backend:u  #    0.00% backend cycles idle      (83.21%)
     7,989,544,164      instructions:u            #    2.48  insn per cycle
                                                  #    0.00  stalled cycles per insn  (83.48%)
     1,597,229,244      branches:u                #    1.813 G/sec                    (83.66%)
           401,157      branch-misses:u           #    0.03% of all branches          (83.23%)

       0.881919601 seconds time elapsed

       0.881627000 seconds user
       0.000000000 seconds sys


The difference is outside the measurement noise. GCC 7.5 generates:
# main as compiled by GCC 7.5 (AT&T syntax; the destination is the last operand).
main:
.LFB0:
        .cfi_startproc
        # xmm4 = a[0], xmm3 = running value of qq.
        vmovsd  a(%rip), %xmm4
        vmovsd  qq(%rip), %xmm3
        # ecx = outer pass counter (400000), edx = address one past the end of a[].
        movl    $400000, %ecx
        movl    $a+256000, %edx
        # zmm2 = full 64-byte constant loaded from .LC1 (not shown in the report;
        # presumably the sign-clearing mask used for fabs).
        vmovapd .LC1(%rip), %zmm2
        # xmm4 &= .LC0 (constant not shown; presumably the same kind of mask,
        # giving fabs(a[0])), then broadcast to all eight lanes of zmm4.
        vandps  .LC0(%rip), %xmm4, %xmm4
        vbroadcastsd    %xmm4, %zmm4
        .p2align 4,,15
# Outer loop: one pass over the array per iteration.
.L3:
        # rax = start of a[]; zmm0 = per-lane running maximum, seeded from zmm4.
        movl    $a, %eax
        vmovapd %zmm4, %zmm0
        .p2align 4,,15
# Inner loop: processes 64 bytes (eight doubles) per iteration.
.L2:
        # zmm1 = loaded elements AND mask; zmm0 = max(zmm0, zmm1) per lane.
        vandpd  (%rax), %zmm2, %zmm1
        addq    $64, %rax
        vmaxpd  %zmm1, %zmm0, %zmm0
        cmpq    %rax, %rdx
        jne     .L2
        # Horizontal max reduction of zmm0, done with 512-bit shuffles and
        # 512-bit vmaxpd at every step.
        vshufi32x4      $78, %zmm0, %zmm0, %zmm1
        # Decrement the outer counter here; the vector instructions below do not
        # write EFLAGS, so the later "jne .L3" tests this result.
        decl    %ecx
        vmaxpd  %zmm0, %zmm1, %zmm0
        vshufi32x4      $77, %zmm0, %zmm0, %zmm1
        vmaxpd  %zmm0, %zmm1, %zmm1
        vpshufd $254, %zmm1, %zmm0
        vmaxpd  %zmm1, %zmm0, %zmm0
        # qq accumulator += maximum of this pass (low element of xmm0).
        vaddsd  %xmm0, %xmm3, %xmm3
        jne     .L3
        # Store the accumulated sum back to qq and return (int)max in eax.
        vmovsd  %xmm3, qq(%rip)
        vcvttsd2si      %xmm0, %eax
        vzeroupper
        ret
        .cfi_endproc

while trunk generates:
# main as compiled by GCC trunk (AT&T syntax; the destination is the last operand).
main:
.LFB0:
        .cfi_startproc
        # xmm4 = a[0], xmm3 = running value of qq.
        vmovsd  a(%rip), %xmm4
        vmovsd  qq(%rip), %xmm3
        # ecx = outer pass counter (400000), edx = address one past the end of a[].
        movl    $400000, %ecx
        movl    $a+256000, %edx
        # xmm4 &= .LC0 (constant not shown in the report; presumably the
        # sign-clearing mask, giving fabs(a[0])).
        vandpd  .LC0(%rip), %xmm4, %xmm4
        # zmm2 = 8-byte constant .LC2 broadcast to all lanes (presumably the same
        # kind of mask), instead of a full 64-byte constant load.
        vbroadcastsd    .LC2(%rip), %zmm2
        vbroadcastsd    %xmm4, %zmm4
        .p2align 4
        .p2align 3
# Outer loop: one pass over the array per iteration.
.L3:
        # zmm0 = per-lane running maximum, seeded from zmm4; rax = start of a[].
        vmovapd %zmm4, %zmm0
        movl    $a, %eax
        .p2align 4
        .p2align 3
# Inner loop: processes 64 bytes (eight doubles) per iteration.
.L2:
        # zmm1 = loaded elements AND mask; zmm0 = max(zmm0, zmm1) per lane.
        vandpd  (%rax), %zmm2, %zmm1
        addq    $64, %rax
        vmaxpd  %zmm1, %zmm0, %zmm0
        cmpq    %rax, %rdx
        jne     .L2
        # Horizontal max reduction of zmm0, narrowing the vector width at each
        # step: 512 -> 256 -> 128 bits, then the two remaining doubles.
        vextractf64x4   $0x1, %zmm0, %ymm1
        # Decrement the outer counter here; the vector instructions below do not
        # write EFLAGS, so the later "jne .L3" tests this result.
        decl    %ecx
        vmaxpd  %ymm0, %ymm1, %ymm0
        vextractf64x2   $0x1, %ymm0, %xmm1
        vmaxpd  %xmm0, %xmm1, %xmm1
        vunpckhpd       %xmm1, %xmm1, %xmm0
        vmaxpd  %xmm1, %xmm0, %xmm0
        # qq accumulator += maximum of this pass (low element of xmm0).
        vaddsd  %xmm0, %xmm3, %xmm3
        jne     .L3
        # Store the accumulated sum back to qq and return (int)max in eax.
        vmovsd  %xmm3, qq(%rip)
        vcvttsd2sil     %xmm0, %eax
        vzeroupper
        ret
        .cfi_endproc
So there is no difference in the inner loop; the diff between the two listings is:
@@ -11,67 +11,82 @@
        vmovsd  qq(%rip), %xmm3
        movl    $400000, %ecx
        movl    $a+256000, %edx
-       vmovapd .LC1(%rip), %zmm2
-       vandps  .LC0(%rip), %xmm4, %xmm4
+       vandpd  .LC0(%rip), %xmm4, %xmm4
+       vbroadcastsd    .LC2(%rip), %zmm2
        vbroadcastsd    %xmm4, %zmm4
-       .p2align 4,,15
+       .p2align 4
+       .p2align 3
 .L3:
-       movl    $a, %eax
        vmovapd %zmm4, %zmm0
-       .p2align 4,,15
+       movl    $a, %eax
+       .p2align 4
+       .p2align 3
 .L2:
        vandpd  (%rax), %zmm2, %zmm1
        addq    $64, %rax
        vmaxpd  %zmm1, %zmm0, %zmm0
        cmpq    %rax, %rdx
        jne     .L2
-       vshufi32x4      $78, %zmm0, %zmm0, %zmm1
+       vextractf64x4   $0x1, %zmm0, %ymm1
        decl    %ecx
-       vmaxpd  %zmm0, %zmm1, %zmm0
-       vshufi32x4      $77, %zmm0, %zmm0, %zmm1
-       vmaxpd  %zmm0, %zmm1, %zmm1
-       vpshufd $254, %zmm1, %zmm0
-       vmaxpd  %zmm1, %zmm0, %zmm0
+       vmaxpd  %ymm0, %ymm1, %ymm0
+       vextractf64x2   $0x1, %ymm0, %xmm1
+       vmaxpd  %xmm0, %xmm1, %xmm1
+       vunpckhpd       %xmm1, %xmm1, %xmm0
+       vmaxpd  %xmm1, %xmm0, %xmm0
        vaddsd  %xmm0, %xmm3, %xmm3
        jne     .L3
        vmovsd  %xmm3, qq(%rip)
-       vcvttsd2si      %xmm0, %eax
+       vcvttsd2sil     %xmm0, %eax

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Bug middle-end/107719] 14% regression on TSVC s3113 on znve4 compared to GCC 7.5
  2022-11-16 17:30 [Bug middle-end/107719] New: 14% regression on TSVC s3113 on znve4 compared to GCC 7.5 hubicka at gcc dot gnu.org
@ 2022-11-16 17:42 ` amonakov at gcc dot gnu.org
  2023-01-13 10:22 ` rguenth at gcc dot gnu.org
  2023-07-19 11:12 ` rguenth at gcc dot gnu.org
  2 siblings, 0 replies; 4+ messages in thread
From: amonakov at gcc dot gnu.org @ 2022-11-16 17:42 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107719

Alexander Monakov <amonakov at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |amonakov at gcc dot gnu.org

--- Comment #1 from Alexander Monakov <amonakov at gcc dot gnu.org> ---
As you say, the inner loop is the same, and it iterates 32000 times. Most
likely it crosses an instruction fetch boundary differently; try
-falign-loops=32.

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Bug middle-end/107719] 14% regression on TSVC s3113 on znve4 compared to GCC 7.5
  2022-11-16 17:30 [Bug middle-end/107719] New: 14% regression on TSVC s3113 on znve4 compared to GCC 7.5 hubicka at gcc dot gnu.org
  2022-11-16 17:42 ` [Bug middle-end/107719] " amonakov at gcc dot gnu.org
@ 2023-01-13 10:22 ` rguenth at gcc dot gnu.org
  2023-07-19 11:12 ` rguenth at gcc dot gnu.org
  2 siblings, 0 replies; 4+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-01-13 10:22 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107719

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Target|                            |x86_64-*-*

--- Comment #2 from Richard Biener <rguenth at gcc dot gnu.org> ---
yep, probably not "real"

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Bug middle-end/107719] 14% regression on TSVC s3113 on znve4 compared to GCC 7.5
  2022-11-16 17:30 [Bug middle-end/107719] New: 14% regression on TSVC s3113 on znve4 compared to GCC 7.5 hubicka at gcc dot gnu.org
  2022-11-16 17:42 ` [Bug middle-end/107719] " amonakov at gcc dot gnu.org
  2023-01-13 10:22 ` rguenth at gcc dot gnu.org
@ 2023-07-19 11:12 ` rguenth at gcc dot gnu.org
  2 siblings, 0 replies; 4+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-07-19 11:12 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107719

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
         Resolution|---                         |INVALID
             Status|UNCONFIRMED                 |RESOLVED

--- Comment #3 from Richard Biener <rguenth at gcc dot gnu.org> ---
I can't reproduce the difference in runtime.  I also think you're running into
a misaligned inner loop here.

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2023-07-19 11:12 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-11-16 17:30 [Bug middle-end/107719] New: 14% regression on TSVC s3113 on znve4 compared to GCC 7.5 hubicka at gcc dot gnu.org
2022-11-16 17:42 ` [Bug middle-end/107719] " amonakov at gcc dot gnu.org
2023-01-13 10:22 ` rguenth at gcc dot gnu.org
2023-07-19 11:12 ` rguenth at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).