public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
From: "hubicka at gcc dot gnu.org" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug middle-end/107719] New: 14% regression on TSVC s3113 on znve4 compared to GCC 7.5
Date: Wed, 16 Nov 2022 17:30:09 +0000	[thread overview]
Message-ID: <bug-107719-4@http.gcc.gnu.org/bugzilla/> (raw)

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107719

            Bug ID: 107719
           Summary: 14% regression on TSVC s3113 on znve4 compared to GCC
                    7.5
           Product: gcc
           Version: 13.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hubicka at gcc dot gnu.org
  Target Milestone: ---

jh@alberti:~/tsvc/bin> cat tt5.c
#include <math.h>

// Element type of the benchmark arrays.
typedef double real_t;
// Base repetition count; main() repeats the kernel iterations*4 times.
#define iterations 100000
// Number of elements in each one-dimensional array.
#define LEN_1D 32000
// Not referenced by the code shown in this test case.
#define LEN_2D 256
// File-scope arrays; of these, only a[] is read by main().
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];
// Running sum of the per-repetition maxima; updated by main().
real_t qq;
// Reduced TSVC s3113 kernel: repeatedly computes the maximum absolute value
// over a[] and adds it to the global qq.
// Returns the last computed maximum, implicitly converted from real_t to int.
int
main(void)
{
//    reductions
//    maximum of absolute value


    real_t max;
    // Repeat the whole reduction iterations*4 times.
    for (int nl = 0; nl < iterations*4; nl++) {
        // Seed the running maximum with |a[0]|.
        max = fabs(a[0]);
        // Scan all LEN_1D elements; index 0 is visited again, which cannot
        // change max because the comparison below is strict.
        for (int i = 0; i < LEN_1D; i++) {
            if ((fabs(a[i])) > max) {
                max = fabs(a[i]);
            }
        }
        // Fold the result into a global; presumably this keeps each
        // repetition's result live for the optimizer -- TODO confirm intent.
        qq += max;
    }

    return max;
}
jh@alberti:~/tsvc/bin> /home/jh/trunk-install/bin/gcc -Ofast -march=native tt5.c  ; perf stat ./a.out

 Performance counter stats for './a.out':

            913.92 msec task-clock:u              #    0.999 CPUs utilized      
                 0      context-switches:u        #    0.000 /sec               
                 0      cpu-migrations:u          #    0.000 /sec               
               108      page-faults:u             #  118.172 /sec               
     3,342,731,634      cycles:u                  #    3.658 GHz                      (83.37%)
            15,353      stalled-cycles-frontend:u #    0.00% frontend cycles idle     (83.37%)
            12,484      stalled-cycles-backend:u  #    0.00% backend cycles idle      (83.38%)
     7,989,930,772      instructions:u            #    2.39  insn per cycle
                                                  #    0.00  stalled cycles per insn  (83.37%)
     1,597,552,117      branches:u                #    1.748 G/sec                    (83.37%)
           401,094      branch-misses:u           #    0.03% of all branches          (83.13%)

       0.914933333 seconds time elapsed

       0.914630000 seconds user
       0.000000000 seconds sys


jh@alberti:~/tsvc/bin> gcc -Ofast -march=native tt5.c  ; perf stat ./a.out

 Performance counter stats for './a.out':

            880.97 msec task-clock:u              #    0.999 CPUs utilized      
                 0      context-switches:u        #    0.000 /sec               
                 0      cpu-migrations:u          #    0.000 /sec               
               110      page-faults:u             #  124.862 /sec               
     3,218,698,288      cycles:u                  #    3.654 GHz                      (83.21%)
            11,566      stalled-cycles-frontend:u #    0.00% frontend cycles idle     (83.21%)
            12,185      stalled-cycles-backend:u  #    0.00% backend cycles idle      (83.21%)
     7,989,544,164      instructions:u            #    2.48  insn per cycle
                                                  #    0.00  stalled cycles per insn  (83.48%)
     1,597,229,244      branches:u                #    1.813 G/sec                    (83.66%)
           401,157      branch-misses:u           #    0.03% of all branches          (83.23%)

       0.881919601 seconds time elapsed

       0.881627000 seconds user
       0.000000000 seconds sys


The difference is outside of measurement noise. GCC 7.5 generates:
main:
.LFB0:
        .cfi_startproc
        vmovsd  a(%rip), %xmm4
        vmovsd  qq(%rip), %xmm3
        movl    $400000, %ecx
        movl    $a+256000, %edx
        vmovapd .LC1(%rip), %zmm2
        vandps  .LC0(%rip), %xmm4, %xmm4
        vbroadcastsd    %xmm4, %zmm4
        .p2align 4,,15
.L3:
        movl    $a, %eax
        vmovapd %zmm4, %zmm0
        .p2align 4,,15
.L2:
        vandpd  (%rax), %zmm2, %zmm1
        addq    $64, %rax
        vmaxpd  %zmm1, %zmm0, %zmm0
        cmpq    %rax, %rdx
        jne     .L2
        vshufi32x4      $78, %zmm0, %zmm0, %zmm1
        decl    %ecx
        vmaxpd  %zmm0, %zmm1, %zmm0
        vshufi32x4      $77, %zmm0, %zmm0, %zmm1
        vmaxpd  %zmm0, %zmm1, %zmm1
        vpshufd $254, %zmm1, %zmm0
        vmaxpd  %zmm1, %zmm0, %zmm0
        vaddsd  %xmm0, %xmm3, %xmm3
        jne     .L3
        vmovsd  %xmm3, qq(%rip)
        vcvttsd2si      %xmm0, %eax
        vzeroupper
        ret
        .cfi_endproc

while trunk generates:
main:
.LFB0:
        .cfi_startproc
        vmovsd  a(%rip), %xmm4
        vmovsd  qq(%rip), %xmm3
        movl    $400000, %ecx
        movl    $a+256000, %edx
        vandpd  .LC0(%rip), %xmm4, %xmm4
        vbroadcastsd    .LC2(%rip), %zmm2
        vbroadcastsd    %xmm4, %zmm4
        .p2align 4
        .p2align 3
.L3:
        vmovapd %zmm4, %zmm0
        movl    $a, %eax
        .p2align 4
        .p2align 3
.L2:
        vandpd  (%rax), %zmm2, %zmm1
        addq    $64, %rax
        vmaxpd  %zmm1, %zmm0, %zmm0
        cmpq    %rax, %rdx
        jne     .L2
        vextractf64x4   $0x1, %zmm0, %ymm1
        decl    %ecx
        vmaxpd  %ymm0, %ymm1, %ymm0
        vextractf64x2   $0x1, %ymm0, %xmm1
        vmaxpd  %xmm0, %xmm1, %xmm1
        vunpckhpd       %xmm1, %xmm1, %xmm0
        vmaxpd  %xmm1, %xmm0, %xmm0
        vaddsd  %xmm0, %xmm3, %xmm3
        jne     .L3
        vmovsd  %xmm3, qq(%rip)
        vcvttsd2sil     %xmm0, %eax
        vzeroupper
        ret
        .cfi_endproc
So there is no difference in the inner loop (.L2); the differences are all outside it:
@@ -11,67 +11,82 @@
        vmovsd  qq(%rip), %xmm3
        movl    $400000, %ecx
        movl    $a+256000, %edx
-       vmovapd .LC1(%rip), %zmm2
-       vandps  .LC0(%rip), %xmm4, %xmm4
+       vandpd  .LC0(%rip), %xmm4, %xmm4
+       vbroadcastsd    .LC2(%rip), %zmm2
        vbroadcastsd    %xmm4, %zmm4
-       .p2align 4,,15
+       .p2align 4
+       .p2align 3
 .L3:
-       movl    $a, %eax
        vmovapd %zmm4, %zmm0
-       .p2align 4,,15
+       movl    $a, %eax
+       .p2align 4
+       .p2align 3
 .L2:
        vandpd  (%rax), %zmm2, %zmm1
        addq    $64, %rax
        vmaxpd  %zmm1, %zmm0, %zmm0
        cmpq    %rax, %rdx
        jne     .L2
-       vshufi32x4      $78, %zmm0, %zmm0, %zmm1
+       vextractf64x4   $0x1, %zmm0, %ymm1
        decl    %ecx
-       vmaxpd  %zmm0, %zmm1, %zmm0
-       vshufi32x4      $77, %zmm0, %zmm0, %zmm1
-       vmaxpd  %zmm0, %zmm1, %zmm1
-       vpshufd $254, %zmm1, %zmm0
-       vmaxpd  %zmm1, %zmm0, %zmm0
+       vmaxpd  %ymm0, %ymm1, %ymm0
+       vextractf64x2   $0x1, %ymm0, %xmm1
+       vmaxpd  %xmm0, %xmm1, %xmm1
+       vunpckhpd       %xmm1, %xmm1, %xmm0
+       vmaxpd  %xmm1, %xmm0, %xmm0
        vaddsd  %xmm0, %xmm3, %xmm3
        jne     .L3
        vmovsd  %xmm3, qq(%rip)
-       vcvttsd2si      %xmm0, %eax
+       vcvttsd2sil     %xmm0, %eax

             reply	other threads:[~2022-11-16 17:30 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-11-16 17:30 hubicka at gcc dot gnu.org [this message]
2022-11-16 17:42 ` [Bug middle-end/107719] " amonakov at gcc dot gnu.org
2023-01-13 10:22 ` rguenth at gcc dot gnu.org
2023-07-19 11:12 ` rguenth at gcc dot gnu.org

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=bug-107719-4@http.gcc.gnu.org/bugzilla/ \
    --to=gcc-bugzilla@gcc.gnu.org \
    --cc=gcc-bugs@gcc.gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).