public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug tree-optimization/107715] New: TSVC s161 for double runs at zen4 30 times slower when vectorization is enabled
@ 2022-11-16 14:10 hubicka at gcc dot gnu.org
  2022-11-16 15:08 ` [Bug tree-optimization/107715] " rguenth at gcc dot gnu.org
                   ` (4 more replies)
  0 siblings, 5 replies; 6+ messages in thread
From: hubicka at gcc dot gnu.org @ 2022-11-16 14:10 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107715

            Bug ID: 107715
           Summary: TSVC s161 for double runs at zen4 30 times slower when
                    vectorization is enabled
           Product: gcc
           Version: 13.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hubicka at gcc dot gnu.org
  Target Milestone: ---

jh@alberti:~/tsvc/bin> more test.c
/* Element type used by the kernel; this report is about the double variant. */
typedef double real_t;
/* Repeat count; main() below runs the kernel iterations/2 times. */
#define iterations 100000
/* Number of elements in each one-dimensional array. */
#define LEN_1D 32000
/* Not referenced by the code in this test case. */
#define LEN_2D 256
/* File-scope arrays with no initializer: objects with static storage
   duration are zero-initialized before main() runs. */
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];
/* Reduced reproducer: runs the conditional kernel iterations/2 times over
   elements 0 .. LEN_1D-2 of the global arrays and returns 0.
   The statement shape is intentionally left untouched, since it is what
   the vectorizer is being exercised on. */
int
main()
{
    for (int nl = 0; nl < iterations/2; nl++) {
        /* Upper bound is LEN_1D-1 because the L20 path stores to c[i+1]. */
        for (int i = 0; i < LEN_1D-1; ++i) {
            /* Negative b[i] selects the L20 path, anything else the a[i] update. */
            if (b[i] < (real_t)0.) {
                goto L20;
            }
            /* Reads c[i], which the L20 path of the previous iteration
               may have written (as c[i+1]). */
            a[i] = c[i] + d[i] * e[i];
            goto L10;
L20:
            /* Store one element ahead of the current index. */
            c[i+1] = a[i] + d[i] * d[i];
L10:
            /* Common join point; the empty statement gives the label a target. */
            ;
        }
    }
    return 0;
}

jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=native test.c
-fno-tree-vectorize
jh@alberti:~/tsvc/bin> time ./a.out

real    0m1.170s
user    0m1.170s
sys     0m0.000s

jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=native test.c 
jh@alberti:~/tsvc/bin> time ./a.out

real    0m37.269s
user    0m37.258s
sys     0m0.004s


It is not quite clear to me why this happens. It seems that all the time is
spent in a single vmovapd:
       │ b0:┌─→vmovapd     0x6bc880(%rax),%zmm2
       │    │  vmovapd     0x63f880(%rax),%zmm0
  0.00 │    │  vcmpltpd    %zmm1,%zmm2,%k1     
       │    │  vmovapd     0x6fb080(%rax),%zmm2
       │    │  vfmadd132pd %zmm0,%zmm2,%zmm0   
       │    │  vmovapd     0x6bc880(%rax),%zmm2
       │    │  vmovupd     %zmm0,0x67e088(%rax){%k1}
 99.94 │    │  vmovapd     0x63f880(%rax),%zmm0
       │    │  add         $0x40,%rax          
       │    │  vcmpgepd    %zmm1,%zmm2,%k1     
       │    │  vmovapd     0x67e040(%rax),%zmm2
  0.02 │    │  vfmadd132pd 0x601040(%rax),%zmm2,%zmm0
  0.04 │    │  vmovapd     0x6fb040(%rax),%zmm2
  0.00 │    │  vblendmpd   %zmm0,%zmm2,%zmm0{%k1}
       │    │  vmovapd     %zmm0,0x6fb040(%rax)  
       │    │  cmp         $0x3e7c0,%rax         
       │    └──jne         b0   

Since I do not initialize the array in reduced testcase we always execute the
jump to L20.

Extending the testcase by array initialization:
/* Element type used by the kernel; this report is about the double variant. */
typedef double real_t;
/* Repeat count; main() below runs the kernel iterations/2 times. */
#define iterations 100000
/* Number of elements in each one-dimensional array. */
#define LEN_1D 32000
/* Not referenced by the code in this test case. */
#define LEN_2D 256
/* Global work arrays; filled by set_1d_array() at the start of main(). */
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];
/* Special negative values for the `stride` argument of set_1d_array()
   that select a fill pattern instead of a real stride. */
enum {SET1D_RECIP_IDX = -1, SET1D_RECIP_IDX_SQ = -2};
/*
 * Fill arr[0 .. length-1] according to `stride`:
 *   - SET1D_RECIP_IDX:    arr[i] = 1 / (i+1)
 *   - SET1D_RECIP_IDX_SQ: arr[i] = 1 / ((i+1) * (i+1))
 *   - anything else:      arr[i] = value for i = 0, stride, 2*stride, ...
 *
 * `value` is only used in the last mode.  Elements skipped by a stride
 * greater than 1 are left unmodified.
 *
 * NOTE(review): a stride of 0, or a negative stride other than the two
 * enum values, is not guarded against here; the final loop would then not
 * advance towards `length` -- confirm callers never pass such values.
 */
void set_1d_array(real_t * arr, int length, real_t value, int stride)
{
    if (stride == SET1D_RECIP_IDX) {
        for (int i = 0; i < length; i++) {
            arr[i] = 1. / (real_t) (i+1);
        }
    } else if (stride == SET1D_RECIP_IDX_SQ) {
        for (int i = 0; i < length; i++) {
            /* The square is computed in int before the conversion to real_t.
               NOTE(review): fine for LEN_1D = 32000; would overflow a 32-bit
               int for much larger lengths. */
            arr[i] = 1. / (real_t) ((i+1) * (i+1));
        }
    } else {
        for (int i = 0; i < length; i += stride) {
            arr[i] = value;
        }
    }
}
/* Extended reproducer: sets every element of a, b, c, d and e to 1.0 and
   then runs the same conditional kernel iterations/2 times over elements
   0 .. LEN_1D-2.  Returns 0. */
int
main()
{
    /* value = 1., stride = 1: every element of each array becomes 1.0. */
    set_1d_array(a, LEN_1D, 1.,1);
    set_1d_array(b, LEN_1D, 1.,1);
    set_1d_array(c, LEN_1D, 1.,1);
    set_1d_array(d, LEN_1D, 1.,1);
    set_1d_array(e, LEN_1D, 1.,1);
    for (int nl = 0; nl < iterations/2; nl++) {
        /* Upper bound is LEN_1D-1 because the L20 path stores to c[i+1]. */
        for (int i = 0; i < LEN_1D-1; ++i) {
            /* b is never written in the loop, so it stays at the 1.0 set
               above and this comparison is false for every element. */
            if (b[i] < (real_t)0.) {
                goto L20;
            }
            a[i] = c[i] + d[i] * e[i];
            goto L10;
L20:
            /* Store one element ahead of the current index. */
            c[i+1] = a[i] + d[i] * d[i];
L10:
            /* Common join point; the empty statement gives the label a target. */
            ;
        }
    }
    return 0;
}
jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=native test.c 
-fno-tree-vectorize
jh@alberti:~/tsvc/bin> time ./a.out

real    0m0.910s
user    0m0.910s
sys     0m0.000s
jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=native test.c  
jh@alberti:~/tsvc/bin> time ./a.out

real    0m1.866s
user    0m1.866s
sys     0m0.000s
jh@alberti:~/tsvc/bin> 

This still shows about a 2x regression when vectorization is enabled.

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2022-11-21 10:02 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-11-16 14:10 [Bug tree-optimization/107715] New: TSVC s161 for double runs at zen4 30 times slower when vectorization is enabled hubicka at gcc dot gnu.org
2022-11-16 15:08 ` [Bug tree-optimization/107715] " rguenth at gcc dot gnu.org
2022-11-16 15:28 ` hubicka at ucw dot cz
2022-11-16 15:35 ` amonakov at gcc dot gnu.org
2022-11-16 17:20 ` [Bug tree-optimization/107715] TSVC s161 and s277 " hubicka at gcc dot gnu.org
2022-11-21 10:02 ` marxin at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).