[Bug middle-end/99414] New: s235 benchmark of TSVC is vectorized better by icc than gcc (loop interchange)

public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed

* [Bug middle-end/99414] New: s235 benchmark of TSVC is vectorized better by icc than gcc (loop interchange)
@ 2021-03-05 15:40 hubicka at gcc dot gnu.org
  2021-03-08  8:38 ` [Bug tree-optimization/99414] " rguenth at gcc dot gnu.org
                   ` (5 more replies)
  0 siblings, 6 replies; 7+ messages in thread
From: hubicka at gcc dot gnu.org @ 2021-03-05 15:40 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99414

            Bug ID: 99414
           Summary: s235 benchmark of TSVC is vectorized better by icc
                    than gcc (loop interchange)
           Product: gcc
           Version: 11.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hubicka at gcc dot gnu.org
  Target Milestone: ---

typedef float real_t;
#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D],
aa[LEN_2D][LEN_2D],bb[LEN_2D][LEN_2D],cc[LEN_2D][LEN_2D],tt[LEN_2D][LEN_2D];
// %2.3
real_t main(struct args_t * func_args)
{

//    loop interchanging
//    imperfectly nested loops

    for (int nl = 0; nl < 200*(iterations/LEN_2D); nl++) {
        for (int i = 0; i < LEN_2D; i++) {
            a[i] += b[i] * c[i];
            for (int j = 1; j < LEN_2D; j++) {
                aa[j][i] = aa[j-1][i] + bb[j][i] * a[i];
            }
        }
    }
}


runs about 10 times faster on zen3 built by icc -O3 -ip -Ofast -g
-march=core-avx2 -mtune=core-avx2 -vec s235.c

main:
# parameter 1: %rdi
..B1.1:                         # Preds ..B1.0
                                # Execution count [1.77e+00]
        .cfi_startproc
..___tag_value_main.1:
..L2:
                                                          #9.1
        pushq     %rbp                                          #9.1
        .cfi_def_cfa_offset 16
        movq      %rsp, %rbp                                    #9.1
        .cfi_def_cfa 6, 16
        .cfi_offset 6, -16
        andq      $-128, %rsp                                   #9.1
        subq      $128, %rsp                                    #9.1
        movl      $3, %edi                                      #9.1
        xorl      %esi, %esi                                    #9.1
        call      __intel_new_feature_proc_init                 #9.1
                                # LOE rbx r12 r13 r14 r15
..B1.12:                        # Preds ..B1.1
                                # Execution count [1.77e+00]
        vstmxcsr  (%rsp)                                        #9.1
        xorl      %eax, %eax                                    #14.5
        orl       $32832, (%rsp)                                #9.1
        vldmxcsr  (%rsp)                                        #9.1
                                # LOE rbx r12 r13 r14 r15 eax
..B1.2:                         # Preds ..B1.8 ..B1.12
                                # Execution count [7.83e+04]
        xorl      %edx, %edx                                    #15.9
                                # LOE rdx rbx r12 r13 r14 r15 eax
..B1.3:                         # Preds ..B1.3 ..B1.2
                                # Execution count [2.00e+07]
        vmovups   b(,%rdx,4), %ymm1                             #16.21
        lea       (,%rdx,4), %rcx                               #16.13
        vmovups   32+b(,%rdx,4), %ymm3                          #16.21
        vmovups   64+b(,%rdx,4), %ymm5                          #16.21
        vmovups   96+b(,%rdx,4), %ymm7                          #16.21
        vmovups   128+b(,%rdx,4), %ymm9                         #16.21
        vmovups   160+b(,%rdx,4), %ymm11                        #16.21
        vmovups   192+b(,%rdx,4), %ymm13                        #16.21
        vmovups   224+b(,%rdx,4), %ymm15                        #16.21
        vmovups   c(,%rdx,4), %ymm0                             #16.28
        vmovups   32+c(,%rdx,4), %ymm2                          #16.28
        vmovups   64+c(,%rdx,4), %ymm4                          #16.28
        vmovups   96+c(,%rdx,4), %ymm6                          #16.28
        vmovups   128+c(,%rdx,4), %ymm8                         #16.28
        vmovups   160+c(,%rdx,4), %ymm10                        #16.28
        vmovups   192+c(,%rdx,4), %ymm12                        #16.28
        vmovups   224+c(,%rdx,4), %ymm14                        #16.28
        vfmadd213ps a(,%rdx,4), %ymm0, %ymm1                    #16.13
        vfmadd213ps 32+a(,%rdx,4), %ymm2, %ymm3                 #16.13
        vfmadd213ps 64+a(,%rdx,4), %ymm4, %ymm5                 #16.13
        vfmadd213ps 96+a(,%rdx,4), %ymm6, %ymm7                 #16.13
        vfmadd213ps 128+a(,%rdx,4), %ymm8, %ymm9                #16.13
        vfmadd213ps 160+a(,%rdx,4), %ymm10, %ymm11              #16.13
        vfmadd213ps 192+a(,%rdx,4), %ymm12, %ymm13              #16.13
        vfmadd213ps 224+a(,%rdx,4), %ymm14, %ymm15              #16.13
        vmovups   %ymm1, a(%rcx)                                #16.13
        vmovups   %ymm3, 32+a(%rcx)                             #16.13
        vmovups   %ymm5, 64+a(%rcx)                             #16.13
        vmovups   %ymm7, 96+a(%rcx)                             #16.13
        vmovups   %ymm9, 128+a(%rcx)                            #16.13
        vmovups   %ymm11, 160+a(%rcx)                           #16.13
        vmovups   %ymm13, 192+a(%rcx)                           #16.13
        vmovups   %ymm15, 224+a(%rcx)                           #16.13
        addq      $64, %rdx                                     #15.9
        cmpq      $256, %rdx                                    #15.9
        jb        ..B1.3        # Prob 99%                      #15.9
                                # LOE rdx rbx r12 r13 r14 r15 eax
..B1.4:                         # Preds ..B1.3
                                # Execution count [7.83e+04]
        xorl      %ecx, %ecx                                    #17.13
        xorl      %edx, %edx                                    #17.13
                                # LOE rdx rbx r12 r13 r14 r15 eax ecx
..B1.5:                         # Preds ..B1.7 ..B1.4
                                # Execution count [2.00e+07]
        xorl      %esi, %esi                                    #15.9
                                # LOE rdx rbx rsi r12 r13 r14 r15 eax ecx
..B1.6:                         # Preds ..B1.6 ..B1.5
                                # Execution count [5.11e+09]
        vmovups   a(,%rsi,4), %ymm1                             #18.52
        lea       (%rdx,%rsi,4), %rdi                           #18.17
       vmovups   32+a(,%rsi,4), %ymm3                          #18.52
        vmovups   64+a(,%rsi,4), %ymm5                          #18.52
        vmovups   96+a(,%rsi,4), %ymm7                          #18.52
        vmovups   128+a(,%rsi,4), %ymm9                         #18.52
        vmovups   160+a(,%rsi,4), %ymm11                        #18.52
        vmovups   192+a(,%rsi,4), %ymm13                        #18.52
        vmovups   224+a(,%rsi,4), %ymm15                        #18.52
        vmovups   1024+bb(%rdx,%rsi,4), %ymm0                   #18.41
        vmovups   1056+bb(%rdx,%rsi,4), %ymm2                   #18.41
        vmovups   1088+bb(%rdx,%rsi,4), %ymm4                   #18.41
        vmovups   1120+bb(%rdx,%rsi,4), %ymm6                   #18.41
        vmovups   1152+bb(%rdx,%rsi,4), %ymm8                   #18.41
        vmovups   1184+bb(%rdx,%rsi,4), %ymm10                  #18.41
        vmovups   1216+bb(%rdx,%rsi,4), %ymm12                  #18.41
        vmovups   1248+bb(%rdx,%rsi,4), %ymm14                  #18.41
        vfmadd213ps aa(%rdx,%rsi,4), %ymm0, %ymm1               #18.52
        vfmadd213ps 32+aa(%rdx,%rsi,4), %ymm2, %ymm3            #18.52
        vfmadd213ps 64+aa(%rdx,%rsi,4), %ymm4, %ymm5            #18.52
        vfmadd213ps 96+aa(%rdx,%rsi,4), %ymm6, %ymm7            #18.52
        vfmadd213ps 128+aa(%rdx,%rsi,4), %ymm8, %ymm9           #18.52
        vfmadd213ps 160+aa(%rdx,%rsi,4), %ymm10, %ymm11         #18.52
        vfmadd213ps 192+aa(%rdx,%rsi,4), %ymm12, %ymm13         #18.52
        vfmadd213ps 224+aa(%rdx,%rsi,4), %ymm14, %ymm15         #18.52
        vmovups   %ymm1, 1024+aa(%rdi)                          #18.17
        vmovups   %ymm3, 1056+aa(%rdi)                          #18.17
        vmovups   %ymm5, 1088+aa(%rdi)                          #18.17
        vmovups   %ymm7, 1120+aa(%rdi)                          #18.17
        vmovups   %ymm9, 1152+aa(%rdi)                          #18.17
        vmovups   %ymm11, 1184+aa(%rdi)                         #18.17
        vmovups   %ymm13, 1216+aa(%rdi)                         #18.17
        vmovups   %ymm15, 1248+aa(%rdi)                         #18.17
        addq      $64, %rsi                                     #15.9
        cmpq      $256, %rsi                                    #15.9
        jb        ..B1.6        # Prob 99%                      #15.9
                                # LOE rdx rbx rsi r12 r13 r14 r15 eax ecx
..B1.7:                         # Preds ..B1.6
                                # Execution count [2.00e+07]
        incl      %ecx                                          #17.13
        addq      $1024, %rdx                                   #17.13
        cmpl      $255, %ecx                                    #17.13
        jb        ..B1.5        # Prob 99%                      #17.13
                                # LOE rdx rbx r12 r13 r14 r15 eax ecx
..B1.8:                         # Preds ..B1.7
                                # Execution count [7.83e+04]
        incl      %eax                                          #14.5
        cmpl      $78000, %eax                                  #14.5
        jb        ..B1.2        # Prob 99%                      #14.5
                                # LOE rbx r12 r13 r14 r15 eax
..B1.9:                         # Preds ..B1.8
                                # Execution count [1.00e+00]
        vzeroupper                                              #22.1
        xorl      %eax, %eax                                    #22.1
        movq      %rbp, %rsp                                    #22.1
        popq      %rbp                                          #22.1
        .cfi_def_cfa 7, 8
        .cfi_restore 6
        ret                                                     #22.1

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [Bug tree-optimization/99414] s235 benchmark of TSVC is vectorized better by icc than gcc (loop interchange)
  2021-03-05 15:40 [Bug middle-end/99414] New: s235 benchmark of TSVC is vectorized better by icc than gcc (loop interchange) hubicka at gcc dot gnu.org
@ 2021-03-08  8:38 ` rguenth at gcc dot gnu.org
  2021-03-17 18:30 ` [Bug tree-optimization/99414] s235 and s233 benchmarks " hubicka at gcc dot gnu.org
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: rguenth at gcc dot gnu.org @ 2021-03-08  8:38 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99414

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |amker at gcc dot gnu.org,
                   |                            |rguenth at gcc dot gnu.org
             Blocks|                            |53947
             Status|UNCONFIRMED                 |NEW
   Last reconfirmed|                            |2021-03-08
     Ever confirmed|0                           |1
          Component|middle-end                  |tree-optimization
           Keywords|                            |missed-optimization

--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
linterchange says:

Consider loop interchange for loop_nest<1 - 3>
Access Strides for DRs:
  a[i_33]:              <0,     4,      0>
  b[i_33]:              <0,     4,      0>
  c[i_33]:              <0,     4,      0>
  a[i_33]:              <0,     4,      0>
  aa[_6][i_33]:         <0,     4,      1024>
  bb[j_34][i_33]:               <0,     4,      1024>
  aa[j_34][i_33]:               <0,     4,      1024>

Loop(3) carried vars:
  Induction:  j_34 = {1, 1}_3
  Induction:  ivtmp_53 = {255, 4294967295}_3

Loop(2) carried vars:
  Induction:  i_33 = {0, 1}_2
  Induction:  ivtmp_51 = {256, 4294967295}_2

and then doesn't do anything.

I suppose the best thing to do here is to first distribute the loop nest,
but our cost modeling fuses the two obvious candidates:

Fuse partitions because they have shared memory refs:
  Part 1: 0, 1, 2, 3, 4, 5, 6, 7, 19, 20, 21
  Part 2: 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21

so this is a case that asks for better cost modeling there.


Referenced Bugs:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53947
[Bug 53947] [meta-bug] vectorizer missed-optimizations

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [Bug tree-optimization/99414] s235 and s233 benchmarks of TSVC is vectorized better by icc than gcc (loop interchange)
  2021-03-05 15:40 [Bug middle-end/99414] New: s235 benchmark of TSVC is vectorized better by icc than gcc (loop interchange) hubicka at gcc dot gnu.org
  2021-03-08  8:38 ` [Bug tree-optimization/99414] " rguenth at gcc dot gnu.org
@ 2021-03-17 18:30 ` hubicka at gcc dot gnu.org
  2021-03-17 18:32 ` [Bug tree-optimization/99414] s235, s2233 " hubicka at gcc dot gnu.org
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: hubicka at gcc dot gnu.org @ 2021-03-17 18:30 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99414

Jan Hubicka <hubicka at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
            Summary|s235 benchmark of TSVC is   |s235 and s233 benchmarks of
                   |vectorized better by icc    |TSVC is vectorized better
                   |than gcc (loop interchange) |by icc than gcc (loop
                   |                            |interchange)

--- Comment #2 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
another testcase

typedef float real_t;

#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
// array definitions

real_t
aa[LEN_2D][LEN_2D],bb[LEN_2D][LEN_2D],cc[LEN_2D][LEN_2D],tt[LEN_2D][LEN_2D];

int main(struct args_t * func_args)
{
//    loop interchange
//    interchanging with one of two inner loops

    for (int nl = 0; nl < 100*(iterations/LEN_2D); nl++) {
        for (int i = 1; i < LEN_2D; i++) {
            for (int j = 1; j < LEN_2D; j++) {
                aa[j][i] = aa[j-1][i] + cc[j][i];
            }
            for (int j = 1; j < LEN_2D; j++) {
                bb[j][i] = bb[j][i-1] + cc[j][i];
            }
        }
        dummy();
    }

   return aa[0][0];
}

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [Bug tree-optimization/99414] s235, s2233 and s233 benchmarks of TSVC is vectorized better by icc than gcc (loop interchange)
  2021-03-05 15:40 [Bug middle-end/99414] New: s235 benchmark of TSVC is vectorized better by icc than gcc (loop interchange) hubicka at gcc dot gnu.org
  2021-03-08  8:38 ` [Bug tree-optimization/99414] " rguenth at gcc dot gnu.org
  2021-03-17 18:30 ` [Bug tree-optimization/99414] s235 and s233 benchmarks " hubicka at gcc dot gnu.org
@ 2021-03-17 18:32 ` hubicka at gcc dot gnu.org
  2021-03-17 18:42 ` [Bug tree-optimization/99414] s235, s2233, s275 " hubicka at gcc dot gnu.org
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: hubicka at gcc dot gnu.org @ 2021-03-17 18:32 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99414

Jan Hubicka <hubicka at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
            Summary|s235 and s233 benchmarks of |s235, s2233 and s233
                   |TSVC is vectorized better   |benchmarks of TSVC is
                   |by icc than gcc (loop       |vectorized better by icc
                   |interchange)                |than gcc (loop interchange)

--- Comment #3 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
this one is 7s with gcc and 0.4s with icc.

typedef float real_t;

#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
// array definitions

real_t
aa[LEN_2D][LEN_2D],bb[LEN_2D][LEN_2D],cc[LEN_2D][LEN_2D],tt[LEN_2D][LEN_2D];

int main(struct args_t * func_args)
{
//    loop interchange
//    interchanging with one of two inner loops

    for (int nl = 0; nl < 100*(iterations/LEN_2D); nl++) {
        for (int i = 1; i < LEN_2D; i++) {
            for (int j = 1; j < LEN_2D; j++) {
                aa[j][i] = aa[j-1][i] + cc[j][i];
            }
            for (int j = 1; j < LEN_2D; j++) {
                bb[i][j] = bb[i-1][j] + cc[i][j];
            }
        }
        dummy();
    }

   return aa[0][0];
}

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [Bug tree-optimization/99414] s235, s2233, s275 and s233 benchmarks of TSVC is vectorized better by icc than gcc (loop interchange)
  2021-03-05 15:40 [Bug middle-end/99414] New: s235 benchmark of TSVC is vectorized better by icc than gcc (loop interchange) hubicka at gcc dot gnu.org
                   ` (2 preceding siblings ...)
  2021-03-17 18:32 ` [Bug tree-optimization/99414] s235, s2233 " hubicka at gcc dot gnu.org
@ 2021-03-17 18:42 ` hubicka at gcc dot gnu.org
  2021-03-17 18:44 ` [Bug tree-optimization/99414] s235, s2233, s275, s2275 " hubicka at gcc dot gnu.org
  2021-03-18  8:26 ` rguenth at gcc dot gnu.org
  5 siblings, 0 replies; 7+ messages in thread
From: hubicka at gcc dot gnu.org @ 2021-03-17 18:42 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99414

Jan Hubicka <hubicka at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
            Summary|s235, s2233 and s233        |s235, s2233, s275 and s233
                   |benchmarks of TSVC is       |benchmarks of TSVC is
                   |vectorized better by icc    |vectorized better by icc
                   |than gcc (loop interchange) |than gcc (loop interchange)

--- Comment #4 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
s275:
typedef float real_t;

#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
// array definitions

real_t
a[LEN_2D],d[LEN_2D],aa[LEN_2D][LEN_2D],bb[LEN_2D][LEN_2D],cc[LEN_2D][LEN_2D],tt[LEN_2D][LEN_2D];

int main(struct args_t * func_args)
{
//    control flow
//    if around inner loop, interchanging needed

    for (int i = 0; i < LEN_2D; i++) 
                aa[0][i]=1;

    for (int nl = 0; nl < 10*(iterations/LEN_2D); nl++) {
        for (int i = 0; i < LEN_2D; i++) {
            if (aa[0][i] > (real_t)0.) {
                for (int j = 1; j < LEN_2D; j++) {
                    aa[j][i] = aa[j-1][i] + bb[j][i] * cc[j][i];
                }
            }
        }
        dummy();
    }
   return aa[0][0];
}

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [Bug tree-optimization/99414] s235, s2233, s275, s2275 and s233 benchmarks of TSVC is vectorized better by icc than gcc (loop interchange)
  2021-03-05 15:40 [Bug middle-end/99414] New: s235 benchmark of TSVC is vectorized better by icc than gcc (loop interchange) hubicka at gcc dot gnu.org
                   ` (3 preceding siblings ...)
  2021-03-17 18:42 ` [Bug tree-optimization/99414] s235, s2233, s275 " hubicka at gcc dot gnu.org
@ 2021-03-17 18:44 ` hubicka at gcc dot gnu.org
  2021-03-18  8:26 ` rguenth at gcc dot gnu.org
  5 siblings, 0 replies; 7+ messages in thread
From: hubicka at gcc dot gnu.org @ 2021-03-17 18:44 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99414

Jan Hubicka <hubicka at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
            Summary|s235, s2233, s275 and s233  |s235, s2233, s275, s2275
                   |benchmarks of TSVC is       |and s233 benchmarks of TSVC
                   |vectorized better by icc    |is vectorized better by icc
                   |than gcc (loop interchange) |than gcc (loop interchange)

--- Comment #5 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
s2275:
typedef float real_t;

#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
// array definitions

real_t
a[LEN_2D],b[LEN_2D],c[LEN_2D],d[LEN_2D],aa[LEN_2D][LEN_2D],bb[LEN_2D][LEN_2D],cc[LEN_2D][LEN_2D],tt[LEN_2D][LEN_2D];

int main(struct args_t * func_args)
{
//    loop distribution is needed to be able to interchange

    for (int nl = 0; nl < 100*(iterations/LEN_2D); nl++) {
        for (int i = 0; i < LEN_2D; i++) {
            for (int j = 0; j < LEN_2D; j++) {
                aa[j][i] = aa[j][i] + bb[j][i] * cc[j][i];
            }
            a[i] = b[i] + c[i] * d[i];
        }
        dummy();
    }
   return aa[0][0];
}

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [Bug tree-optimization/99414] s235, s2233, s275, s2275 and s233 benchmarks of TSVC is vectorized better by icc than gcc (loop interchange)
  2021-03-05 15:40 [Bug middle-end/99414] New: s235 benchmark of TSVC is vectorized better by icc than gcc (loop interchange) hubicka at gcc dot gnu.org
                   ` (4 preceding siblings ...)
  2021-03-17 18:44 ` [Bug tree-optimization/99414] s235, s2233, s275, s2275 " hubicka at gcc dot gnu.org
@ 2021-03-18  8:26 ` rguenth at gcc dot gnu.org
  5 siblings, 0 replies; 7+ messages in thread
From: rguenth at gcc dot gnu.org @ 2021-03-18  8:26 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99414

--- Comment #6 from Richard Biener <rguenth at gcc dot gnu.org> ---
(In reply to Jan Hubicka from comment #4)
> s275:
> typedef float real_t;
> 
> #define iterations 100000
> #define LEN_1D 32000
> #define LEN_2D 256
> // array definitions
> 
> real_t
> a[LEN_2D],d[LEN_2D],aa[LEN_2D][LEN_2D],bb[LEN_2D][LEN_2D],cc[LEN_2D][LEN_2D],
> tt[LEN_2D][LEN_2D];
> 
> int main(struct args_t * func_args)
> {
> //    control flow
> //    if around inner loop, interchanging needed
> 
>     for (int i = 0; i < LEN_2D; i++) 
>                 aa[0][i]=1;
> 
>     for (int nl = 0; nl < 10*(iterations/LEN_2D); nl++) {
>         for (int i = 0; i < LEN_2D; i++) {
>             if (aa[0][i] > (real_t)0.) {
>                 for (int j = 1; j < LEN_2D; j++) {
>                     aa[j][i] = aa[j-1][i] + bb[j][i] * cc[j][i];
>                 }
>             }
>         }
>         dummy();
>     }
>    return aa[0][0];
> }

This would need to be supported by interchange itself - it's basically
un-unswitching the condition, moving it inside the innermost loop and then
performing interchange.  Not sure how difficult it would be to handle this,
but it looks like a candidate for diagnostic aka 'note: interchanging loops
will improve performance' aka "fix your code".

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2021-03-18  8:26 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-05 15:40 [Bug middle-end/99414] New: s235 benchmark of TSVC is vectorized better by icc than gcc (loop interchange) hubicka at gcc dot gnu.org
2021-03-08  8:38 ` [Bug tree-optimization/99414] " rguenth at gcc dot gnu.org
2021-03-17 18:30 ` [Bug tree-optimization/99414] s235 and s233 benchmarks " hubicka at gcc dot gnu.org
2021-03-17 18:32 ` [Bug tree-optimization/99414] s235, s2233 " hubicka at gcc dot gnu.org
2021-03-17 18:42 ` [Bug tree-optimization/99414] s235, s2233, s275 " hubicka at gcc dot gnu.org
2021-03-17 18:44 ` [Bug tree-optimization/99414] s235, s2233, s275, s2275 " hubicka at gcc dot gnu.org
2021-03-18  8:26 ` rguenth at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).