public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug c/112331] New: middle-end: Fail vectorization
@ 2023-11-01  9:20 juzhe.zhong at rivai dot ai
  2023-11-01  9:24 ` [Bug c/112331] " juzhe.zhong at rivai dot ai
                   ` (4 more replies)
  0 siblings, 5 replies; 6+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2023-11-01  9:20 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112331

            Bug ID: 112331
           Summary: middle-end: Fail vectorization
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
          Assignee: unassigned at gcc dot gnu.org
          Reporter: juzhe.zhong at rivai dot ai
  Target Milestone: ---

https://gcc.godbolt.org/z/x7GGzezGh


#include <stdio.h>
// Configuration macros used by the declarations below:
//   LEN / lll  - element count of the 1-D arrays (lll expands to LEN)
//   LEN2       - extent of each dimension of the 2-D arrays
//   ntimes     - repeat count; the outer loop in foo() runs 2*ntimes times
//   TYPE       - element type of all arrays (float in this test case)
//   ALIGNMENT  - value passed to the aligned attribute
#define LEN 32000
#define ntimes 200000
#define TYPE float
#define lll LEN
#define LEN2 256
#define ALIGNMENT 16
// File-scope arrays of lll elements each, declared with the
// aligned(ALIGNMENT) attribute; foo() below does not reference them.
__attribute__ ((aligned(ALIGNMENT))) TYPE X[lll],Y[lll],Z[lll],U[lll],V[lll];

// Backing storage for the benchmark arrays: five 1-D arrays of LEN elements
// (a..e) and four LEN2 x LEN2 2-D arrays (aa, bb, cc, tt).  Every array
// member carries the aligned(ALIGNMENT) attribute, and consecutive arrays
// are separated by int padding members of differing lengths.  The single
// instance global_data is defined together with the type.
struct GlobalData {
  __attribute__((aligned(ALIGNMENT))) TYPE a[LEN];
  int pad1[3];
  __attribute__((aligned(ALIGNMENT))) TYPE b[LEN];
  int pad2[5];
  __attribute__((aligned(ALIGNMENT))) TYPE c[LEN];
  int pad3[7];
  __attribute__((aligned(ALIGNMENT))) TYPE d[LEN];
  int pad4[11];
  __attribute__((aligned(ALIGNMENT))) TYPE e[LEN];

  int pad5[13];
  __attribute__((aligned(ALIGNMENT))) TYPE aa[LEN2][LEN2];
  int pad6[17];
  __attribute__((aligned(ALIGNMENT))) TYPE bb[LEN2][LEN2];
  int pad7[19];
  __attribute__((aligned(ALIGNMENT))) TYPE cc[LEN2][LEN2];
  int pad8[23];
  __attribute__((aligned(ALIGNMENT))) TYPE tt[LEN2][LEN2];
} global_data;

// Const pointers to the array members of global_data, one per member, so
// code can name the arrays directly (a, b, ..., tt).  The 1-D aliases are
// TYPE * const; the 2-D aliases are pointers to rows of LEN2 elements.
__attribute__((aligned(ALIGNMENT))) TYPE * const a = global_data.a;
__attribute__((aligned(ALIGNMENT))) TYPE * const b = global_data.b;
__attribute__((aligned(ALIGNMENT))) TYPE * const c = global_data.c;
__attribute__((aligned(ALIGNMENT))) TYPE * const d = global_data.d;
__attribute__((aligned(ALIGNMENT))) TYPE * const e = global_data.e;
__attribute__((aligned(ALIGNMENT))) TYPE (* const aa)[LEN2] = global_data.aa;
__attribute__((aligned(ALIGNMENT))) TYPE (* const bb)[LEN2] = global_data.bb;
__attribute__((aligned(ALIGNMENT))) TYPE (* const cc)[LEN2] = global_data.cc;
__attribute__((aligned(ALIGNMENT))) TYPE (* const tt)[LEN2] = global_data.tt;

// Reproducer kernel.  Repeats, 2*ntimes times, a loop over the odd indices
// i = 1, 3, 5, ... < LEN that stores a[i - 1] + b[i] into a[i].  Because i is
// always odd, the loads from a hit only even elements and the stores hit only
// odd elements, so no iteration reads a value written by another iteration.
// Nothing else happens between outer iterations.  Always returns 0.
int foo()
{

//      linear dependence testing
//      no dependence - vectorizable


        // Outer repetition loop: nl is not used inside the body.
        for (int nl = 0; nl < 2*ntimes; nl++) {
//              #pragma vector always
                // Stride-2 loop starting at 1, so i - 1 is never negative.
                for (int i = 1; i < LEN; i += 2) {
                        a[i] = a[i - 1] + b[i];
                }
        }

        return 0;
}

Both RVV and ARM SVE failed to vectorize it, whereas Clang can vectorize it.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Bug c/112331] middle-end: Fail vectorization
  2023-11-01  9:20 [Bug c/112331] New: middle-end: Fail vectorization juzhe.zhong at rivai dot ai
@ 2023-11-01  9:24 ` juzhe.zhong at rivai dot ai
  2023-11-02  7:42 ` [Bug tree-optimization/112331] " juzhe.zhong at rivai dot ai
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2023-11-01  9:24 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112331

--- Comment #1 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
I suspect it is an SRA issue again?

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Bug tree-optimization/112331] middle-end: Fail vectorization
  2023-11-01  9:20 [Bug c/112331] New: middle-end: Fail vectorization juzhe.zhong at rivai dot ai
  2023-11-01  9:24 ` [Bug c/112331] " juzhe.zhong at rivai dot ai
@ 2023-11-02  7:42 ` juzhe.zhong at rivai dot ai
  2023-11-02 10:00 ` [Bug tree-optimization/112331] Fail vectorization after loop interchange rguenth at gcc dot gnu.org
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2023-11-02  7:42 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112331

--- Comment #2 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
Reduced case:


#include <stdio.h>
// Configuration macros used by the declarations below:
//   LEN / lll  - element count of the 1-D arrays (lll expands to LEN)
//   LEN2       - extent of each dimension of the 2-D arrays
//   ntimes     - repeat count; the outer loop in s111() runs 2*ntimes times
//   TYPE       - element type of all arrays (int in this reduced case)
//   ALIGNMENT  - value passed to the aligned attribute
#define LEN 32000
#define ntimes 200000
#define TYPE int
#define lll LEN
#define LEN2 256
#define ALIGNMENT 16
// File-scope arrays of lll elements each, declared with the
// aligned(ALIGNMENT) attribute; s111() below reads Y and writes X.
__attribute__ ((aligned(ALIGNMENT))) TYPE X[lll],Y[lll],Z[lll],U[lll],V[lll];

// Backing storage carried over into the reduced case: five 1-D arrays of LEN
// elements (a..e) and four LEN2 x LEN2 2-D arrays (aa, bb, cc, tt), each with
// the aligned(ALIGNMENT) attribute and separated by int padding members of
// differing lengths.  s111() below does not reference global_data.
struct GlobalData {
  __attribute__((aligned(ALIGNMENT))) TYPE a[LEN];
  int pad1[3];
  __attribute__((aligned(ALIGNMENT))) TYPE b[LEN];
  int pad2[5];
  __attribute__((aligned(ALIGNMENT))) TYPE c[LEN];
  int pad3[7];
  __attribute__((aligned(ALIGNMENT))) TYPE d[LEN];
  int pad4[11];
  __attribute__((aligned(ALIGNMENT))) TYPE e[LEN];

  int pad5[13];
  __attribute__((aligned(ALIGNMENT))) TYPE aa[LEN2][LEN2];
  int pad6[17];
  __attribute__((aligned(ALIGNMENT))) TYPE bb[LEN2][LEN2];
  int pad7[19];
  __attribute__((aligned(ALIGNMENT))) TYPE cc[LEN2][LEN2];
  int pad8[23];
  __attribute__((aligned(ALIGNMENT))) TYPE tt[LEN2][LEN2];
} global_data;

// Const pointers to the array members of global_data, one per member.
// None of these aliases is used by s111() below, which works on X and Y.
__attribute__((aligned(ALIGNMENT))) TYPE * const a = global_data.a;
__attribute__((aligned(ALIGNMENT))) TYPE * const b = global_data.b;
__attribute__((aligned(ALIGNMENT))) TYPE * const c = global_data.c;
__attribute__((aligned(ALIGNMENT))) TYPE * const d = global_data.d;
__attribute__((aligned(ALIGNMENT))) TYPE * const e = global_data.e;
__attribute__((aligned(ALIGNMENT))) TYPE (* const aa)[LEN2] = global_data.aa;
__attribute__((aligned(ALIGNMENT))) TYPE (* const bb)[LEN2] = global_data.bb;
__attribute__((aligned(ALIGNMENT))) TYPE (* const cc)[LEN2] = global_data.cc;
__attribute__((aligned(ALIGNMENT))) TYPE (* const tt)[LEN2] = global_data.tt;

// Reduced reproducer.  For each of the 2*ntimes outer iterations, stores
// Y[i] + 1 into X[i] for every i in [0, lll).  Y is never written and the
// body does not use nl, so every outer iteration stores the same values
// into X.  Always returns 0.
int s111()
{




        // Outer repetition loop: nl is not used inside the body.
        for (int nl = 0; nl < 2*ntimes; nl++) {
                // Unit-stride element-wise loop over the two global arrays.
                for (int i = 0; i < lll; i++) {
                        X[i] = Y[i] + 1;
                }
        }
        return 0;
}

Also failed to vectorize.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Bug tree-optimization/112331] Fail vectorization after loop interchange
  2023-11-01  9:20 [Bug c/112331] New: middle-end: Fail vectorization juzhe.zhong at rivai dot ai
  2023-11-01  9:24 ` [Bug c/112331] " juzhe.zhong at rivai dot ai
  2023-11-02  7:42 ` [Bug tree-optimization/112331] " juzhe.zhong at rivai dot ai
@ 2023-11-02 10:00 ` rguenth at gcc dot gnu.org
  2023-11-02 10:14 ` juzhe.zhong at rivai dot ai
  2023-11-02 10:48 ` rguenth at gcc dot gnu.org
  4 siblings, 0 replies; 6+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-11-02 10:00 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112331

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
            Summary|middle-end: Fail            |Fail vectorization after
                   |vectorization               |loop interchange
                 CC|                            |rguenth at gcc dot gnu.org

--- Comment #3 from Richard Biener <rguenth at gcc dot gnu.org> ---
Well, the "issue" is that we are performing loop interchange on this benchmark
loop and the vectorizer doesn't like the zero-step in the then innermost loop.

It's not a practical example; nobody would write such an outer loop in practice.

There's a missed optimization in that we fail to elide the then inner loop.

The solution is to insert a use of 'a' after the inner loop, like TSVC
benchmarks usually have:

// Form of the same kernel quoted from the TSVC benchmarks for comparison.
// real_t, struct args_t, initialise_arrays, dummy, calc_checksum, iterations
// and LEN_1D are defined elsewhere and not shown here.  func_args is not
// used in the body shown.
real_t s111(struct args_t * func_args)
{
//    linear dependence testing
//    no dependence - vectorizable

    initialise_arrays(__func__);

    for (int nl = 0; nl < 2*iterations; nl++) {
        // Stride-2 loop over odd i: reads a[i - 1] (even), writes a[i] (odd).
        for (int i = 1; i < LEN_1D; i += 2) {
            a[i] = a[i - 1] + b[i];
        }
        // Call placed after the inner loop that receives 'a' and the other
        // arrays, i.e. a use of 'a' after each inner-loop pass.
        dummy(a, b, c, d, e, aa, bb, cc, 0.);
    }

    return calc_checksum(__func__);
}

then it just works(TM).

WONTFIX (in the vectorizer).  In "theory" the interchanged loop could be
vectorized by outer loop vectorization.  But as said, IMHO a waste of time
to cheat badly written benchmarks.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Bug tree-optimization/112331] Fail vectorization after loop interchange
  2023-11-01  9:20 [Bug c/112331] New: middle-end: Fail vectorization juzhe.zhong at rivai dot ai
                   ` (2 preceding siblings ...)
  2023-11-02 10:00 ` [Bug tree-optimization/112331] Fail vectorization after loop interchange rguenth at gcc dot gnu.org
@ 2023-11-02 10:14 ` juzhe.zhong at rivai dot ai
  2023-11-02 10:48 ` rguenth at gcc dot gnu.org
  4 siblings, 0 replies; 6+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2023-11-02 10:14 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112331

--- Comment #4 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
I see. 

It does vectorize it with  -fno-vect-cost-model -fno-loop-interchange:

https://gcc.godbolt.org/z/8EEWcPro3

Codegen same as LLVM.

I am gonna revisit it in GCC-15 (GCC-14 stage 1 is closing soon).

Thanks a lot!

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Bug tree-optimization/112331] Fail vectorization after loop interchange
  2023-11-01  9:20 [Bug c/112331] New: middle-end: Fail vectorization juzhe.zhong at rivai dot ai
                   ` (3 preceding siblings ...)
  2023-11-02 10:14 ` juzhe.zhong at rivai dot ai
@ 2023-11-02 10:48 ` rguenth at gcc dot gnu.org
  4 siblings, 0 replies; 6+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-11-02 10:48 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112331

--- Comment #5 from Richard Biener <rguenth at gcc dot gnu.org> ---
I'm not sure what the problem is with a zero DR step for an inner loop reference
(possibly dependence analysis runs into some unhandled cases - who knows).  The
following vectorizes the inner loop (the load is hoisted as invariant, but
the store is not sunk - there's no sinking phase after interchange).

diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index d5c9c4a11c2..7d1f0697fe7 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -2944,6 +2944,7 @@ vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
       if (!nested_in_vect_loop_p (loop, stmt_info))
        return DR_IS_READ (dr);
+#if 0
       /* Allow references with zero step for outer loops marked
         with pragma omp simd only - it guarantees absence of
         loop-carried dependencies between inner loop iterations.  */
@@ -2954,6 +2955,7 @@ vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
                             "zero step in inner loop of nest\n");
          return false;
        }
+#endif
     }

   if (loop && nested_in_vect_loop_p (loop, stmt_info))


Note when we don't vectorize we are eliding the inner loop later, when
we vectorize we don't.

unvectorized:

s111:
.LFB0:
        .cfi_startproc
        xorl    %eax, %eax
.L2:
        movl    Y(%rax), %ecx
        addq    $4, %rax
        leal    1(%rcx), %edx
        movl    %edx, X-4(%rax)
        cmpq    $128000, %rax
        jne     .L2
        xorl    %eax, %eax
        ret

vectorized:

s111:
.LFB0:
        .cfi_startproc
        movdqa  .LC0(%rip), %xmm1
        xorl    %ecx, %ecx
.L2:
        movdqa  Y(%rcx), %xmm0
        leaq    X(%rcx), %rdx
        movl    $400000, %eax
        paddd   %xmm1, %xmm0
        .p2align 4,,10
        .p2align 3
.L3:
        movaps  %xmm0, (%rdx)
        subl    $2, %eax
        jne     .L3
        addq    $16, %rcx
        cmpq    $128000, %rcx
        jne     .L2
        xorl    %eax, %eax
        ret

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2023-11-02 10:48 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-11-01  9:20 [Bug c/112331] New: middle-end: Fail vectorization juzhe.zhong at rivai dot ai
2023-11-01  9:24 ` [Bug c/112331] " juzhe.zhong at rivai dot ai
2023-11-02  7:42 ` [Bug tree-optimization/112331] " juzhe.zhong at rivai dot ai
2023-11-02 10:00 ` [Bug tree-optimization/112331] Fail vectorization after loop interchange rguenth at gcc dot gnu.org
2023-11-02 10:14 ` juzhe.zhong at rivai dot ai
2023-11-02 10:48 ` rguenth at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).