public inbox for gcc-bugs@sourceware.org help / color / mirror / Atom feed
* [Bug c/112331] New: middle-end: Fail vectorization @ 2023-11-01 9:20 juzhe.zhong at rivai dot ai 2023-11-01 9:24 ` [Bug c/112331] " juzhe.zhong at rivai dot ai ` (4 more replies) 0 siblings, 5 replies; 6+ messages in thread From: juzhe.zhong at rivai dot ai @ 2023-11-01 9:20 UTC (permalink / raw) To: gcc-bugs https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112331 Bug ID: 112331 Summary: middle-end: Fail vectorization Product: gcc Version: 14.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: c Assignee: unassigned at gcc dot gnu.org Reporter: juzhe.zhong at rivai dot ai Target Milestone: --- https://gcc.godbolt.org/z/x7GGzezGh #include <stdio.h> #define LEN 32000 #define ntimes 200000 #define TYPE float #define lll LEN #define LEN2 256 #define ALIGNMENT 16 __attribute__ ((aligned(ALIGNMENT))) TYPE X[lll],Y[lll],Z[lll],U[lll],V[lll]; struct GlobalData { __attribute__((aligned(ALIGNMENT))) TYPE a[LEN]; int pad1[3]; __attribute__((aligned(ALIGNMENT))) TYPE b[LEN]; int pad2[5]; __attribute__((aligned(ALIGNMENT))) TYPE c[LEN]; int pad3[7]; __attribute__((aligned(ALIGNMENT))) TYPE d[LEN]; int pad4[11]; __attribute__((aligned(ALIGNMENT))) TYPE e[LEN]; int pad5[13]; __attribute__((aligned(ALIGNMENT))) TYPE aa[LEN2][LEN2]; int pad6[17]; __attribute__((aligned(ALIGNMENT))) TYPE bb[LEN2][LEN2]; int pad7[19]; __attribute__((aligned(ALIGNMENT))) TYPE cc[LEN2][LEN2]; int pad8[23]; __attribute__((aligned(ALIGNMENT))) TYPE tt[LEN2][LEN2]; } global_data; __attribute__((aligned(ALIGNMENT))) TYPE * const a = global_data.a; __attribute__((aligned(ALIGNMENT))) TYPE * const b = global_data.b; __attribute__((aligned(ALIGNMENT))) TYPE * const c = global_data.c; __attribute__((aligned(ALIGNMENT))) TYPE * const d = global_data.d; __attribute__((aligned(ALIGNMENT))) TYPE * const e = global_data.e; __attribute__((aligned(ALIGNMENT))) TYPE (* const aa)[LEN2] = global_data.aa; __attribute__((aligned(ALIGNMENT))) TYPE (* const bb)[LEN2] = global_data.bb; 
__attribute__((aligned(ALIGNMENT))) TYPE (* const cc)[LEN2] = global_data.cc; __attribute__((aligned(ALIGNMENT))) TYPE (* const tt)[LEN2] = global_data.tt; int foo() { // linear dependence testing // no dependence - vectorizable for (int nl = 0; nl < 2*ntimes; nl++) { // #pragma vector always for (int i = 1; i < LEN; i += 2) { a[i] = a[i - 1] + b[i]; } } return 0; } Both RVV and ARM SVE failed to vectorize it whereas Clang can vectorize it. ^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug c/112331] middle-end: Fail vectorization 2023-11-01 9:20 [Bug c/112331] New: middle-end: Fail vectorization juzhe.zhong at rivai dot ai @ 2023-11-01 9:24 ` juzhe.zhong at rivai dot ai 2023-11-02 7:42 ` [Bug tree-optimization/112331] " juzhe.zhong at rivai dot ai ` (3 subsequent siblings) 4 siblings, 0 replies; 6+ messages in thread From: juzhe.zhong at rivai dot ai @ 2023-11-01 9:24 UTC (permalink / raw) To: gcc-bugs https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112331 --- Comment #1 from JuzheZhong <juzhe.zhong at rivai dot ai> --- I suspect it is an SRA issue again? ^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug tree-optimization/112331] middle-end: Fail vectorization 2023-11-01 9:20 [Bug c/112331] New: middle-end: Fail vectorization juzhe.zhong at rivai dot ai 2023-11-01 9:24 ` [Bug c/112331] " juzhe.zhong at rivai dot ai @ 2023-11-02 7:42 ` juzhe.zhong at rivai dot ai 2023-11-02 10:00 ` [Bug tree-optimization/112331] Fail vectorization after loop interchange rguenth at gcc dot gnu.org ` (2 subsequent siblings) 4 siblings, 0 replies; 6+ messages in thread From: juzhe.zhong at rivai dot ai @ 2023-11-02 7:42 UTC (permalink / raw) To: gcc-bugs https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112331 --- Comment #2 from JuzheZhong <juzhe.zhong at rivai dot ai> --- Reduced case: #include <stdio.h> #define LEN 32000 #define ntimes 200000 #define TYPE int #define lll LEN #define LEN2 256 #define ALIGNMENT 16 __attribute__ ((aligned(ALIGNMENT))) TYPE X[lll],Y[lll],Z[lll],U[lll],V[lll]; struct GlobalData { __attribute__((aligned(ALIGNMENT))) TYPE a[LEN]; int pad1[3]; __attribute__((aligned(ALIGNMENT))) TYPE b[LEN]; int pad2[5]; __attribute__((aligned(ALIGNMENT))) TYPE c[LEN]; int pad3[7]; __attribute__((aligned(ALIGNMENT))) TYPE d[LEN]; int pad4[11]; __attribute__((aligned(ALIGNMENT))) TYPE e[LEN]; int pad5[13]; __attribute__((aligned(ALIGNMENT))) TYPE aa[LEN2][LEN2]; int pad6[17]; __attribute__((aligned(ALIGNMENT))) TYPE bb[LEN2][LEN2]; int pad7[19]; __attribute__((aligned(ALIGNMENT))) TYPE cc[LEN2][LEN2]; int pad8[23]; __attribute__((aligned(ALIGNMENT))) TYPE tt[LEN2][LEN2]; } global_data; __attribute__((aligned(ALIGNMENT))) TYPE * const a = global_data.a; __attribute__((aligned(ALIGNMENT))) TYPE * const b = global_data.b; __attribute__((aligned(ALIGNMENT))) TYPE * const c = global_data.c; __attribute__((aligned(ALIGNMENT))) TYPE * const d = global_data.d; __attribute__((aligned(ALIGNMENT))) TYPE * const e = global_data.e; __attribute__((aligned(ALIGNMENT))) TYPE (* const aa)[LEN2] = global_data.aa; __attribute__((aligned(ALIGNMENT))) TYPE (* const bb)[LEN2] = global_data.bb; 
__attribute__((aligned(ALIGNMENT))) TYPE (* const cc)[LEN2] = global_data.cc; __attribute__((aligned(ALIGNMENT))) TYPE (* const tt)[LEN2] = global_data.tt; int s111() { for (int nl = 0; nl < 2*ntimes; nl++) { for (int i = 0; i < lll; i++) { X[i] = Y[i] + 1; } } return 0; } Also failed to vectorize. ^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug tree-optimization/112331] Fail vectorization after loop interchange 2023-11-01 9:20 [Bug c/112331] New: middle-end: Fail vectorization juzhe.zhong at rivai dot ai 2023-11-01 9:24 ` [Bug c/112331] " juzhe.zhong at rivai dot ai 2023-11-02 7:42 ` [Bug tree-optimization/112331] " juzhe.zhong at rivai dot ai @ 2023-11-02 10:00 ` rguenth at gcc dot gnu.org 2023-11-02 10:14 ` juzhe.zhong at rivai dot ai 2023-11-02 10:48 ` rguenth at gcc dot gnu.org 4 siblings, 0 replies; 6+ messages in thread From: rguenth at gcc dot gnu.org @ 2023-11-02 10:00 UTC (permalink / raw) To: gcc-bugs https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112331 Richard Biener <rguenth at gcc dot gnu.org> changed: What |Removed |Added ---------------------------------------------------------------------------- Summary|middle-end: Fail |Fail vectorization after |vectorization |loop interchange CC| |rguenth at gcc dot gnu.org --- Comment #3 from Richard Biener <rguenth at gcc dot gnu.org> --- Well, the "issue" is that we are performing loop interchange on this benchmark loop and the vectorizer doesn't like the zero-step in the then innermost loop. It's not a practical example, nobody would do such an outer loop in practice. There's a missed optimization in that we fail to elide the then inner loop. The solution is to insert a use of 'a' after the inner loop, like TSVC benchmarks usually have: real_t s111(struct args_t * func_args) { // linear dependence testing // no dependence - vectorizable initialise_arrays(__func__); for (int nl = 0; nl < 2*iterations; nl++) { for (int i = 1; i < LEN_1D; i += 2) { a[i] = a[i - 1] + b[i]; } dummy(a, b, c, d, e, aa, bb, cc, 0.); } return calc_checksum(__func__); } then it just works(TM). WONTFIX (in the vectorizer). In "theory" the interchanged loop could be vectorized by outer loop vectorization. But as said, IMHO a waste of time to cheat badly written benchmarks. ^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug tree-optimization/112331] Fail vectorization after loop interchange 2023-11-01 9:20 [Bug c/112331] New: middle-end: Fail vectorization juzhe.zhong at rivai dot ai ` (2 preceding siblings ...) 2023-11-02 10:00 ` [Bug tree-optimization/112331] Fail vectorization after loop interchange rguenth at gcc dot gnu.org @ 2023-11-02 10:14 ` juzhe.zhong at rivai dot ai 2023-11-02 10:48 ` rguenth at gcc dot gnu.org 4 siblings, 0 replies; 6+ messages in thread From: juzhe.zhong at rivai dot ai @ 2023-11-02 10:14 UTC (permalink / raw) To: gcc-bugs https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112331 --- Comment #4 from JuzheZhong <juzhe.zhong at rivai dot ai> --- I see. It does vectorize it with -fno-vect-cost-model -fno-loop-interchange: https://gcc.godbolt.org/z/8EEWcPro3 Codegen same as LLVM. I am gonna revisit it in GCC-15 (GCC-14 stage 1 is closing soon). Thanks a lot! ^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug tree-optimization/112331] Fail vectorization after loop interchange 2023-11-01 9:20 [Bug c/112331] New: middle-end: Fail vectorization juzhe.zhong at rivai dot ai ` (3 preceding siblings ...) 2023-11-02 10:14 ` juzhe.zhong at rivai dot ai @ 2023-11-02 10:48 ` rguenth at gcc dot gnu.org 4 siblings, 0 replies; 6+ messages in thread From: rguenth at gcc dot gnu.org @ 2023-11-02 10:48 UTC (permalink / raw) To: gcc-bugs https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112331 --- Comment #5 from Richard Biener <rguenth at gcc dot gnu.org> --- I'm not sure what the problem is with a zero DR step for an inner loop reference (possibly dependence analysis runs into some unhandled cases - who knows). The following vectorizes the inner loop (the load is hoisted as invariant, but the store is not sunk - there's no sinking phase after interchange). diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc index d5c9c4a11c2..7d1f0697fe7 100644 --- a/gcc/tree-vect-data-refs.cc +++ b/gcc/tree-vect-data-refs.cc @@ -2944,6 +2944,7 @@ vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info) DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL; if (!nested_in_vect_loop_p (loop, stmt_info)) return DR_IS_READ (dr); +#if 0 /* Allow references with zero step for outer loops marked with pragma omp simd only - it guarantees absence of loop-carried dependencies between inner loop iterations. */ @@ -2954,6 +2955,7 @@ vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info) "zero step in inner loop of nest\n"); return false; } +#endif } if (loop && nested_in_vect_loop_p (loop, stmt_info)) Note when we don't vectorize we are eliding the inner loop later, when we vectorize we don't. 
unvectorized: s111: .LFB0: .cfi_startproc xorl %eax, %eax .L2: movl Y(%rax), %ecx addq $4, %rax leal 1(%rcx), %edx movl %edx, X-4(%rax) cmpq $128000, %rax jne .L2 xorl %eax, %eax ret vectorized: s111: .LFB0: .cfi_startproc movdqa .LC0(%rip), %xmm1 xorl %ecx, %ecx .L2: movdqa Y(%rcx), %xmm0 leaq X(%rcx), %rdx movl $400000, %eax paddd %xmm1, %xmm0 .p2align 4,,10 .p2align 3 .L3: movaps %xmm0, (%rdx) subl $2, %eax jne .L3 addq $16, %rcx cmpq $128000, %rcx jne .L2 xorl %eax, %eax ret ^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2023-11-02 10:48 UTC | newest] Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2023-11-01 9:20 [Bug c/112331] New: middle-end: Fail vectorization juzhe.zhong at rivai dot ai 2023-11-01 9:24 ` [Bug c/112331] " juzhe.zhong at rivai dot ai 2023-11-02 7:42 ` [Bug tree-optimization/112331] " juzhe.zhong at rivai dot ai 2023-11-02 10:00 ` [Bug tree-optimization/112331] Fail vectorization after loop interchange rguenth at gcc dot gnu.org 2023-11-02 10:14 ` juzhe.zhong at rivai dot ai 2023-11-02 10:48 ` rguenth at gcc dot gnu.org
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).