public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug c/112331] New: middle-end: Fail vectorization
@ 2023-11-01 9:20 juzhe.zhong at rivai dot ai
2023-11-01 9:24 ` [Bug c/112331] " juzhe.zhong at rivai dot ai
` (4 more replies)
0 siblings, 5 replies; 6+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2023-11-01 9:20 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112331
Bug ID: 112331
Summary: middle-end: Fail vectorization
Product: gcc
Version: 14.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c
Assignee: unassigned at gcc dot gnu.org
Reporter: juzhe.zhong at rivai dot ai
Target Milestone: ---
https://gcc.godbolt.org/z/x7GGzezGh
/* Reproducer setup: size macros, element type, and the global arrays used by the test loop. */
#include <stdio.h>
/* Element count of each one-dimensional array. */
#define LEN 32000
/* Repeat-count base; the outer loop of the test function runs 2*ntimes times. */
#define ntimes 200000
/* Element type of every data array in this reproducer. */
#define TYPE float
/* Alias for LEN, used as the length of the stand-alone arrays below. */
#define lll LEN
/* Row and column count of the two-dimensional arrays. */
#define LEN2 256
/* Value passed to every aligned attribute below. */
#define ALIGNMENT 16
/* Stand-alone global arrays of lll elements each. */
__attribute__ ((aligned(ALIGNMENT))) TYPE X[lll],Y[lll],Z[lll],U[lll],V[lll];
/* The remaining data arrays are members of one struct; int pad arrays of
   differing lengths (3, 5, 7, 11, 13, 17, 19, 23) sit between them. */
struct GlobalData {
__attribute__((aligned(ALIGNMENT))) TYPE a[LEN];
int pad1[3];
__attribute__((aligned(ALIGNMENT))) TYPE b[LEN];
int pad2[5];
__attribute__((aligned(ALIGNMENT))) TYPE c[LEN];
int pad3[7];
__attribute__((aligned(ALIGNMENT))) TYPE d[LEN];
int pad4[11];
__attribute__((aligned(ALIGNMENT))) TYPE e[LEN];
int pad5[13];
__attribute__((aligned(ALIGNMENT))) TYPE aa[LEN2][LEN2];
int pad6[17];
__attribute__((aligned(ALIGNMENT))) TYPE bb[LEN2][LEN2];
int pad7[19];
__attribute__((aligned(ALIGNMENT))) TYPE cc[LEN2][LEN2];
int pad8[23];
__attribute__((aligned(ALIGNMENT))) TYPE tt[LEN2][LEN2];
} global_data;
/* Const pointers that give short names to the members of global_data. */
__attribute__((aligned(ALIGNMENT))) TYPE * const a = global_data.a;
__attribute__((aligned(ALIGNMENT))) TYPE * const b = global_data.b;
__attribute__((aligned(ALIGNMENT))) TYPE * const c = global_data.c;
__attribute__((aligned(ALIGNMENT))) TYPE * const d = global_data.d;
__attribute__((aligned(ALIGNMENT))) TYPE * const e = global_data.e;
/* Same for the two-dimensional members: each is a pointer to rows of LEN2 elements. */
__attribute__((aligned(ALIGNMENT))) TYPE (* const aa)[LEN2] = global_data.aa;
__attribute__((aligned(ALIGNMENT))) TYPE (* const bb)[LEN2] = global_data.bb;
__attribute__((aligned(ALIGNMENT))) TYPE (* const cc)[LEN2] = global_data.cc;
__attribute__((aligned(ALIGNMENT))) TYPE (* const tt)[LEN2] = global_data.tt;
/* Test loop from the report: repeats the update a[i] = a[i - 1] + b[i] over
   odd i in [1, LEN). Always returns 0. */
int foo()
{
// linear dependence testing
// no dependence - vectorizable
/* Repetition loop: runs 2*ntimes times; nl is not used in the body. */
for (int nl = 0; nl < 2*ntimes; nl++) {
// #pragma vector always
/* i takes odd values only, so each store hits an odd index of a while the
   a[i - 1] load reads an even index. */
for (int i = 1; i < LEN; i += 2) {
a[i] = a[i - 1] + b[i];
}
}
return 0;
}
Both RVV and ARM SVE failed to vectorize it, whereas Clang can vectorize it.
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug c/112331] middle-end: Fail vectorization
2023-11-01 9:20 [Bug c/112331] New: middle-end: Fail vectorization juzhe.zhong at rivai dot ai
@ 2023-11-01 9:24 ` juzhe.zhong at rivai dot ai
2023-11-02 7:42 ` [Bug tree-optimization/112331] " juzhe.zhong at rivai dot ai
` (3 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2023-11-01 9:24 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112331
--- Comment #1 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
I suspect it is an SRA issue again?
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug tree-optimization/112331] middle-end: Fail vectorization
2023-11-01 9:20 [Bug c/112331] New: middle-end: Fail vectorization juzhe.zhong at rivai dot ai
2023-11-01 9:24 ` [Bug c/112331] " juzhe.zhong at rivai dot ai
@ 2023-11-02 7:42 ` juzhe.zhong at rivai dot ai
2023-11-02 10:00 ` [Bug tree-optimization/112331] Fail vectorization after loop interchange rguenth at gcc dot gnu.org
` (2 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2023-11-02 7:42 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112331
--- Comment #2 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
Reduced case:
/* Reduced-case setup: same globals as the original report, with TYPE set to int. */
#include <stdio.h>
/* Element count of each one-dimensional array. */
#define LEN 32000
/* Repeat-count base; the outer loop of the test function runs 2*ntimes times. */
#define ntimes 200000
/* Element type of every data array in this reduced case. */
#define TYPE int
/* Alias for LEN, used as the length of the stand-alone arrays and as the inner loop bound. */
#define lll LEN
/* Row and column count of the two-dimensional arrays. */
#define LEN2 256
/* Value passed to every aligned attribute below. */
#define ALIGNMENT 16
/* Stand-alone global arrays of lll elements each; the reduced loop uses X and Y. */
__attribute__ ((aligned(ALIGNMENT))) TYPE X[lll],Y[lll],Z[lll],U[lll],V[lll];
/* The remaining data arrays are members of one struct; int pad arrays of
   differing lengths (3, 5, 7, 11, 13, 17, 19, 23) sit between them. */
struct GlobalData {
__attribute__((aligned(ALIGNMENT))) TYPE a[LEN];
int pad1[3];
__attribute__((aligned(ALIGNMENT))) TYPE b[LEN];
int pad2[5];
__attribute__((aligned(ALIGNMENT))) TYPE c[LEN];
int pad3[7];
__attribute__((aligned(ALIGNMENT))) TYPE d[LEN];
int pad4[11];
__attribute__((aligned(ALIGNMENT))) TYPE e[LEN];
int pad5[13];
__attribute__((aligned(ALIGNMENT))) TYPE aa[LEN2][LEN2];
int pad6[17];
__attribute__((aligned(ALIGNMENT))) TYPE bb[LEN2][LEN2];
int pad7[19];
__attribute__((aligned(ALIGNMENT))) TYPE cc[LEN2][LEN2];
int pad8[23];
__attribute__((aligned(ALIGNMENT))) TYPE tt[LEN2][LEN2];
} global_data;
/* Const pointers that give short names to the members of global_data. */
__attribute__((aligned(ALIGNMENT))) TYPE * const a = global_data.a;
__attribute__((aligned(ALIGNMENT))) TYPE * const b = global_data.b;
__attribute__((aligned(ALIGNMENT))) TYPE * const c = global_data.c;
__attribute__((aligned(ALIGNMENT))) TYPE * const d = global_data.d;
__attribute__((aligned(ALIGNMENT))) TYPE * const e = global_data.e;
/* Same for the two-dimensional members: each is a pointer to rows of LEN2 elements. */
__attribute__((aligned(ALIGNMENT))) TYPE (* const aa)[LEN2] = global_data.aa;
__attribute__((aligned(ALIGNMENT))) TYPE (* const bb)[LEN2] = global_data.bb;
__attribute__((aligned(ALIGNMENT))) TYPE (* const cc)[LEN2] = global_data.cc;
__attribute__((aligned(ALIGNMENT))) TYPE (* const tt)[LEN2] = global_data.tt;
/* Reduced test loop: stores Y[i] + 1 into X[i] for every i in [0, lll),
   repeated 2*ntimes times. Always returns 0. */
int s111()
{
/* The body neither uses nl nor writes Y, so every repetition stores the
   same values into X. */
for (int nl = 0; nl < 2*ntimes; nl++) {
for (int i = 0; i < lll; i++) {
X[i] = Y[i] + 1;
}
}
return 0;
}
Also failed to vectorize.
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug tree-optimization/112331] Fail vectorization after loop interchange
2023-11-01 9:20 [Bug c/112331] New: middle-end: Fail vectorization juzhe.zhong at rivai dot ai
2023-11-01 9:24 ` [Bug c/112331] " juzhe.zhong at rivai dot ai
2023-11-02 7:42 ` [Bug tree-optimization/112331] " juzhe.zhong at rivai dot ai
@ 2023-11-02 10:00 ` rguenth at gcc dot gnu.org
2023-11-02 10:14 ` juzhe.zhong at rivai dot ai
2023-11-02 10:48 ` rguenth at gcc dot gnu.org
4 siblings, 0 replies; 6+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-11-02 10:00 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112331
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Summary|middle-end: Fail |Fail vectorization after
|vectorization |loop interchange
CC| |rguenth at gcc dot gnu.org
--- Comment #3 from Richard Biener <rguenth at gcc dot gnu.org> ---
Well, the "issue" is that we are performing loop interchange on this benchmark
loop and the vectorizer doesn't like the zero-step in the then innermost loop.
It's not a practical example; nobody would write such an outer loop in practice.
There's a missed optimization in that we fail to elide the then inner loop.
The solution is to insert a use of 'a' after the inner loop, like TSVC
benchmarks usually have:
/* TSVC-style form of the kernel: the same odd-index inner loop, followed by a
   call that receives the arrays after every inner-loop pass. */
real_t s111(struct args_t * func_args)
{
// linear dependence testing
// no dependence - vectorizable
/* NOTE(review): initialise_arrays is not shown here; presumably it sets up
   the data arrays for this kernel - confirm against the benchmark source. */
initialise_arrays(__func__);
for (int nl = 0; nl < 2*iterations; nl++) {
for (int i = 1; i < LEN_1D; i += 2) {
a[i] = a[i - 1] + b[i];
}
/* This call is the use of 'a' after the inner loop that the surrounding
   text refers to; dummy itself is not shown here. */
dummy(a, b, c, d, e, aa, bb, cc, 0.);
}
/* NOTE(review): calc_checksum is defined elsewhere; its result is returned as-is. */
return calc_checksum(__func__);
}
then it just works(TM).
WONTFIX (in the vectorizer). In "theory" the interchanged loop could be
vectorized by outer loop vectorization. But as said, IMHO a waste of time
to cheat badly written benchmarks.
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug tree-optimization/112331] Fail vectorization after loop interchange
2023-11-01 9:20 [Bug c/112331] New: middle-end: Fail vectorization juzhe.zhong at rivai dot ai
` (2 preceding siblings ...)
2023-11-02 10:00 ` [Bug tree-optimization/112331] Fail vectorization after loop interchange rguenth at gcc dot gnu.org
@ 2023-11-02 10:14 ` juzhe.zhong at rivai dot ai
2023-11-02 10:48 ` rguenth at gcc dot gnu.org
4 siblings, 0 replies; 6+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2023-11-02 10:14 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112331
--- Comment #4 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
I see.
It does vectorize it with -fno-vect-cost-model -fno-loop-interchange:
https://gcc.godbolt.org/z/8EEWcPro3
Codegen same as LLVM.
I am gonna revisit it in GCC-15 (GCC-14 stage 1 is closing soon).
Thanks a lot!
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug tree-optimization/112331] Fail vectorization after loop interchange
2023-11-01 9:20 [Bug c/112331] New: middle-end: Fail vectorization juzhe.zhong at rivai dot ai
` (3 preceding siblings ...)
2023-11-02 10:14 ` juzhe.zhong at rivai dot ai
@ 2023-11-02 10:48 ` rguenth at gcc dot gnu.org
4 siblings, 0 replies; 6+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-11-02 10:48 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112331
--- Comment #5 from Richard Biener <rguenth at gcc dot gnu.org> ---
I'm not sure what the problem is with a zero DR step for an inner loop
reference
(possibly dependence analysis runs into some unhandled cases - who knows). The
following vectorizes the inner loop (the load is hoisted as invariant, but
the store is not sunk - there's no sinking phase after interchange).
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index d5c9c4a11c2..7d1f0697fe7 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -2944,6 +2944,7 @@ vect_analyze_data_ref_access (vec_info *vinfo,
dr_vec_info *dr_info)
DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
if (!nested_in_vect_loop_p (loop, stmt_info))
return DR_IS_READ (dr);
+#if 0
/* Allow references with zero step for outer loops marked
with pragma omp simd only - it guarantees absence of
loop-carried dependencies between inner loop iterations. */
@@ -2954,6 +2955,7 @@ vect_analyze_data_ref_access (vec_info *vinfo,
dr_vec_info *dr_info)
"zero step in inner loop of nest\n");
return false;
}
+#endif
}
if (loop && nested_in_vect_loop_p (loop, stmt_info))
Note when we don't vectorize we are eliding the inner loop later, when
we vectorize we don't.
unvectorized:
s111:
.LFB0:
.cfi_startproc
xorl %eax, %eax
.L2:
movl Y(%rax), %ecx
addq $4, %rax
leal 1(%rcx), %edx
movl %edx, X-4(%rax)
cmpq $128000, %rax
jne .L2
xorl %eax, %eax
ret
vectorized:
s111:
.LFB0:
.cfi_startproc
movdqa .LC0(%rip), %xmm1
xorl %ecx, %ecx
.L2:
movdqa Y(%rcx), %xmm0
leaq X(%rcx), %rdx
movl $400000, %eax
paddd %xmm1, %xmm0
.p2align 4,,10
.p2align 3
.L3:
movaps %xmm0, (%rdx)
subl $2, %eax
jne .L3
addq $16, %rcx
cmpq $128000, %rcx
jne .L2
xorl %eax, %eax
ret
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2023-11-02 10:48 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-11-01 9:20 [Bug c/112331] New: middle-end: Fail vectorization juzhe.zhong at rivai dot ai
2023-11-01 9:24 ` [Bug c/112331] " juzhe.zhong at rivai dot ai
2023-11-02 7:42 ` [Bug tree-optimization/112331] " juzhe.zhong at rivai dot ai
2023-11-02 10:00 ` [Bug tree-optimization/112331] Fail vectorization after loop interchange rguenth at gcc dot gnu.org
2023-11-02 10:14 ` juzhe.zhong at rivai dot ai
2023-11-02 10:48 ` rguenth at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).