public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug tree-optimization/111793] New: OpenMP SIMD inbranch clones for AVX are highly sub-optimal
@ 2023-10-13 7:28 rguenth at gcc dot gnu.org
2023-10-13 7:33 ` [Bug tree-optimization/111793] " rguenth at gcc dot gnu.org
` (5 more replies)
0 siblings, 6 replies; 7+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-10-13 7:28 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111793
Bug ID: 111793
Summary: OpenMP SIMD inbranch clones for AVX are highly
sub-optimal
Product: gcc
Version: 14.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: rguenth at gcc dot gnu.org
Target Milestone: ---
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug tree-optimization/111793] OpenMP SIMD inbranch clones for AVX are highly sub-optimal
2023-10-13 7:28 [Bug tree-optimization/111793] New: OpenMP SIMD inbranch clones for AVX are highly sub-optimal rguenth at gcc dot gnu.org
@ 2023-10-13 7:33 ` rguenth at gcc dot gnu.org
2023-10-13 7:43 ` [Bug tree-optimization/111793] OpenMP SIMD inbranch clones for AVX512 " rguenth at gcc dot gnu.org
` (4 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-10-13 7:33 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111793
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Blocks| |53947
Target| |x86_64-*-*
Keywords| |missed-optimization
--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
#pragma omp declare simd simdlen(16) inbranch
__attribute__((noinline)) int
foo (int a, int b)
{
return a + b;
}
we present if-conversion with the following which I think is good:
<bb 3> [local count: 1073741824]:
# iter.49_5 = PHI <0(2), iter.49_6(7)>
# ivtmp_19 = PHI <16(2), ivtmp_18(7)>
b_2 = b.45[iter.49_5];
a_1 = a.44[iter.49_5];
_8 = mask.48_7(D) >> iter.49_5;
_9 = _8 & 1;
if (_9 == 0)
goto <bb 8>; [20.00%]
else
goto <bb 4>; [80.00%]
<bb 8> [local count: 214748360]:
goto <bb 5>; [100.00%]
<bb 4> [local count: 858993464]:
_3 = a_1 + b_2;
retval.43[iter.49_5] = _3;
<bb 5> [local count: 1073741824]:
iter.49_6 = iter.49_5 + 1;
ivtmp_18 = ivtmp_19 - 1;
if (ivtmp_18 != 0)
goto <bb 7>; [100.00%]
else
goto <bb 6>; [0.00%]
and we if-convert the body to
<bb 3> [local count: 1073741824]:
# iter.49_5 = PHI <iter.49_6(7), 0(15)>
# ivtmp_19 = PHI <ivtmp_18(7), 16(15)>
b_2 = b.45[iter.49_5];
a_1 = a.44[iter.49_5];
_8 = mask.48_7(D) >> iter.49_5;
_9 = _8 & 1;
_22 = _9 == 0;
_36 = (unsigned int) a_1;
_37 = (unsigned int) b_2;
_38 = _36 + _37;
_3 = (int) _38;
_39 = ~_22;
_40 = &retval.43[iter.49_5];
.MASK_STORE (_40, 32B, _39, _3);
iter.49_6 = iter.49_5 + 1;
ivtmp_18 = ivtmp_19 - 1;
if (ivtmp_18 != 0)
goto <bb 7>; [100.00%]
else
goto <bb 6>; [0.00%]
but rather than recovering the mask in its original form we vectorize this as
vect_cst__50 = {mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
mask.48_7(D), mask.48_7(D)};
<bb 3> [local count: 10631108]:
# vect_vec_iv_.125_42 = PHI <_43(7), { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15 }(2)>
# vectp_b.126_44 = PHI <vectp_b.126_45(7), &b.45(2)>
# vectp_a.129_47 = PHI <vectp_a.129_48(7), &a.44(2)>
# vectp_retval.140_61 = PHI <vectp_retval.140_62(7), &retval.43(2)>
# ivtmp_64 = PHI <ivtmp_65(7), 0(2)>
_43 = vect_vec_iv_.125_42 + { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16 };
vect_b_2.128_46 = MEM <vector(16) int> [(int *)vectp_b.126_44];
vect_a_1.131_49 = MEM <vector(16) int> [(int *)vectp_a.129_47];
vect__8.132_51 = vect_cst__50 >> vect_vec_iv_.125_42;
vect__9.133_53 = vect__8.132_51 & { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1 };
mask__22.134_55 = vect__9.133_53 == { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0 };
vect__36.135_56 = VIEW_CONVERT_EXPR<vector(16) unsigned
int>(vect_a_1.131_49);
vect__37.136_57 = VIEW_CONVERT_EXPR<vector(16) unsigned
int>(vect_b_2.128_46);
vect__38.137_58 = vect__36.135_56 + vect__37.136_57;
vect__3.138_59 = VIEW_CONVERT_EXPR<vector(16) int>(vect__38.137_58);
mask__39.139_60 = ~mask__22.134_55;
if (mask__39.139_60 == { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 })
goto <bb 20>; [20.00%]
else
goto <bb 21>; [80.00%]
<bb 21> [local count: 8504886]:
.MASK_STORE (vectp_retval.140_61, 512B, mask__39.139_60, vect__3.138_59);
<bb 20> [local count: 10631108]:
vectp_b.126_45 = vectp_b.126_44 + 64;
vectp_a.129_48 = vectp_a.129_47 + 64;
vectp_retval.140_62 = vectp_retval.140_61 + 64;
ivtmp_65 = ivtmp_64 + 1;
if (ivtmp_65 < 1)
goto <bb 7>; [0.00%]
else
goto <bb 17>; [100.00%]
in the end leaving us with
<bb 2> [local count: 1073741824]:
vect_cst__50 = {mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
mask.48_7(D), mask.48_7(D)};
vect__8.132_51 = vect_cst__50 >> { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15 };
vect__9.133_53 = vect__8.132_51 & { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1 };
vect__36.135_56 = VIEW_CONVERT_EXPR<vector(16) unsigned int>(simd.46_13(D));
vect__37.136_57 = VIEW_CONVERT_EXPR<vector(16) unsigned int>(simd.47_15(D));
vect__38.137_58 = vect__36.135_56 + vect__37.136_57;
vect__3.138_59 = VIEW_CONVERT_EXPR<vector(16) int>(vect__38.137_58);
mask__39.139_60 = vect__9.133_53 != { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0 };
if (mask__39.139_60 == { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 })
goto <bb 4>; [20.00%]
else
goto <bb 3>; [80.00%]
<bb 3> [local count: 858993419]:
.MASK_STORE (&retval.43, 512B, mask__39.139_60, vect__3.138_59);
<bb 4> [local count: 1073741824]:
_10 = VIEW_CONVERT_EXPR<vector(16) int>(retval.43);
return _10;
and
_ZGVeM16vv_foo:
.LFB4:
.cfi_startproc
movl $1, %eax
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
vpbroadcastd %edi, %zmm2
vpaddd %zmm1, %zmm0, %zmm0
vpbroadcastd %eax, %zmm3
movq %rsp, %rbp
.cfi_def_cfa_register 6
andq $-64, %rsp
vpsrlvd .LC0(%rip), %zmm2, %zmm2
vpxor %xmm1, %xmm1, %xmm1
vpandd %zmm3, %zmm2, %zmm2
vpcmpd $4, %zmm1, %zmm2, %k1
kortestw %k1, %k1
je .L25
vmovdqa32 %zmm0, -64(%rsp){%k1}
.L25:
vmovdqa32 -64(%rsp), %zmm0
leave
.cfi_def_cfa 7, 8
ret
Referenced Bugs:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53947
[Bug 53947] [meta-bug] vectorizer missed-optimizations
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug tree-optimization/111793] OpenMP SIMD inbranch clones for AVX512 are highly sub-optimal
2023-10-13 7:28 [Bug tree-optimization/111793] New: OpenMP SIMD inbranch clones for AVX are highly sub-optimal rguenth at gcc dot gnu.org
2023-10-13 7:33 ` [Bug tree-optimization/111793] " rguenth at gcc dot gnu.org
@ 2023-10-13 7:43 ` rguenth at gcc dot gnu.org
2023-10-13 9:08 ` jakub at gcc dot gnu.org
` (3 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-10-13 7:43 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111793
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Status|UNCONFIRMED |NEW
Last reconfirmed| |2023-10-13
Ever confirmed|0 |1
--- Comment #2 from Richard Biener <rguenth at gcc dot gnu.org> ---
For AVX2 and simdlen(8) we get
<bb 2> [local count: 1073741824]:
vect__35.98_51 = VIEW_CONVERT_EXPR<vector(8) unsigned int>(simd.25_11(D));
vect__36.99_52 = VIEW_CONVERT_EXPR<vector(8) unsigned int>(simd.26_13(D));
vect__37.100_53 = vect__35.98_51 + vect__36.99_52;
vect__3.101_54 = VIEW_CONVERT_EXPR<vector(8) int>(vect__37.100_53);
mask__38.102_55 = mask.27_15(D) != { 0, 0, 0, 0, 0, 0, 0, 0 };
if (mask__38.102_55 == { 0, 0, 0, 0, 0, 0, 0, 0 })
goto <bb 4>; [20.00%]
else
goto <bb 3>; [80.00%]
<bb 3> [local count: 858993419]:
.MASK_STORE (&retval.21, 256B, mask__38.102_55, vect__3.101_54);
<bb 4> [local count: 1073741824]:
_8 = VIEW_CONVERT_EXPR<vector(8) int>(retval.21);
return _8;
which is much more reasonable - I'm not sure whether the compare against 0
is required or if the ABI guarantees either 0 or -1 in the elements. That
we end up with memory due to the use of a .MASK_STORE unfortunately persists
to the final assembly:
_ZGVdM8vv_foo:
.LFB3:
.cfi_startproc
vpaddd %ymm1, %ymm0, %ymm0
vpxor %xmm1, %xmm1, %xmm1
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
vpcmpeqd %ymm1, %ymm2, %ymm2
movq %rsp, %rbp
.cfi_def_cfa_register 6
andq $-32, %rsp
vpcmpeqd %ymm1, %ymm2, %ymm2
vptest %ymm2, %ymm2
je .L12
vpmaskmovd %ymm0, %ymm2, -32(%rsp)
.L12:
vmovdqa -32(%rsp), %ymm0
leave
.cfi_def_cfa 7, 8
ret
there's the opportunity to maybe rewrite .MASK_STORE to a VEC_COND_EXPR
and rewriting retval.21 to SSA. Maybe if-conversion can see this already
given retval.21 is local automatic and not address taken.
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug tree-optimization/111793] OpenMP SIMD inbranch clones for AVX512 are highly sub-optimal
2023-10-13 7:28 [Bug tree-optimization/111793] New: OpenMP SIMD inbranch clones for AVX are highly sub-optimal rguenth at gcc dot gnu.org
2023-10-13 7:33 ` [Bug tree-optimization/111793] " rguenth at gcc dot gnu.org
2023-10-13 7:43 ` [Bug tree-optimization/111793] OpenMP SIMD inbranch clones for AVX512 " rguenth at gcc dot gnu.org
@ 2023-10-13 9:08 ` jakub at gcc dot gnu.org
2023-10-13 9:13 ` jakub at gcc dot gnu.org
` (2 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: jakub at gcc dot gnu.org @ 2023-10-13 9:08 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111793
Jakub Jelinek <jakub at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |jakub at gcc dot gnu.org
--- Comment #3 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
(In reply to Richard Biener from comment #2)
> which is much more reasonable - I'm not sure whether the compare against 0
> is required or if the ABI guarantees either 0 or -1 in the elements.
Yes, it does — all-ones elements, that is; the element type can be unsigned
integral or floating point as well...
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug tree-optimization/111793] OpenMP SIMD inbranch clones for AVX512 are highly sub-optimal
2023-10-13 7:28 [Bug tree-optimization/111793] New: OpenMP SIMD inbranch clones for AVX are highly sub-optimal rguenth at gcc dot gnu.org
` (2 preceding siblings ...)
2023-10-13 9:08 ` jakub at gcc dot gnu.org
@ 2023-10-13 9:13 ` jakub at gcc dot gnu.org
2023-10-13 9:39 ` rguenth at gcc dot gnu.org
2024-06-14 13:26 ` rguenth at gcc dot gnu.org
5 siblings, 0 replies; 7+ messages in thread
From: jakub at gcc dot gnu.org @ 2023-10-13 9:13 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111793
--- Comment #4 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
So, shouldn't we match.pd (or something else) pattern match
vect_cst__50 = {mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
mask.48_7(D), mask.48_7(D)};
vect__8.132_51 = vect_cst__50 >> { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15 };
vect__9.133_53 = vect__8.132_51 & { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1 };
mask__39.139_60 = vect__9.133_53 != { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0 };
back into mask__39.139_60 = mask.48_7(D); ?
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug tree-optimization/111793] OpenMP SIMD inbranch clones for AVX512 are highly sub-optimal
2023-10-13 7:28 [Bug tree-optimization/111793] New: OpenMP SIMD inbranch clones for AVX are highly sub-optimal rguenth at gcc dot gnu.org
` (3 preceding siblings ...)
2023-10-13 9:13 ` jakub at gcc dot gnu.org
@ 2023-10-13 9:39 ` rguenth at gcc dot gnu.org
2024-06-14 13:26 ` rguenth at gcc dot gnu.org
5 siblings, 0 replies; 7+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-10-13 9:39 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111793
--- Comment #5 from Richard Biener <rguenth at gcc dot gnu.org> ---
(In reply to Jakub Jelinek from comment #4)
> So, shouldn't we match.pd (or something else) pattern match
> vect_cst__50 = {mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
> mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
> mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
> mask.48_7(D), mask.48_7(D)};
> vect__8.132_51 = vect_cst__50 >> { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
> 12, 13, 14, 15 };
> vect__9.133_53 = vect__8.132_51 & { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
> 1, 1, 1 };
> mask__39.139_60 = vect__9.133_53 != { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> 0, 0, 0, 0 };
> back into mask__39.139_60 = mask.48_7(D); ?
Yes, that's a possibility. I wonder if it's possible to arrange things in the
vectorizer itself so that costing gets more accurate (probably not that
important for OMP SIMD though).
Maybe it works a bit better if we did mask & (1 << iv), but I guess we
canonicalize that back.
I've opened this for tracking for now, working on PR111795 first.
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug tree-optimization/111793] OpenMP SIMD inbranch clones for AVX512 are highly sub-optimal
2023-10-13 7:28 [Bug tree-optimization/111793] New: OpenMP SIMD inbranch clones for AVX are highly sub-optimal rguenth at gcc dot gnu.org
` (4 preceding siblings ...)
2023-10-13 9:39 ` rguenth at gcc dot gnu.org
@ 2024-06-14 13:26 ` rguenth at gcc dot gnu.org
5 siblings, 0 replies; 7+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-06-14 13:26 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111793
--- Comment #6 from Richard Biener <rguenth at gcc dot gnu.org> ---
non-openmp testcase for the AVX512 mask inefficiency:
unsigned foo (unsigned *a, unsigned short mask)
{
unsigned sum = 0;
for (int i = 0; i < 16; ++i)
if ((mask >> i) & 1)
sum += a[i];
return sum;
}
I think the AVX2 one is settled as being as good as it can get.
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2024-06-14 13:26 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-13 7:28 [Bug tree-optimization/111793] New: OpenMP SIMD inbranch clones for AVX are highly sub-optimal rguenth at gcc dot gnu.org
2023-10-13 7:33 ` [Bug tree-optimization/111793] " rguenth at gcc dot gnu.org
2023-10-13 7:43 ` [Bug tree-optimization/111793] OpenMP SIMD inbranch clones for AVX512 " rguenth at gcc dot gnu.org
2023-10-13 9:08 ` jakub at gcc dot gnu.org
2023-10-13 9:13 ` jakub at gcc dot gnu.org
2023-10-13 9:39 ` rguenth at gcc dot gnu.org
2024-06-14 13:26 ` rguenth at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).