public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals
@ 2024-06-25 5:44 tnfchris at gcc dot gnu.org
2024-06-25 5:47 ` [Bug tree-optimization/115629] " pinskia at gcc dot gnu.org
` (9 more replies)
0 siblings, 10 replies; 11+ messages in thread
From: tnfchris at gcc dot gnu.org @ 2024-06-25 5:44 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629
Bug ID: 115629
Summary: Inefficient if-convert of masked conditionals
Product: gcc
Version: 15.0
Status: UNCONFIRMED
Keywords: missed-optimization
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: tnfchris at gcc dot gnu.org
Target Milestone: ---
With cases such as:
void foo1 (int *restrict a, int *restrict b, int *restrict c,
int *restrict d, int *restrict res, int n)
{
for (int i = 0; i < n; i++)
res[i] = a[i] ? b[i] : (c[i] ? b[i] : d[i]);
}
we generate:
foo1:
cmp w5, 0
ble .L1
mov x6, 0
whilelo p7.s, wzr, w5
ptrue p3.b, all
.L3:
ld1w z31.s, p7/z, [x0, x6, lsl 2]
cmpeq p6.s, p7/z, z31.s, #0
cmpne p5.s, p7/z, z31.s, #0
ld1w z0.s, p6/z, [x2, x6, lsl 2]
ld1w z30.s, p5/z, [x1, x6, lsl 2]
cmpne p15.s, p3/z, z0.s, #0
orr z0.d, z31.d, z0.d
and p6.b, p15/z, p6.b, p6.b
cmpeq p4.s, p7/z, z0.s, #0
ld1w z28.s, p6/z, [x1, x6, lsl 2]
ld1w z29.s, p4/z, [x3, x6, lsl 2]
sel z29.s, p15, z28.s, z29.s
sel z29.s, p5, z30.s, z29.s
st1w z29.s, p7, [x4, x6, lsl 2]
incw x6
whilelo p7.s, w6, w5
b.any .L3
where b is loaded twice, once with the mask a[i] != 0, and once a[i] == 0 &&
c[i] != 0.
clearly we don't need the second compare nor the second load. This loop can be
optimally handled as:
foo1:
cmp w5, 0
ble .L1
mov x6, 0
cntw x7
whilelo p7.s, wzr, w5
.p2align 5,,15
.L3:
ld1w z1.s, p7/z, [x0, x6, lsl 2]
ld1w z0.s, p7/z, [x2, x6, lsl 2]
orr z0.d, z1.d, z0.d
cmpne p6.s, p7/z, z0.s, #0
cmpeq p5.s, p7/z, z0.s, #0
ld1w z31.s, p6/z, [x1, x6, lsl 2]
ld1w z30.s, p5/z, [x3, x6, lsl 2]
sel z30.s, p6, z31.s, z30.s
st1w z30.s, p7, [x4, x6, lsl 2]
add x6, x6, x7
whilelo p7.s, w6, w5
b.any .L3
.L1:
ret
i.e. transform a ? b : (c ? b : d) into (a || c) ? b : d.
This transform is actually also beneficial for scalar, but that's not the case
when one of the conditions has to be inverted. i.e. cases 2 to 4 are
beneficial for vector masked operations but not scalar:
/* Convert a ? b : (c ? b : d) into (a || c) ? b : d. */
void foo1 (int *restrict a, int *restrict b, int *restrict c,
int *restrict d, int *restrict res, int n)
{
for (int i = 0; i < n; i++)
res[i] = a[i] ? b[i] : (c[i] ? b[i] : d[i]);
}
/* Convert a ? (c ? b : d) : b into (-a || c) ? b : d. */
void foo2 (int *restrict a, int *restrict b, int *restrict c,
int *restrict d, int *restrict res, int n)
{
for (int i = 0; i < n; i++)
res[i] = a[i] ? (c[i] ? b[i] : d[i]) : b[i];
}
/* Convert a ? (c ? d : b) : b into (-a || -c) ? b : d. */
void foo3 (int *restrict a, int *restrict b, int *restrict c,
int *restrict d, int *restrict res, int n)
{
for (int i = 0; i < n; i++)
res[i] = a[i] ? (c[i] ? d[i] : b[i]) : b[i];
}
/* Convert a ? b : (c ? d : b) into (a || -c) ? b : d. */
void foo4 (int *restrict a, int *restrict b, int *restrict c,
int *restrict d, int *restrict res, int n)
{
for (int i = 0; i < n; i++)
res[i] = a[i] ? b[i] : (c[i] ? d[i] : b[i]);
}
for instance case 3 is currently vectorized as:
foo3:
cmp w5, 0
ble .L10
mov x6, 0
whilelo p7.s, wzr, w5
ptrue p3.b, all
.L12:
ld1w z1.s, p7/z, [x0, x6, lsl 2]
cmpeq p5.s, p7/z, z1.s, #0
cmpne p6.s, p7/z, z1.s, #0
ld1w z29.s, p5/z, [x1, x6, lsl 2]
ld1w z0.s, p6/z, [x2, x6, lsl 2]
cmpne p15.s, p3/z, z0.s, #0
cmpeq p4.s, p6/z, z0.s, #0
and p5.b, p15/z, p6.b, p6.b
ld1w z30.s, p4/z, [x1, x6, lsl 2]
ld1w z31.s, p5/z, [x3, x6, lsl 2]
sel z30.s, p15, z31.s, z30.s
sel z30.s, p6, z30.s, z29.s
st1w z30.s, p7, [x4, x6, lsl 2]
incw x6
whilelo p7.s, w6, w5
b.any .L12
but can be
foo3:
cmp w5, 0
ble .L10
mov x6, 0
cntw x7
whilelo p6.s, wzr, w5
ptrue p5.b, all
.p2align 5,,15
.L12:
ld1w z29.s, p6/z, [x0, x6, lsl 2]
ld1w z28.s, p6/z, [x2, x6, lsl 2]
cmpeq p15.s, p5/z, z29.s, #0
cmpeq p14.s, p5/z, z28.s, #0
orr p15.b, p5/z, p15.b, p14.b
and p4.b, p6/z, p15.b, p15.b
bic p7.b, p5/z, p6.b, p15.b
ld1w z31.s, p4/z, [x1, x6, lsl 2]
ld1w z30.s, p7/z, [x3, x6, lsl 2]
sel z30.s, p4, z31.s, z30.s
st1w z30.s, p6, [x4, x6, lsl 2]
add x6, x6, x7
whilelo p6.s, w6, w5
b.any .L12
with those match.pd rules, however this likely needs to be done in if-convert
since not all the transforms are beneficial for scalar because the negation
adds an extra instruction before the branch.
I think this would have to be done before conditional load lowering, because
we'd have to transform:
_32 = _4 != 0;
iftmp.0_22 = .MASK_LOAD (_5, 32B, _32);
_61 = _4 == 0;
_7 = .MASK_LOAD (_6, 32B, _61);
_26 = _7 != 0;
_13 = _26 & _61;
iftmp.0_21 = .MASK_LOAD (_5, 32B, _13);
_66 = _4 | _7;
_36 = _66 == 0;
iftmp.0_19 = .MASK_LOAD (_9, 32B, _36);
_ifc__56 = _26 ? iftmp.0_21 : iftmp.0_19;
iftmp.0_12 = _32 ? iftmp.0_22 : _ifc__56;
into
_4 = *_3;
_6 = *_5;
_7 = _4 | _6;
_35 = _7 != 0;
iftmp.0_21 = .MASK_LOAD (_8, 32B, _35);
_51 = _7 == 0;
iftmp.0_19 = .MASK_LOAD (_9, 32B, _51);
iftmp.0_12 = _35 ? iftmp.0_21 : iftmp.0_19;
which would be quite hard.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [Bug tree-optimization/115629] Inefficient if-convert of masked conditionals
2024-06-25 5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
@ 2024-06-25 5:47 ` pinskia at gcc dot gnu.org
2024-06-25 9:41 ` rguenth at gcc dot gnu.org
` (8 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: pinskia at gcc dot gnu.org @ 2024-06-25 5:47 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629
Andrew Pinski <pinskia at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |pinskia at gcc dot gnu.org
Severity|normal |enhancement
Blocks| |53947
Referenced Bugs:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53947
[Bug 53947] [meta-bug] vectorizer missed-optimizations
^ permalink raw reply [flat|nested] 11+ messages in thread
* [Bug tree-optimization/115629] Inefficient if-convert of masked conditionals
2024-06-25 5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
2024-06-25 5:47 ` [Bug tree-optimization/115629] " pinskia at gcc dot gnu.org
@ 2024-06-25 9:41 ` rguenth at gcc dot gnu.org
2024-06-26 16:53 ` cvs-commit at gcc dot gnu.org
` (7 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-06-25 9:41 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Status|UNCONFIRMED |NEW
Last reconfirmed| |2024-06-25
Ever confirmed|0 |1
--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
I think the transform is profitable for scalar for all cases as well.
Now, if-conversion records BB predicates and uses those to predicate
stmts/PHIs, I'm not sure how to go optimize it on that level.
It's more a CFG transform of a if (A) x; else { if (B) x; else y; } to
if (A || B) x; else y; with the downside that in this case evaluating
'B' has side-effects (memory dereference which can trap).
Due to the trapping this needs to be somehow done in if-conversion
where you also need to determine that x == x' (both copies of x
evaluate to the same code). Some scalar GIMPLE opt could have
de-duplicated the BB with 'x' (tail-merging to the rescue? it considers
the cands but somehow doesn't merge - I think the PHI node isn't
supported).
^ permalink raw reply [flat|nested] 11+ messages in thread
* [Bug tree-optimization/115629] Inefficient if-convert of masked conditionals
2024-06-25 5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
2024-06-25 5:47 ` [Bug tree-optimization/115629] " pinskia at gcc dot gnu.org
2024-06-25 9:41 ` rguenth at gcc dot gnu.org
@ 2024-06-26 16:53 ` cvs-commit at gcc dot gnu.org
2024-06-26 16:55 ` rguenth at gcc dot gnu.org
` (6 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: cvs-commit at gcc dot gnu.org @ 2024-06-26 16:53 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629
--- Comment #2 from GCC Commits <cvs-commit at gcc dot gnu.org> ---
The master branch has been updated by Richard Biener <rguenth@gcc.gnu.org>:
https://gcc.gnu.org/g:629257bcb81434117f1e9c68479032563176dc0c
commit r15-1662-g629257bcb81434117f1e9c68479032563176dc0c
Author: Richard Biener <rguenther@suse.de>
Date: Tue Jun 25 14:04:31 2024 +0200
tree-optimization/115629 - missed tail merging
The following fixes a missed tail-merging observed for the testcase
in PR115629. The issue is that when deps_ok_for_redirect doesn't
compute both would be valid prevailing blocks it rejects the merge.
The following instead makes sure to record the working block as
prevailing. Also stmt comparison fails for indirect references
and is not handling memory references thoroughly, failing to unify
array indices and pointers indirected. The following attempts to
fix this.
PR tree-optimization/115629
* tree-ssa-tail-merge.cc (gimple_equal_p): Handle
memory references better.
(deps_ok_for_redirect): Handle the case not both blocks
are considered a valid prevailing block.
* gcc.dg/tree-ssa/tail-merge-1.c: New testcase.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [Bug tree-optimization/115629] Inefficient if-convert of masked conditionals
2024-06-25 5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
` (2 preceding siblings ...)
2024-06-26 16:53 ` cvs-commit at gcc dot gnu.org
@ 2024-06-26 16:55 ` rguenth at gcc dot gnu.org
2024-07-01 10:10 ` tnfchris at gcc dot gnu.org
` (5 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-06-26 16:55 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629
--- Comment #3 from Richard Biener <rguenth at gcc dot gnu.org> ---
So we now tail-merge the two b[i] loading blocks. Can you check SVE code-gen
with this? If that fixes the PR consider adding a SVE testcase.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [Bug tree-optimization/115629] Inefficient if-convert of masked conditionals
2024-06-25 5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
` (3 preceding siblings ...)
2024-06-26 16:55 ` rguenth at gcc dot gnu.org
@ 2024-07-01 10:10 ` tnfchris at gcc dot gnu.org
2024-07-01 12:27 ` rguenther at suse dot de
` (4 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: tnfchris at gcc dot gnu.org @ 2024-07-01 10:10 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629
--- Comment #4 from Tamar Christina <tnfchris at gcc dot gnu.org> ---
(In reply to Richard Biener from comment #3)
> So we now tail-merge the two b[i] loading blocks. Can you check SVE
> code-gen with this? If that fixes the PR consider adding a SVE testcase.
Thanks, the codegen is much better now, but shows some other missing mask
tracking in the vectorizer.
Atm we generate:
.L3:
ld1w z31.s, p6/z, [x0, x6, lsl 2] <-- load a
cmpeq p7.s, p6/z, z31.s, #0 <-- a == 0, !a
ld1w z0.s, p7/z, [x2, x6, lsl 2] <-- load c conditionally on !a
cmpeq p7.s, p7/z, z0.s, #0 <-- !a && !c
orr z0.d, z31.d, z0.d <-- a || c
ld1w z29.s, p7/z, [x3, x6, lsl 2] <--- load d where !a && !c
cmpne p5.s, p6/z, z0.s, #0 <--- (a || c) & loop_mask
and p7.b, p6/z, p7.b, p7.b <--- ((!a && !c) && (!a && !c)) &
loop_mask
ld1w z30.s, p5/z, [x1, x6, lsl 2] <-- load b conditionally on (a ||
c)
sel z30.s, p7, z29.s, z30.s <-- select (!a && !c, d, b)
st1w z30.s, p6, [x4, x6, lsl 2]
add x6, x6, x7
whilelo p6.s, w6, w5
b.any .L3
which corresponds to:
# loop_mask_63 = PHI <next_mask_95(10), max_mask_94(20)>
vect__4.10_64 = .MASK_LOAD (vectp_a.8_53, 32B, loop_mask_63);
mask__31.11_66 = vect__4.10_64 != { 0, ... };
mask__56.12_67 = ~mask__31.11_66;
vec_mask_and_70 = mask__56.12_67 & loop_mask_63;
vect__7.15_71 = .MASK_LOAD (vectp_c.13_68, 32B, vec_mask_and_70);
mask__22.16_73 = vect__7.15_71 == { 0, ... };
mask__34.17_75 = vec_mask_and_70 & mask__22.16_73;
vect_iftmp.20_78 = .MASK_LOAD (vectp_d.18_76, 32B, mask__34.17_75);
vect__61.21_79 = vect__4.10_64 | vect__7.15_71;
mask__35.22_81 = vect__61.21_79 != { 0, ... };
vec_mask_and_84 = mask__35.22_81 & loop_mask_63;
vect_iftmp.25_85 = .MASK_LOAD (vectp_b.23_82, 32B, vec_mask_and_84);
_86 = mask__34.17_75 & loop_mask_63;
vect_iftmp.26_87 = VEC_COND_EXPR <_86, vect_iftmp.20_78, vect_iftmp.25_85>;
.MASK_STORE (vectp_res.27_88, 32B, loop_mask_63, vect_iftmp.26_87);
it looks like what's missing is that the mask tracking doesn't know that other
masked operations naturally perform an AND when combined. We do some of this
in the backend but I feel like it may be better to do it in the vectorizer.
In this case, the second load is conditional on the first load mask, which
means it's already done an AND.
And crucially inverting it means you also inverted both conditions.
So there are some superfluous masking operations happening. But I guess that's
a separate bug. Shall I just add some tests here and close it and open a new
PR?
^ permalink raw reply [flat|nested] 11+ messages in thread
* [Bug tree-optimization/115629] Inefficient if-convert of masked conditionals
2024-06-25 5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
` (4 preceding siblings ...)
2024-07-01 10:10 ` tnfchris at gcc dot gnu.org
@ 2024-07-01 12:27 ` rguenther at suse dot de
2024-07-01 19:33 ` tnfchris at gcc dot gnu.org
` (3 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: rguenther at suse dot de @ 2024-07-01 12:27 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629
--- Comment #5 from rguenther at suse dot de <rguenther at suse dot de> ---
> Am 01.07.2024 um 12:10 schrieb tnfchris at gcc dot gnu.org <gcc-bugzilla@gcc.gnu.org>:
>
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629
>
> --- Comment #4 from Tamar Christina <tnfchris at gcc dot gnu.org> ---
> (In reply to Richard Biener from comment #3)
>> So we now tail-merge the two b[i] loading blocks. Can you check SVE
>> code-gen with this? If that fixes the PR consider adding a SVE testcase.
>
> Thanks, the codegen is much better now, but shows some other missing mask
> tracking in the vectorizer.
>
> Atm we generate:
>
> .L3:
> ld1w z31.s, p6/z, [x0, x6, lsl 2] <-- load a
> cmpeq p7.s, p6/z, z31.s, #0 <-- a == 0, !a
> ld1w z0.s, p7/z, [x2, x6, lsl 2] <-- load c conditionally on !a
> cmpeq p7.s, p7/z, z0.s, #0 <-- !a && !c
> orr z0.d, z31.d, z0.d <-- a || c
> ld1w z29.s, p7/z, [x3, x6, lsl 2] <--- load d where !a && !c
> cmpne p5.s, p6/z, z0.s, #0 <--- (a || c) & loop_mask
> and p7.b, p6/z, p7.b, p7.b <--- ((!a && !c) && (!a && !c)) &
> loop_mask
> ld1w z30.s, p5/z, [x1, x6, lsl 2] <-- load b conditionally on (a ||
> c)
> sel z30.s, p7, z29.s, z30.s <-- select (!a && !c, d, b)
> st1w z30.s, p6, [x4, x6, lsl 2]
> add x6, x6, x7
> whilelo p6.s, w6, w5
> b.any .L3
>
> which corresponds to:
>
> # loop_mask_63 = PHI <next_mask_95(10), max_mask_94(20)>
> vect__4.10_64 = .MASK_LOAD (vectp_a.8_53, 32B, loop_mask_63);
> mask__31.11_66 = vect__4.10_64 != { 0, ... };
> mask__56.12_67 = ~mask__31.11_66;
> vec_mask_and_70 = mask__56.12_67 & loop_mask_63;
> vect__7.15_71 = .MASK_LOAD (vectp_c.13_68, 32B, vec_mask_and_70);
> mask__22.16_73 = vect__7.15_71 == { 0, ... };
> mask__34.17_75 = vec_mask_and_70 & mask__22.16_73;
> vect_iftmp.20_78 = .MASK_LOAD (vectp_d.18_76, 32B, mask__34.17_75);
> vect__61.21_79 = vect__4.10_64 | vect__7.15_71;
> mask__35.22_81 = vect__61.21_79 != { 0, ... };
> vec_mask_and_84 = mask__35.22_81 & loop_mask_63;
> vect_iftmp.25_85 = .MASK_LOAD (vectp_b.23_82, 32B, vec_mask_and_84);
> _86 = mask__34.17_75 & loop_mask_63;
> vect_iftmp.26_87 = VEC_COND_EXPR <_86, vect_iftmp.20_78, vect_iftmp.25_85>;
> .MASK_STORE (vectp_res.27_88, 32B, loop_mask_63, vect_iftmp.26_87);
>
> it looks like what's missing is that the mask tracking doesn't know that other
> masked operations naturally perform an AND when combined. We do some of this
> in the backend but I feel like it may be better to do it in the vectorizer.
>
> In this case, the second load is conditional on the first load mask, which
> means it's already done an AND.
> And crucially inverting it means you also inverted both conditions.
>
> So there are some superfluous masking operations happening. But I guess that's
> a separate bug. Shall I just add some tests here and close it and open a new
> PR?
Not sure if that helps - do we fully understand this is a separate issue and
not related to how we if-convert?
Adding a testcase is nevertheless OK of course.
> --
> You are receiving this mail because:
> You are on the CC list for the bug.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [Bug tree-optimization/115629] Inefficient if-convert of masked conditionals
2024-06-25 5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
` (5 preceding siblings ...)
2024-07-01 12:27 ` rguenther at suse dot de
@ 2024-07-01 19:33 ` tnfchris at gcc dot gnu.org
2024-07-02 7:03 ` rguenth at gcc dot gnu.org
` (2 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: tnfchris at gcc dot gnu.org @ 2024-07-01 19:33 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629
--- Comment #6 from Tamar Christina <tnfchris at gcc dot gnu.org> ---
(In reply to rguenther@suse.de from comment #5)
> > In this case, the second load is conditional on the first load mask, which
> > means it's already done an AND.
> > And crucially inverting it means you also inverted both conditions.
> >
> > So there are some superflous masking operations happening. But I guess that's
> > a separate bug. Shall I just add some tests here and close it and open a new
> > PR?
>
> Not sure if that helps - do we fully understand this is a separate issue and
> not related to how we if-convert?
>
if-convert looks ok to me:
<bb 3> [local count: 955630226]:
# i_28 = PHI <i_25(10), 0(21)>
_1 = (long unsigned int) i_28;
_2 = _1 * 4;
_3 = a_16(D) + _2;
_4 = *_3;
_31 = _4 != 0;
_55 = _54 + _2;
_6 = (int *) _55;
_56 = ~_31;
_7 = .MASK_LOAD (_6, 32B, _56);
_22 = _7 == 0;
_34 = _22 & _56;
_58 = _57 + _2;
_9 = (int *) _58;
iftmp.0_19 = .MASK_LOAD (_9, 32B, _34);
_61 = _4 | _7;
_35 = _61 != 0;
_60 = _59 + _2;
_8 = (int *) _60;
iftmp.0_21 = .MASK_LOAD (_8, 32B, _35);
iftmp.0_12 = _34 ? iftmp.0_19 : iftmp.0_21;
_10 = res_23(D) + _2;
*_10 = iftmp.0_12;
i_25 = i_28 + 1;
if (n_15(D) > i_25)
goto <bb 10>; [89.00%]
else
goto <bb 11>; [11.00%]
I think what's missing here is that
_7 = .MASK_LOAD (_6, 32B, _56);
_22 = _7 == 0;
_34 = _22 & _56;
iftmp.0_19 = .MASK_LOAD (_9, 32B, _34);
in itself produces an AND. namely (_7 && _34) && _56 where _56 is the loop
mask.
my (probably poor understanding) is that the mask tracking in the vectorizer
attempts to prevent to keep masks and their inverse live at the same time.
but that this code in this case doesn't track masks introduced by nested
MASK_LOADS. at least, that's my naive interpretation.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [Bug tree-optimization/115629] Inefficient if-convert of masked conditionals
2024-06-25 5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
` (6 preceding siblings ...)
2024-07-01 19:33 ` tnfchris at gcc dot gnu.org
@ 2024-07-02 7:03 ` rguenth at gcc dot gnu.org
2024-07-02 19:33 ` rsandifo at gcc dot gnu.org
2024-07-03 7:27 ` rguenth at gcc dot gnu.org
9 siblings, 0 replies; 11+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-07-02 7:03 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629
--- Comment #7 from Richard Biener <rguenth at gcc dot gnu.org> ---
(In reply to Tamar Christina from comment #6)
> (In reply to rguenther@suse.de from comment #5)
> > > In this case, the second load is conditional on the first load mask, which
> > > means it's already done an AND.
> > > And crucially inverting it means you also inverted both conditions.
> > >
> > > So there are some superflous masking operations happening. But I guess that's
> > > a separate bug. Shall I just add some tests here and close it and open a new
> > > PR?
> >
> > Not sure if that helps - do we fully understand this is a separate issue and
> > not related to how we if-convert?
> >
>
> if-convert looks ok to me:
>
> <bb 3> [local count: 955630226]:
> # i_28 = PHI <i_25(10), 0(21)>
> _1 = (long unsigned int) i_28;
> _2 = _1 * 4;
> _3 = a_16(D) + _2;
> _4 = *_3;
> _31 = _4 != 0;
> _55 = _54 + _2;
> _6 = (int *) _55;
> _56 = ~_31;
> _7 = .MASK_LOAD (_6, 32B, _56);
> _22 = _7 == 0;
> _34 = _22 & _56;
> _58 = _57 + _2;
> _9 = (int *) _58;
> iftmp.0_19 = .MASK_LOAD (_9, 32B, _34);
> _61 = _4 | _7;
> _35 = _61 != 0;
> _60 = _59 + _2;
> _8 = (int *) _60;
> iftmp.0_21 = .MASK_LOAD (_8, 32B, _35);
> iftmp.0_12 = _34 ? iftmp.0_19 : iftmp.0_21;
> _10 = res_23(D) + _2;
> *_10 = iftmp.0_12;
> i_25 = i_28 + 1;
> if (n_15(D) > i_25)
> goto <bb 10>; [89.00%]
> else
> goto <bb 11>; [11.00%]
>
> I think what's missing here is that
>
> _7 = .MASK_LOAD (_6, 32B, _56);
> _22 = _7 == 0;
> _34 = _22 & _56;
> iftmp.0_19 = .MASK_LOAD (_9, 32B, _34);
>
> in itself produces an AND. namely (_7 && _34) && _56 where _56 is the loop
> mask.
>
> my (probably poor understanding) is that the mask tracking in the vectorizer
> attempts to prevent to keep masks and their inverse live at the same time.
>
> but that this code in this case doesn't track masks introduced by nested
> MASK_LOADS. at least, that's my naive interpretation.
Hmm, without carrying out the math it seems like for
iftmp.0_19 = .MASK_LOAD (_9, 32B, _34);
..
iftmp.0_21 = .MASK_LOAD (_8, 32B, _35);
iftmp.0_12 = _34 ? iftmp.0_19 : iftmp.0_21;
the _34 conditional and the _34 and _35 masks would make it so this
could be written with sth like
iftmp.0_12 = iftmp.0_19 | iftmp.0_21;
but a detail might have escaped me ;) Is that what you're trying to say
as well? That of course still leaves maybe partly redundant mask
computations. In the ifcvt dump it doesn't help that != 0 is sometimes
elided and sometimes not ...
IIRC the vectorizer at some point tries to optimize loop mask and mask
mask? Maybe we need to teach that mechanism to track how mask masks are
combined from other masks via & and | to improve the masks used?
This could be also a separate mask optimization phase on the SLP
representation. While it doesn't represent loop masking at least
it does for .MASK_LOAD and .MASK_STORE (and .COND_*).
^ permalink raw reply [flat|nested] 11+ messages in thread
* [Bug tree-optimization/115629] Inefficient if-convert of masked conditionals
2024-06-25 5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
` (7 preceding siblings ...)
2024-07-02 7:03 ` rguenth at gcc dot gnu.org
@ 2024-07-02 19:33 ` rsandifo at gcc dot gnu.org
2024-07-03 7:27 ` rguenth at gcc dot gnu.org
9 siblings, 0 replies; 11+ messages in thread
From: rsandifo at gcc dot gnu.org @ 2024-07-02 19:33 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629
Richard Sandiford <rsandifo at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |rsandifo at gcc dot gnu.org
--- Comment #8 from Richard Sandiford <rsandifo at gcc dot gnu.org> ---
Perhaps I'm missing the point, but I think one of the issues here is that we
(still) don't model that MASK_LOAD sets inactive elements to zero. Inactive
elements are currently undefined instead. (I think Robin mentioned that
assuming zero is problematic for RVV, so we might need an explicit MASK_LOAD
argument for inactive elements, like for COND_ADD etc.)
So quoting the IL in comment 4:
# loop_mask_63 = PHI <next_mask_95(10), max_mask_94(20)>
vect__4.10_64 = .MASK_LOAD (vectp_a.8_53, 32B, loop_mask_63);
mask__31.11_66 = vect__4.10_64 != { 0, ... };
mask__56.12_67 = ~mask__31.11_66;
vec_mask_and_70 = mask__56.12_67 & loop_mask_63;
vect__7.15_71 = .MASK_LOAD (vectp_c.13_68, 32B, vec_mask_and_70);
mask__22.16_73 = vect__7.15_71 == { 0, ... };
mask__34.17_75 = vec_mask_and_70 & mask__22.16_73;
I think this and...
vect_iftmp.20_78 = .MASK_LOAD (vectp_d.18_76, 32B, mask__34.17_75);
vect__61.21_79 = vect__4.10_64 | vect__7.15_71;
mask__35.22_81 = vect__61.21_79 != { 0, ... };
vec_mask_and_84 = mask__35.22_81 & loop_mask_63;
...this have to be kept until we model inactive elements.
vect_iftmp.25_85 = .MASK_LOAD (vectp_b.23_82, 32B, vec_mask_and_84);
_86 = mask__34.17_75 & loop_mask_63;
This one is really curious though :) Why does the code think that the loop
mask is needed here? Does the code think the mask is needed for correctness,
or is the scalar_cond_masked_set optimisation misfiring?
vect_iftmp.26_87 = VEC_COND_EXPR <_86, vect_iftmp.20_78, vect_iftmp.25_85>;
.MASK_STORE (vectp_res.27_88, 32B, loop_mask_63, vect_iftmp.26_87);
^ permalink raw reply [flat|nested] 11+ messages in thread
* [Bug tree-optimization/115629] Inefficient if-convert of masked conditionals
2024-06-25 5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
` (8 preceding siblings ...)
2024-07-02 19:33 ` rsandifo at gcc dot gnu.org
@ 2024-07-03 7:27 ` rguenth at gcc dot gnu.org
9 siblings, 0 replies; 11+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-07-03 7:27 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629
--- Comment #9 from Richard Biener <rguenth at gcc dot gnu.org> ---
I think we sort-of agreed to have .MASK_LOAD to have inactive lanes zeroed but
we never got around to formalizing that. Note AVX512 supports zero-masking for
.MASK_STORE IIRC, not just merge. Same for .MASK_LOAD where it supports
merging as well. How would we ask the target whether it can do 'merge'?
zero and undefined could be done via predicates but merge?
^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2024-07-03 7:27 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-06-25 5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
2024-06-25 5:47 ` [Bug tree-optimization/115629] " pinskia at gcc dot gnu.org
2024-06-25 9:41 ` rguenth at gcc dot gnu.org
2024-06-26 16:53 ` cvs-commit at gcc dot gnu.org
2024-06-26 16:55 ` rguenth at gcc dot gnu.org
2024-07-01 10:10 ` tnfchris at gcc dot gnu.org
2024-07-01 12:27 ` rguenther at suse dot de
2024-07-01 19:33 ` tnfchris at gcc dot gnu.org
2024-07-02 7:03 ` rguenth at gcc dot gnu.org
2024-07-02 19:33 ` rsandifo at gcc dot gnu.org
2024-07-03 7:27 ` rguenth at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).