public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals
@ 2024-06-25 5:44 tnfchris at gcc dot gnu.org
2024-06-25 5:47 ` [Bug tree-optimization/115629] " pinskia at gcc dot gnu.org
` (3 more replies)
0 siblings, 4 replies; 5+ messages in thread
From: tnfchris at gcc dot gnu.org @ 2024-06-25 5:44 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629
Bug ID: 115629
Summary: Inefficient if-convert of masked conditionals
Product: gcc
Version: 15.0
Status: UNCONFIRMED
Keywords: missed-optimization
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: tnfchris at gcc dot gnu.org
Target Milestone: ---
With cases such as:
void foo1 (int *restrict a, int *restrict b, int *restrict c,
int *restrict d, int *restrict res, int n)
{
for (int i = 0; i < n; i++)
res[i] = a[i] ? b[i] : (c[i] ? b[i] : d[i]);
}
we generate:
foo1:
cmp w5, 0
ble .L1
mov x6, 0
whilelo p7.s, wzr, w5
ptrue p3.b, all
.L3:
ld1w z31.s, p7/z, [x0, x6, lsl 2]
cmpeq p6.s, p7/z, z31.s, #0
cmpne p5.s, p7/z, z31.s, #0
ld1w z0.s, p6/z, [x2, x6, lsl 2]
ld1w z30.s, p5/z, [x1, x6, lsl 2]
cmpne p15.s, p3/z, z0.s, #0
orr z0.d, z31.d, z0.d
and p6.b, p15/z, p6.b, p6.b
cmpeq p4.s, p7/z, z0.s, #0
ld1w z28.s, p6/z, [x1, x6, lsl 2]
ld1w z29.s, p4/z, [x3, x6, lsl 2]
sel z29.s, p15, z28.s, z29.s
sel z29.s, p5, z30.s, z29.s
st1w z29.s, p7, [x4, x6, lsl 2]
incw x6
whilelo p7.s, w6, w5
b.any .L3
where b is loaded twice, once with the mask a[i] != 0, and once a[i] == 0 &&
c[i] != 0.
clearly we don't need the second compare nor the second load. This loop can be
optimally handled as:
foo1:
cmp w5, 0
ble .L1
mov x6, 0
cntw x7
whilelo p7.s, wzr, w5
.p2align 5,,15
.L3:
ld1w z1.s, p7/z, [x0, x6, lsl 2]
ld1w z0.s, p7/z, [x2, x6, lsl 2]
orr z0.d, z1.d, z0.d
cmpne p6.s, p7/z, z0.s, #0
cmpeq p5.s, p7/z, z0.s, #0
ld1w z31.s, p6/z, [x1, x6, lsl 2]
ld1w z30.s, p5/z, [x3, x6, lsl 2]
sel z30.s, p6, z31.s, z30.s
st1w z30.s, p7, [x4, x6, lsl 2]
add x6, x6, x7
whilelo p7.s, w6, w5
b.any .L3
.L1:
ret
i.e. transform a ? b : (c ? b : d) into (a || c) ? b : d.
This transform is actually also beneficial for scalar, but that's not the case
when one of the conditions have to be inverted. i.e. cases 2 to 4 are
beneficial for vector masked operations but not scalar:
/* Convert a ? b : (c ? b : d) into (a || c) ? b : d. */
void foo1 (int *restrict a, int *restrict b, int *restrict c,
int *restrict d, int *restrict res, int n)
{
for (int i = 0; i < n; i++)
res[i] = a[i] ? b[i] : (c[i] ? b[i] : d[i]);
}
/* Convert a ? (c ? b : d) : b into (-a || c) ? b : d. */
void foo2 (int *restrict a, int *restrict b, int *restrict c,
int *restrict d, int *restrict res, int n)
{
for (int i = 0; i < n; i++)
res[i] = a[i] ? (c[i] ? b[i] : d[i]) : b[i];
}
/* Convert a ? (c ? d :b) : b into (-a || -c) ? b : d. */
void foo3 (int *restrict a, int *restrict b, int *restrict c,
int *restrict d, int *restrict res, int n)
{
for (int i = 0; i < n; i++)
res[i] = a[i] ? (c[i] ? d[i] : b[i]) : b[i];
}
/* Convert a ? b : (c ? d : b) into (a || -c) ? b : d. */
void foo4 (int *restrict a, int *restrict b, int *restrict c,
int *restrict d, int *restrict res, int n)
{
for (int i = 0; i < n; i++)
res[i] = a[i] ? b[i] : (c[i] ? d[i] : b[i]);
}
for instance case 3 is currently vectorized as:
foo3:
cmp w5, 0
ble .L10
mov x6, 0
whilelo p7.s, wzr, w5
ptrue p3.b, all
.L12:
ld1w z1.s, p7/z, [x0, x6, lsl 2]
cmpeq p5.s, p7/z, z1.s, #0
cmpne p6.s, p7/z, z1.s, #0
ld1w z29.s, p5/z, [x1, x6, lsl 2]
ld1w z0.s, p6/z, [x2, x6, lsl 2]
cmpne p15.s, p3/z, z0.s, #0
cmpeq p4.s, p6/z, z0.s, #0
and p5.b, p15/z, p6.b, p6.b
ld1w z30.s, p4/z, [x1, x6, lsl 2]
ld1w z31.s, p5/z, [x3, x6, lsl 2]
sel z30.s, p15, z31.s, z30.s
sel z30.s, p6, z30.s, z29.s
st1w z30.s, p7, [x4, x6, lsl 2]
incw x6
whilelo p7.s, w6, w5
b.any .L12
but can be
foo3:
cmp w5, 0
ble .L10
mov x6, 0
cntw x7
whilelo p6.s, wzr, w5
ptrue p5.b, all
.p2align 5,,15
.L12:
ld1w z29.s, p6/z, [x0, x6, lsl 2]
ld1w z28.s, p6/z, [x2, x6, lsl 2]
cmpeq p15.s, p5/z, z29.s, #0
cmpeq p14.s, p5/z, z28.s, #0
orr p15.b, p5/z, p15.b, p14.b
and p4.b, p6/z, p15.b, p15.b
bic p7.b, p5/z, p6.b, p15.b
ld1w z31.s, p4/z, [x1, x6, lsl 2]
ld1w z30.s, p7/z, [x3, x6, lsl 2]
sel z30.s, p4, z31.s, z30.s
st1w z30.s, p6, [x4, x6, lsl 2]
add x6, x6, x7
whilelo p6.s, w6, w5
b.any .L12
with those match.pd rule, however this likely needs to be done in if-convert
since not all the transforms are beneficial for scalar because the negation
adds an extra instruction before the branch.
I think this would have to be done before conditional load lowering, because
we'd have to transform:
_32 = _4 != 0;
iftmp.0_22 = .MASK_LOAD (_5, 32B, _32);
_61 = _4 == 0;
_7 = .MASK_LOAD (_6, 32B, _61);
_26 = _7 != 0;
_13 = _26 & _61;
iftmp.0_21 = .MASK_LOAD (_5, 32B, _13);
_66 = _4 | _7;
_36 = _66 == 0;
iftmp.0_19 = .MASK_LOAD (_9, 32B, _36);
_ifc__56 = _26 ? iftmp.0_21 : iftmp.0_19;
iftmp.0_12 = _32 ? iftmp.0_22 : _ifc__56;
into
_4 = *_3;
_6 = *_5;
_7 = _4 | _6;
_35 = _7 != 0;
iftmp.0_21 = .MASK_LOAD (_8, 32B, _35);
_51 = _7 == 0;
iftmp.0_19 = .MASK_LOAD (_9, 32B, _51);
iftmp.0_12 = _35 ? iftmp.0_21 : iftmp.0_19;
which would be quite hard.
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Bug tree-optimization/115629] Inefficient if-convert of masked conditionals
2024-06-25 5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
@ 2024-06-25 5:47 ` pinskia at gcc dot gnu.org
2024-06-25 9:41 ` rguenth at gcc dot gnu.org
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: pinskia at gcc dot gnu.org @ 2024-06-25 5:47 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629
Andrew Pinski <pinskia at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |pinskia at gcc dot gnu.org
Severity|normal |enhancement
Blocks| |53947
Referenced Bugs:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53947
[Bug 53947] [meta-bug] vectorizer missed-optimizations
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Bug tree-optimization/115629] Inefficient if-convert of masked conditionals
2024-06-25 5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
2024-06-25 5:47 ` [Bug tree-optimization/115629] " pinskia at gcc dot gnu.org
@ 2024-06-25 9:41 ` rguenth at gcc dot gnu.org
2024-06-26 16:53 ` cvs-commit at gcc dot gnu.org
2024-06-26 16:55 ` rguenth at gcc dot gnu.org
3 siblings, 0 replies; 5+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-06-25 9:41 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Status|UNCONFIRMED |NEW
Last reconfirmed| |2024-06-25
Ever confirmed|0 |1
--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
I think the transform is profitable for scalar for all cases as well.
Now, if-conversion records BB predicates and uses those to predicate
stmts/PHIs, I'm not sure how to go optimize it on that level.
It's more a CFG transform of a if (A) x; else { if (B) x; else y; } to
if (A || B) x; else y; with the downside that in this case evaluating
'B' has side-effects (memory dereference which can trap).
Due to the trapping this needs to be somehow done in if-conversion
where you also need to determine that x == x' (both copies of x
evaluate to the same code). Some scalar GIMPLE opt could have
de-duplicated the BB with 'x' (tail-merging to the rescue? it considers
the cands but somehow doesn't merge - I think the PHI node isn't
supported).
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Bug tree-optimization/115629] Inefficient if-convert of masked conditionals
2024-06-25 5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
2024-06-25 5:47 ` [Bug tree-optimization/115629] " pinskia at gcc dot gnu.org
2024-06-25 9:41 ` rguenth at gcc dot gnu.org
@ 2024-06-26 16:53 ` cvs-commit at gcc dot gnu.org
2024-06-26 16:55 ` rguenth at gcc dot gnu.org
3 siblings, 0 replies; 5+ messages in thread
From: cvs-commit at gcc dot gnu.org @ 2024-06-26 16:53 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629
--- Comment #2 from GCC Commits <cvs-commit at gcc dot gnu.org> ---
The master branch has been updated by Richard Biener <rguenth@gcc.gnu.org>:
https://gcc.gnu.org/g:629257bcb81434117f1e9c68479032563176dc0c
commit r15-1662-g629257bcb81434117f1e9c68479032563176dc0c
Author: Richard Biener <rguenther@suse.de>
Date: Tue Jun 25 14:04:31 2024 +0200
tree-optimization/115629 - missed tail merging
The following fixes a missed tail-merging observed for the testcase
in PR115629. The issue is that when deps_ok_for_redirect doesn't
compute both would be valid prevailing blocks it rejects the merge.
The following instead makes sure to record the working block as
prevailing. Also stmt comparison fails for indirect references
and is not handling memory references thoroughly, failing to unify
array indices and pointers indirected. The following attempts to
fix this.
PR tree-optimization/115629
* tree-ssa-tail-merge.cc (gimple_equal_p): Handle
memory references better.
(deps_ok_for_redirect): Handle the case not both blocks
are considered a valid prevailing block.
* gcc.dg/tree-ssa/tail-merge-1.c: New testcase.
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Bug tree-optimization/115629] Inefficient if-convert of masked conditionals
2024-06-25 5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
` (2 preceding siblings ...)
2024-06-26 16:53 ` cvs-commit at gcc dot gnu.org
@ 2024-06-26 16:55 ` rguenth at gcc dot gnu.org
3 siblings, 0 replies; 5+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-06-26 16:55 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629
--- Comment #3 from Richard Biener <rguenth at gcc dot gnu.org> ---
So we now tail-merge the two b[i] loading blocks. Can you check SVE code-gen
with this? If that fixes the PR consider adding a SVE testcase.
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2024-06-26 16:55 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-06-25 5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
2024-06-25 5:47 ` [Bug tree-optimization/115629] " pinskia at gcc dot gnu.org
2024-06-25 9:41 ` rguenth at gcc dot gnu.org
2024-06-26 16:53 ` cvs-commit at gcc dot gnu.org
2024-06-26 16:55 ` rguenth at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).