public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug tree-optimization/94269] New: widening_mul should consider block frequency
@ 2020-03-23 9:47 felix.yang at huawei dot com
2020-03-23 14:14 ` [Bug tree-optimization/94269] " rguenth at gcc dot gnu.org
2020-03-26 7:36 ` cvs-commit at gcc dot gnu.org
0 siblings, 2 replies; 3+ messages in thread
From: felix.yang at huawei dot com @ 2020-03-23 9:47 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94269
Bug ID: 94269
Summary: widening_mul should consider block frequency
Product: gcc
Version: 10.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: felix.yang at huawei dot com
Target Milestone: ---
Test case:
float
calc(long n, float *x, int inc_x,
float *y, int inc_y)
{
float dot = 0.0;
int ix = 0, iy = 0;
if (n < 0) {
return dot;
}
int i = 0;
while (i < n) {
dot += y[iy] * x[ix];
ix += inc_x;
iy += inc_y;
i++;
}
return dot;
}
Command line: aarch64-linux-gnu-gcc -S -O2 -fopt-info -ftree-loop-vectorize
-funsafe-math-optimizations -march=armv8.2-a+sve -msve-vector-bits=256 calc.c
calc:
.LFB0:
.cfi_startproc
cmp x0, 0
ble .L4
mov w7, w0
mov x5, x3
mov w6, 32
mov x3, x1
mov x1, 0
index z4.s, #0, w4
index z3.s, #0, w2
whilelo p0.s, wzr, w0
mov z0.s, #0
.p2align 3,,7
.L3:
ld1w z1.s, p0/z, [x5, z4.s, sxtw 2]
ld1w z2.s, p0/z, [x3, z3.s, sxtw 2]
add x1, x1, 8
fmla z0.s, p0/m, z1.s, z2.s
smaddl x5, w4, w6, x5 <==============
whilelo p0.s, w1, w7
smaddl x3, w2, w6, x3 <==============
b.any .L3
ptrue p0.b, vl32
faddv s0, p0, z0.s
ret
Command line: aarch64-linux-gnu-gcc -S -O2 -fopt-info -ftree-loop-vectorize
-funsafe-math-optimizations -march=armv8.2-a+sve -msve-vector-bits=256 calc.c
-fdisable-tree-widening_mul
calc:
.LFB0:
.cfi_startproc
cmp x0, 0
ble .L4
sbfiz x8, x4, 5, 32
sbfiz x7, x2, 5, 32
mov w6, w0
mov x5, x3
mov x3, x1
mov x1, 0
index z4.s, #0, w4
index z3.s, #0, w2
whilelo p0.s, wzr, w0
mov z0.s, #0
ptrue p1.b, vl32
.p2align 3,,7
.L3:
ld1w z1.s, p0/z, [x5, z4.s, sxtw 2]
ld1w z2.s, p0/z, [x3, z3.s, sxtw 2]
add x1, x1, 8
fmul z1.s, z1.s, z2.s
add x5, x5, x8 <=============
fadd z0.s, p0/m, z0.s, z1.s
add x3, x3, x7 <=============
whilelo p0.s, w1, w6
b.any .L3
faddv s0, p1, z0.s
ret
widening_mul phase moves the two multiply instructions from outside the loop to
inside the loop, merging with the two add instructions separately. This
increases the cost of the loop.
I think widening_mul should consider block frequency when doing such a
combination.
I mean something like:
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index 54ba035..4439452 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -2721,7 +2721,10 @@ convert_plusminus_to_widen (gimple_stmt_iterator *gsi,
gimple *stmt,
{
if (!has_single_use (rhs1)
|| !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
- &type2, &mult_rhs2))
+ &type2, &mult_rhs2)
+ || (gimple_bb (rhs1_stmt) != gimple_bb (stmt)
+ && gimple_bb (rhs1_stmt)->count.to_frequency(cfun)
+ < gimple_bb (stmt)->count.to_frequency(cfun)))
return false;
add_rhs = rhs2;
conv_stmt = conv1_stmt;
@@ -2730,7 +2733,10 @@ convert_plusminus_to_widen (gimple_stmt_iterator *gsi,
gimple *stmt,
{
if (!has_single_use (rhs2)
|| !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
- &type2, &mult_rhs2))
+ &type2, &mult_rhs2)
+ || (gimple_bb (rhs2_stmt) != gimple_bb (stmt)
+ && gimple_bb (rhs2_stmt)->count.to_frequency(cfun)
+ < gimple_bb (stmt)->count.to_frequency(cfun)))
return false;
add_rhs = rhs1;
conv_stmt = conv2_stmt;
^ permalink raw reply [flat|nested] 3+ messages in thread
* [Bug tree-optimization/94269] widening_mul should consider block frequency
2020-03-23 9:47 [Bug tree-optimization/94269] New: widening_mul should consider block frequency felix.yang at huawei dot com
@ 2020-03-23 14:14 ` rguenth at gcc dot gnu.org
2020-03-26 7:36 ` cvs-commit at gcc dot gnu.org
1 sibling, 0 replies; 3+ messages in thread
From: rguenth at gcc dot gnu.org @ 2020-03-23 14:14 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94269
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Last reconfirmed| |2020-03-23
Status|UNCONFIRMED |NEW
Ever confirmed|0 |1
--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
Or even simpler like FMA detection which restricts ops to be defined in the
same basic-block:
/* For now restrict this operations to single basic blocks. In theory
we would want to support sinking the multiplication in
m = a*b;
if ()
ma = m + c;
else
d = m;
to form a fma in the then block and sink the multiplication to the
else block. */
if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
return false;
^ permalink raw reply [flat|nested] 3+ messages in thread
* [Bug tree-optimization/94269] widening_mul should consider block frequency
2020-03-23 9:47 [Bug tree-optimization/94269] New: widening_mul should consider block frequency felix.yang at huawei dot com
2020-03-23 14:14 ` [Bug tree-optimization/94269] " rguenth at gcc dot gnu.org
@ 2020-03-26 7:36 ` cvs-commit at gcc dot gnu.org
1 sibling, 0 replies; 3+ messages in thread
From: cvs-commit at gcc dot gnu.org @ 2020-03-26 7:36 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94269
--- Comment #2 from CVS Commits <cvs-commit at gcc dot gnu.org> ---
The master branch has been updated by Richard Biener <rguenth@gcc.gnu.org>:
https://gcc.gnu.org/g:d21dff5b4fee51ae432143065bededfc763dc344
commit r10-7391-gd21dff5b4fee51ae432143065bededfc763dc344
Author: Richard Biener <rguenther@suse.de>
Date: Thu Mar 26 08:33:57 2020 +0100
widening_mul: restrict ops to be defined in the same basic-block when
convert plusminus to widen
In the testcase for PR94269, widening_mul moves two multiply
instructions from outside the loop to inside
the loop, merging with two add instructions separately. This
increases the cost of the loop. Like FMA detection
in the same pass, simply restrict ops to be defined in the same
basic-block to avoid possibly moving multiply
to a different block with a higher execution frequency.
2020-03-26 Felix Yang <felix.yang@huawei.com>
PR tree-optimization/94269
* tree-ssa-math-opts.c (convert_plusminus_to_widen): Restrict
this
operation to single basic block.
* gcc.dg/pr94269.c: New test.
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2020-03-26 7:36 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-03-23 9:47 [Bug tree-optimization/94269] New: widening_mul should consider block frequency felix.yang at huawei dot com
2020-03-23 14:14 ` [Bug tree-optimization/94269] " rguenth at gcc dot gnu.org
2020-03-26 7:36 ` cvs-commit at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).