public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug rtl-optimization/115186] New: Suboptimal codes generated by rtl-expand for divmod
@ 2024-05-22 10:20 dizhao at os dot amperecomputing.com
2024-05-22 10:28 ` [Bug rtl-optimization/115186] " dizhao at os dot amperecomputing.com
0 siblings, 1 reply; 2+ messages in thread
From: dizhao at os dot amperecomputing.com @ 2024-05-22 10:20 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115186
Bug ID: 115186
Summary: Suboptimal codes generated by rtl-expand for divmod
Product: gcc
Version: 15.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: rtl-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: dizhao at os dot amperecomputing.com
Target Milestone: ---
For the code below:
typedef unsigned short int uint16_t;
typedef unsigned char uint8_t;
typedef long unsigned int size_t;
uint16_t fletcher16(const uint8_t *data, const size_t len)
{
register uint16_t sum1 = 0, sum2 = 0;
register size_t i;
for (i = 0; i < len; i++) {
sum1 = (sum1 + data[i]) % 255;
sum2 = (sum2 + sum1) % 255;
}
return ((uint16_t)(sum2 << 8)) | sum1;
}
, when compiled with "-O3 -mcpu=ampere1a", shift+add is used instead of div.
The dump file at 266r.expand shows:
20: NOTE_INSN_BASIC_BLOCK 4
21: debug i => r114:DI-r116:DI
22: debug sum2 => r111:SI#0
23: debug sum1 => r110:SI#0
24: debug begin stmt marker
25: r118:SI=zero_extend([r114:DI])
26: r119:SI=r118:SI+r110:SI
27: r120:DI=zero_extend(r119:SI)
28: r121:DI=r120:DI
29: r122:DI=r121:DI<<0x8
30: r123:DI=r122:DI+r120:DI
REG_EQUAL r120:DI*0x101
31: r124:DI=r123:DI<<0x10
32: r125:DI=r123:DI+r124:DI
REG_EQUAL r120:DI*0x1010101
33: r126:DI=r125:DI<<0x7
34: r127:DI=r126:DI+r120:DI
REG_EQUAL r120:DI*0x80808081
35: r128:DI=r127:DI 0>>0x20
36: r104:SI=r128:DI#0 0>>0x7
REG_EQUAL udiv(r119:SI,0xff)
...
However, using a mult instruction is better, as in the following (this result can be produced with
"-mtune=neoverse-n1"):
20: NOTE_INSN_BASIC_BLOCK 4
21: debug i => r114:DI-r116:DI
22: debug sum2 => r111:SI#0
23: debug sum1 => r110:SI#0
24: debug begin stmt marker
25: r118:SI=zero_extend([r114:DI])
26: r119:SI=r118:SI+r110:SI
27: r121:SI=0xffffffff80808081
28: r120:DI=zero_extend(r119:SI)*zero_extend(r121:SI)
29: r122:DI=r120:DI 0>>0x20
30: r104:SI=r122:DI#0 0>>0x7
REG_EQUAL udiv(r119:SI,0xff)
...
The problem is, in expmed.cc:expmed_mult_highpart, when the result here is 0:
/* See whether the specialized multiplication optabs are
cheaper than the shift/add version. */
tem = expmed_mult_highpart_optab (mode, op0, narrow_op1, target,
unsignedp,
, the "tem" produced by the code that follows (shift/add version) can be more
expensive than the result of:
return expmed_mult_highpart_optab (mode, op0, op1, target, unsignedp,
max_cost);
For -mcpu=ampere1a, the estimated cost of the former is 36, the cost of the
latter is 28.
^ permalink raw reply [flat|nested] 2+ messages in thread
* [Bug rtl-optimization/115186] Suboptimal codes generated by rtl-expand for divmod
2024-05-22 10:20 [Bug rtl-optimization/115186] New: Suboptimal codes generated by rtl-expand for divmod dizhao at os dot amperecomputing.com
@ 2024-05-22 10:28 ` dizhao at os dot amperecomputing.com
0 siblings, 0 replies; 2+ messages in thread
From: dizhao at os dot amperecomputing.com @ 2024-05-22 10:28 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115186
--- Comment #1 from Di Zhao <dizhao at os dot amperecomputing.com> ---
The rough patch below fixes this case (not sure if this is the right way):
diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index 50d22762cae..bf42a0ff5ca 100644
--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -3976,7 +3976,10 @@ expmed_mult_highpart (scalar_int_mode mode, rtx op0, rtx
op1,
bool sign_adjust = false;
enum mult_variant variant;
struct algorithm alg;
- rtx narrow_op1, tem;
+ rtx narrow_op1, tem, tem2;
+ rtx_insn *shift_add_insns;
+ rtx_insn *mult_insns;
+ unsigned shift_add_cost, mult_cost;
/* We can't support modes wider than HOST_BITS_PER_INT. */
gcc_assert (HWI_COMPUTABLE_MODE_P (mode));
@@ -4014,6 +4017,7 @@ expmed_mult_highpart (scalar_int_mode mode, rtx op0, rtx
op1,
if (tem)
return tem;
+ start_sequence ();
tem = convert_to_mode (wider_mode, op0, unsignedp);
tem = expand_mult_const (wider_mode, tem, cnst1, 0, &alg, variant);
tem = extract_high_half (mode, tem);
@@ -4021,8 +4025,29 @@ expmed_mult_highpart (scalar_int_mode mode, rtx op0, rtx
op1,
/* Adjust result for signedness. */
if (sign_adjust)
tem = force_operand (gen_rtx_MINUS (mode, tem, op0), tem);
+ shift_add_insns = get_insns ();
+ end_sequence ();
+ shift_add_cost = seq_cost (shift_add_insns, speed);
+
+ start_sequence ();
+ tem2 = expmed_mult_highpart_optab (mode, op0, op1, target, unsignedp,
+ max_cost);
+ mult_insns = get_insns ();
+ end_sequence ();
+ mult_cost = seq_cost (mult_insns, speed);
- return tem;
+ if (tem2 && mult_cost < shift_add_cost)
+ {
+ emit_insn (mult_insns);
+ return tem2;
+ }
+ else if (shift_add_cost < max_cost)
+ {
+ emit_insn (shift_add_insns);
+ return tem;
+ }
+ else
+ return 0;
}
return expmed_mult_highpart_optab (mode, op0, narrow_op1, target,
unsignedp, max_cost);
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2024-05-22 10:28 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-22 10:20 [Bug rtl-optimization/115186] New: Suboptimal codes generated by rtl-expand for divmod dizhao at os dot amperecomputing.com
2024-05-22 10:28 ` [Bug rtl-optimization/115186] " dizhao at os dot amperecomputing.com
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).