public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r14-5013] ARC: Improved ARC rtx_costs/insn_cost for SHIFTs and ROTATEs.
@ 2023-10-30 16:18 Roger Sayle
0 siblings, 0 replies; only message in thread
From: Roger Sayle @ 2023-10-30 16:18 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:31cc9824d1cd5e0f7fa145d0831a923479333cd6
commit r14-5013-g31cc9824d1cd5e0f7fa145d0831a923479333cd6
Author: Roger Sayle <roger@nextmovesoftware.com>
Date: Mon Oct 30 16:17:42 2023 +0000
ARC: Improved ARC rtx_costs/insn_cost for SHIFTs and ROTATEs.
This patch overhauls the ARC backend's insn_cost target hook, and makes
some related improvements to rtx_costs, BRANCH_COST, etc. The primary
goal is to allow the backend to indicate that shifts and rotates are
slow (discouraged) when the CPU doesn't have a barrel shifter. I should
also acknowledge Richard Sandiford for inspiring the use of set_cost
in this rewrite of arc_insn_cost; this implementation borrows heavily
for the target hooks for AArch64 and ARM.
The motivating example is derived from PR rtl-optimization/110717.
struct S { int a : 5; };
unsigned int foo (struct S *p) {
return p->a;
}
With a barrel shifter, GCC -O2 generates the reasonable:
foo: ldb_s r0,[r0]
asl_s r0,r0,27
j_s.d [blink]
asr_s r0,r0,27
What's interesting is that during combine, the middle-end actually
has two shifts by three bits, and a sign-extension from QI to SI.
Trying 8, 9 -> 11:
8: r158:SI=r157:QI#0<<0x3
REG_DEAD r157:QI
9: r159:SI=sign_extend(r158:SI#0)
REG_DEAD r158:SI
11: r155:SI=r159:SI>>0x3
REG_DEAD r159:SI
Whilst it's reasonable to simplify this to two shifts by 27 bits when
the CPU has a barrel shifter, it's actually a significant pessimization
when these shifts are implemented by loops. This combination can be
prevented if the backend provides accurate-ish estimates for insn_cost.
Previously, without a barrel shifter, GCC -O2 -mcpu=em generates:
foo: ldb_s r0,[r0]
mov lp_count,27
lp 2f
add r0,r0,r0
nop
2: # end single insn loop
mov lp_count,27
lp 2f
asr r0,r0
nop
2: # end single insn loop
j_s [blink]
which contains two loops and requires about ~113 cycles to execute.
With this patch to rtx_cost/insn_cost, GCC -O2 -mcpu=em generates:
foo: ldb_s r0,[r0]
mov_s r2,0 ;3
add3 r0,r2,r0
sexb_s r0,r0
asr_s r0,r0
asr_s r0,r0
j_s.d [blink]
asr_s r0,r0
which requires only ~6 cycles, for the shorter shifts by 3 and sign
extension.
2023-10-30 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/arc/arc.cc (arc_rtx_costs): Improve cost estimates.
Provide reasonable values for SHIFTS and ROTATES by constant
bit counts depending upon TARGET_BARREL_SHIFTER.
(arc_insn_cost): Use insn attributes if the instruction is
recognized. Avoid calling get_attr_length for type "multi",
i.e. define_insn_and_split patterns without explicit type.
Fall-back to set_rtx_cost for single_set and pattern_cost
otherwise.
* config/arc/arc.h (COSTS_N_BYTES): Define helper macro.
(BRANCH_COST): Improve/correct definition.
(LOGICAL_OP_NON_SHORT_CIRCUIT): Preserve previous behavior.
Diff:
---
gcc/config/arc/arc.cc | 84 ++++++++++++++++++++++++++++-----------------------
gcc/config/arc/arc.h | 15 +++++----
2 files changed, 56 insertions(+), 43 deletions(-)
diff --git a/gcc/config/arc/arc.cc b/gcc/config/arc/arc.cc
index e98692a70b4c..e209ad2d327e 100644
--- a/gcc/config/arc/arc.cc
+++ b/gcc/config/arc/arc.cc
@@ -5545,7 +5545,7 @@ arc_rtx_costs (rtx x, machine_mode mode, int outer_code,
case CONST:
case LABEL_REF:
case SYMBOL_REF:
- *total = speed ? COSTS_N_INSNS (1) : COSTS_N_INSNS (4);
+ *total = speed ? COSTS_N_INSNS (1) : COSTS_N_BYTES (4);
return true;
case CONST_DOUBLE:
@@ -5569,26 +5569,32 @@ arc_rtx_costs (rtx x, machine_mode mode, int outer_code,
case ASHIFT:
case ASHIFTRT:
case LSHIFTRT:
+ case ROTATE:
+ case ROTATERT:
+ if (mode == DImode)
+ return false;
if (TARGET_BARREL_SHIFTER)
{
- if (CONSTANT_P (XEXP (x, 0)))
+ *total = COSTS_N_INSNS (1);
+ if (CONSTANT_P (XEXP (x, 1)))
{
- *total += rtx_cost (XEXP (x, 1), mode, (enum rtx_code) code,
+ *total += rtx_cost (XEXP (x, 0), mode, (enum rtx_code) code,
0, speed);
return true;
}
- *total = COSTS_N_INSNS (1);
}
else if (GET_CODE (XEXP (x, 1)) != CONST_INT)
- *total = COSTS_N_INSNS (16);
+ *total = speed ? COSTS_N_INSNS (16) : COSTS_N_INSNS (4);
else
{
- *total = COSTS_N_INSNS (INTVAL (XEXP ((x), 1)));
- /* ??? want_to_gcse_p can throw negative shift counts at us,
- and then panics when it gets a negative cost as result.
- Seen for gcc.c-torture/compile/20020710-1.c -Os . */
- if (*total < 0)
- *total = 0;
+ int n = INTVAL (XEXP (x, 1)) & 31;
+ if (n < 4)
+ *total = COSTS_N_INSNS (n);
+ else
+ *total = speed ? COSTS_N_INSNS (n + 2) : COSTS_N_INSNS (4);
+ *total += rtx_cost (XEXP (x, 0), mode, (enum rtx_code) code,
+ 0, speed);
+ return true;
}
return false;
@@ -5620,6 +5626,8 @@ arc_rtx_costs (rtx x, machine_mode mode, int outer_code,
return false;
case PLUS:
+ if (mode == DImode)
+ return false;
if (outer_code == MEM && CONST_INT_P (XEXP (x, 1))
&& RTX_OK_FOR_OFFSET_P (mode, XEXP (x, 1)))
{
@@ -11154,35 +11162,37 @@ static int
arc_insn_cost (rtx_insn *insn, bool speed)
{
int cost;
- if (recog_memoized (insn) < 0)
- return 0;
-
- /* If optimizing for size, we want the insn size. */
- if (!speed)
- return get_attr_length (insn);
-
- /* Use cost if provided. */
- cost = get_attr_cost (insn);
- if (cost > 0)
- return cost;
-
- /* For speed make a simple cost model: memory access is more
- expensive than any other instruction. */
- enum attr_type type = get_attr_type (insn);
-
- switch (type)
+ enum attr_type type;
+ if (recog_memoized (insn) >= 0)
{
- case TYPE_LOAD:
- case TYPE_STORE:
- cost = COSTS_N_INSNS (2);
- break;
-
- default:
- cost = COSTS_N_INSNS (1);
- break;
+ if (speed)
+ {
+ /* Use cost if provided. */
+ cost = get_attr_cost (insn);
+ if (cost > 0)
+ return cost;
+ /* For speed make a simple cost model: memory access is more
+ expensive than any other instruction. */
+ type = get_attr_type (insn);
+ if (type == TYPE_LOAD || type == TYPE_STORE)
+ return COSTS_N_INSNS (2);
+ }
+ else
+ {
+ /* If optimizing for size, we want the insn size. */
+ type = get_attr_type (insn);
+ if (type != TYPE_MULTI)
+ return get_attr_length (insn);
+ }
}
- return cost;
+ if (rtx set = single_set (insn))
+ cost = set_rtx_cost (set, speed);
+ else
+ cost = pattern_cost (PATTERN (insn), speed);
+ /* If the cost is zero, then it's likely a complex insn. We don't
+ want the cost of these to be less than something we know about. */
+ return cost ? cost : COSTS_N_INSNS (2);
}
static unsigned
diff --git a/gcc/config/arc/arc.h b/gcc/config/arc/arc.h
index 5877389a10da..b34f0b225c66 100644
--- a/gcc/config/arc/arc.h
+++ b/gcc/config/arc/arc.h
@@ -956,10 +956,16 @@ arc_select_cc_mode (OP, X, Y)
/* Costs. */
+/* Analog of COSTS_N_INSNS when optimizing for size. */
+#ifndef COSTS_N_BYTES
+#define COSTS_N_BYTES(N) (N)
+#endif
+
/* The cost of a branch insn. */
/* ??? What's the right value here? Branches are certainly more
expensive than reg->reg moves. */
-#define BRANCH_COST(speed_p, predictable_p) 2
+#define BRANCH_COST(speed_p, predictable_p) \
+ (speed_p ? COSTS_N_INSNS (2) : COSTS_N_INSNS (1))
/* Scc sets the destination to 1 and then conditionally zeroes it.
Best case, ORed SCCs can be made into clear - condset - condset.
@@ -971,11 +977,8 @@ arc_select_cc_mode (OP, X, Y)
beging decisive of p0, we want:
p0 * (branch_cost - 4) > (1 - p0) * 5
??? We don't get to see that probability to evaluate, so we can
- only wildly guess that it might be 50%.
- ??? The compiler also lacks the notion of branch predictability. */
-#define LOGICAL_OP_NON_SHORT_CIRCUIT \
- (BRANCH_COST (optimize_function_for_speed_p (cfun), \
- false) > 9)
+ only wildly guess that it might be 50%. */
+#define LOGICAL_OP_NON_SHORT_CIRCUIT false
/* Nonzero if access to memory by bytes is slow and undesirable.
For RISC chips, it means that access to memory by bytes is no
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2023-10-30 16:18 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-30 16:18 [gcc r14-5013] ARC: Improved ARC rtx_costs/insn_cost for SHIFTs and ROTATEs Roger Sayle
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).