public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH, AArch64] Disable reg offset in quad-word store for Falkor.
@ 2017-09-22 15:49 Jim Wilson
  2017-09-22 15:59 ` Jim Wilson
  0 siblings, 1 reply; 10+ messages in thread
From: Jim Wilson @ 2017-09-22 15:49 UTC (permalink / raw)
  To: gcc-patches; +Cc: Jim Wilson, wilson

On Falkor, because of an idiosyncracy of how the pipelines are designed, a
quad-word store using a reg+reg addressing mode is almost twice as slow as an
add followed by a quad-word store with a single reg addressing mode.  So we
get better performance if we disallow addressing modes using register offsets
with quad-word stores.

Using lmbench compiled with -O2 -ftree-vectorize as my benchmark, I see a 13%
performance increase on stream copy using this patch, and a 16% performance
increase on stream scale using this patch.  I also see a small performance
increase on SPEC CPU2006 of around 0.2% for int and 0.4% for FP at -O3.

	gcc/
	* config/aarch64/aarch64-protos.h (aarch64_movti_target_operand_p):
	New.
	* config/aarch64/aarch64-simd.md (aarch64_simd_mov<mode>): Use Utf.
	* config/aarch64/aarch64-tuning-flags.def
	(SLOW_REGOFFSET_QUADWORD_STORE): New.
	* config/aarch64/aarch64.c (qdf24xx_tunings): Add
	SLOW_REGOFFSET_QUADWORD_STORE to tuning flags.
	(aarch64_movti_target_operand_p): New.
	* config/aarch64/aarch64.md (movti_aarch64): Use Utf.
	(movtf_aarch64): Likewise.
	* config/aarch64/constraints.md (Utf): New.
---
 gcc/config/aarch64/aarch64-protos.h         |  1 +
 gcc/config/aarch64/aarch64-simd.md          |  4 ++--
 gcc/config/aarch64/aarch64-tuning-flags.def |  4 ++++
 gcc/config/aarch64/aarch64.c                | 14 +++++++++++++-
 gcc/config/aarch64/aarch64.md               |  6 +++---
 gcc/config/aarch64/constraints.md           |  6 ++++++
 6 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index e67c2ed..2dfd057 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -379,6 +379,7 @@ const char *aarch64_output_move_struct (rtx *operands);
 rtx aarch64_return_addr (int, rtx);
 rtx aarch64_simd_gen_const_vector_dup (machine_mode, HOST_WIDE_INT);
 bool aarch64_simd_mem_operand_p (rtx);
+bool aarch64_movti_target_operand_p (rtx);
 rtx aarch64_simd_vect_par_cnst_half (machine_mode, bool);
 rtx aarch64_tls_get_addr (void);
 tree aarch64_fold_builtin (tree, int, tree *, bool);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 70e9339..88bf210 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -133,9 +133,9 @@
 
 (define_insn "*aarch64_simd_mov<mode>"
   [(set (match_operand:VQ 0 "nonimmediate_operand"
-		"=w, Umq,  m,  w, ?r, ?w, ?r, w")
+		"=w, Umq,  Utf,  w, ?r, ?w, ?r, w")
 	(match_operand:VQ 1 "general_operand"
-		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
+		"m,  Dz,   w,    w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
    && (register_operand (operands[0], <MODE>mode)
        || aarch64_simd_reg_or_zero (operands[1], <MODE>mode))"
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index f48642c..7d0b104 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -41,4 +41,8 @@ AARCH64_EXTRA_TUNING_OPTION ("slow_unaligned_ldpw", SLOW_UNALIGNED_LDPW)
    are not considered cheap.  */
 AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND)
 
+/* Don't use a register offset in a memory address for a quad-word store.  */
+AARCH64_EXTRA_TUNING_OPTION ("slow_regoffset_quadword_store",
+			     SLOW_REGOFFSET_QUADWORD_STORE)
+
 #undef AARCH64_EXTRA_TUNING_OPTION
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 5e26cb7..d6f1133 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -818,7 +818,7 @@ static const struct tune_params qdf24xx_tunings =
   2,	/* min_div_recip_mul_df.  */
   0,	/* max_case_values.  */
   tune_params::AUTOPREFETCHER_STRONG,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),		/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_SLOW_REGOFFSET_QUADWORD_STORE),	/* tune_flags.  */
   &qdf24xx_prefetch_tune
 };
 
@@ -11821,6 +11821,18 @@ aarch64_simd_mem_operand_p (rtx op)
 			|| REG_P (XEXP (op, 0)));
 }
 
+/* Return TRUE if OP uses an efficient memory address for quad-word target.  */
+bool
+aarch64_movti_target_operand_p (rtx op)
+{
+  if (! optimize_size
+      && (aarch64_tune_params.extra_tuning_flags
+	  & AARCH64_EXTRA_TUNE_SLOW_REGOFFSET_QUADWORD_STORE))
+    return MEM_P (op) && ! (GET_CODE (XEXP (op, 0)) == PLUS
+			    && ! CONST_INT_P (XEXP (XEXP (op, 0), 1)));
+  return MEM_P (op);
+}
+
 /* Emit a register copy from operand to operand, taking care not to
    early-clobber source registers in the process.
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index f8cdb06..9c7e356 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1023,7 +1023,7 @@
 
 (define_insn "*movti_aarch64"
   [(set (match_operand:TI 0
-	 "nonimmediate_operand"  "=r, w,r,w,r,m,m,w,m")
+	 "nonimmediate_operand"  "=r, w,r,w,r,m,m,w,Utf")
 	(match_operand:TI 1
 	 "aarch64_movti_operand" " rn,r,w,w,m,r,Z,m,w"))]
   "(register_operand (operands[0], TImode)
@@ -1170,9 +1170,9 @@
 
 (define_insn "*movtf_aarch64"
   [(set (match_operand:TF 0
-	 "nonimmediate_operand" "=w,?&r,w ,?r,w,?w,w,m,?r,m ,m")
+	 "nonimmediate_operand" "=w,?&r,w ,?r,w,?w,w,Utf,?r,m ,m")
 	(match_operand:TF 1
-	 "general_operand"      " w,?r, ?r,w ,Y,Y ,m,w,m ,?r,Y"))]
+	 "general_operand"      " w,?r, ?r,w ,Y,Y ,m,w  ,m ,?r,Y"))]
   "TARGET_FLOAT && (register_operand (operands[0], TFmode)
     || aarch64_reg_or_fp_zero (operands[1], TFmode))"
   "@
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 3649fb4..b1defb6 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -171,6 +171,12 @@
        (match_test "aarch64_legitimate_address_p (GET_MODE (op), XEXP (op, 0),
 						  PARALLEL, 1)")))
 
+(define_memory_constraint "Utf"
+  "@iternal
+   An efficient memory address for a quad-word target operand."
+  (and (match_code "mem")
+       (match_test "aarch64_movti_target_operand_p (op)")))
+
 (define_memory_constraint "Utv"
   "@internal
    An address valid for loading/storing opaque structure
-- 
2.7.4

^ permalink raw reply	[flat|nested] 10+ messages in thread
* Re: [PATCH, AArch64] Disable reg offset in quad-word store for Falkor.
@ 2017-09-22 20:06 Wilco Dijkstra
  0 siblings, 0 replies; 10+ messages in thread
From: Wilco Dijkstra @ 2017-09-22 20:06 UTC (permalink / raw)
  To: jim.wilson, Andrew.pinski; +Cc: nd, GCC Patches

Hi Jim,

This looks like a general issue with reg+reg addressing modes being generated in
cases where it is not correct. I haven't looked at lmbench for a while, but it generated
absolutely horrible code like:

add x1, x0, #120
ldr v0, [x2, x1]
add x1, x0, #128
ldr v1, [x2, x1]

If this is still happening, we need to fix this in a general way as this is bad for any
CPU even if reg+reg is fast. Reduced testcases for this would be welcome as it's
likely many other targets are affected. A while ago I posted a patch to reassociate
(x + C1) * C2 -> x * C2 + C3 to improve cases like this.

Note we already support adjusting addressing costs. There are several other CPUs
which increase the reg+reg costs. So that's where I would start - assuming reg+reg
addressing mode was correctly used.

Finally, if you really want to completely disable a particular addressing mode, it's
best done in classify_address rather than changing the md patterns. My experience
is that if you use anything other than the standard 'm' constraint, GCC reload starts
to generate inefficient code even if the pattern should still apply. I have posted
several patches to use 'm' more to get better spill code and efficient expansions
if the offset happens to be too large.

Wilco

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2017-11-01 22:59 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-09-22 15:49 [PATCH, AArch64] Disable reg offset in quad-word store for Falkor Jim Wilson
2017-09-22 15:59 ` Jim Wilson
2017-09-22 17:59   ` Andrew Pinski
2017-09-22 18:39     ` Jim Wilson
2017-09-22 21:11       ` Andrew Pinski
2017-10-12 21:54         ` Jim Wilson
2017-10-31  3:40           ` Kugan Vivekanandarajah
2017-10-31 16:18             ` Jim Wilson
2017-11-01 22:59               ` Kugan Vivekanandarajah
2017-09-22 20:06 Wilco Dijkstra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).