public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] aarch64: Improve size optimisation heuristic for setmem expansion
@ 2021-09-29  9:58 Kyrylo Tkachov
  0 siblings, 0 replies; only message in thread
From: Kyrylo Tkachov @ 2021-09-29  9:58 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2541 bytes --]

Hi all,

This patch adjusts the setmem expansion in the backend to track the number of ops it generates
for the DUP + STR/STP inline sequences. This way we can compare the size/complexity of the sequence
against alternatives, notably returning "false" and thus emitting a call to memset instead.

The simple heuristic change here is that if we were going to emit more than 4 operations, we
shouldn't bother inlining and should just call memset. The number 4 is chosen because, in the worst
case, a call to memset requires 4 instructions: 3 to move the arguments into the right registers and 1 for the call itself.

The speed optimisation decisions are not affected, though I do want to extend these expansions in a later
patch and I'd like to reuse this ops counting logic there. In any case this patch should make sense on its own.

For the code:
void __attribute__((__noinline__))
set127byte (int64_t *src, int c)
{
  __builtin_memset (src, c, 127);
}

void __attribute__((__noinline__))
set128byte (int64_t *src, int c)
{
  __builtin_memset (src, c, 128);
}

when optimising for size we now get just an immediate move + a call to memset (2 instructions) where before we'd have generated:
set127byte(long*, int):
        dup     v0.16b, w1
        str     q0, [x0, 96]
        stp     q0, q0, [x0]
        stp     q0, q0, [x0, 32]
        stp     q0, q0, [x0, 64]
        str     q0, [x0, 111]
        ret
set128byte(long*, int):
        dup     v0.16b, w1
        stp     q0, q0, [x0]
        stp     q0, q0, [x0, 32]
        stp     q0, q0, [x0, 64]
        stp     q0, q0, [x0, 96]
        ret

which is clearly undesirable for -Os.

I've adjusted the recently-added gcc.target/aarch64/memset-strict-align-1.c testcase to use a bigger struct
and switch to speed optimisation as with this patch we'll just call memset rather than expanding inline.
That is the right decision for size optimisation (the resulting code is indeed shorter).
With -O2 and the new struct size we still try the SIMD expansion and still trigger the path that the testcase is supposed to exercise.

Bootstrapped and tested on aarch64-none-linux-gnu.
Pushing to trunk.

2021-09-29  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

	* config/aarch64/aarch64.c (aarch64_expand_setmem): Count number of
	emitted operations and adjust heuristic for code size.
    
2021-09-29  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

	* gcc.target/aarch64/memset-corner-cases-2.c: New test.
	* gcc.target/aarch64/memset-strict-align-1.c: Adjust.

[-- Attachment #2: setmem-s.patch --]
[-- Type: application/octet-stream, Size: 5028 bytes --]

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 36519ccc5a58abab483c38d0a6c5f039592bfc7f..ac17c1c88fb42dc1a8543b64934a624d49f1779d 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -23534,40 +23534,37 @@ aarch64_expand_setmem (rtx *operands)
   if (!CONST_INT_P (operands[1]))
     return false;
 
-  bool speed_p = !optimize_function_for_size_p (cfun);
+  bool size_p = optimize_function_for_size_p (cfun);
 
   /* Default the maximum to 256-bytes.  */
   unsigned max_set_size = 256;
 
-  /* In case we are optimizing for size or if the core does not
-     want to use STP Q regs, lower the max_set_size.  */
-  max_set_size = (!speed_p
-		  || (aarch64_tune_params.extra_tuning_flags
-		      & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
-		  ? max_set_size / 2 : max_set_size;
-
   len = INTVAL (operands[1]);
 
   /* Upper bound check.  */
   if (len > max_set_size)
     return false;
 
+  /* Attempt a sequence with a vector broadcast followed by stores.
+     Count the number of operations involved to see if it's worth it for
+     code size.  */
+  start_sequence ();
+  unsigned nops = 0;
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
 
   /* Prepare the val using a DUP/MOVI v0.16B, val.  */
   src = expand_vector_broadcast (V16QImode, val);
   src = force_reg (V16QImode, src);
-
+  nops++;
   /* Convert len to bits to make the rest of the code simpler.  */
   n = len * BITS_PER_UNIT;
 
   /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
      AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  setmem expand
      pattern is only turned on for TARGET_SIMD.  */
-  const int copy_limit = (speed_p
-			  && (aarch64_tune_params.extra_tuning_flags
-			      & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
+  const int copy_limit = (aarch64_tune_params.extra_tuning_flags
+			  & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
 			  ? GET_MODE_BITSIZE (TImode) : 256;
 
   while (n > 0)
@@ -23583,7 +23580,7 @@ aarch64_expand_setmem (rtx *operands)
 
       mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
       aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
-
+      nops++;
       n -= mode_bits;
 
       /* Do certain trailing copies as overlapping if it's going to be
@@ -23599,7 +23596,15 @@ aarch64_expand_setmem (rtx *operands)
 	  n = n_bits;
 	}
     }
+  rtx_insn *seq = get_insns ();
+  end_sequence ();
+  /* A call to memset in the worst case requires 3 instructions to prepare
+     the arguments + 1 for the call.  Prefer the inline sequence for size if
+     it is no longer than that.  */
+  if (size_p && nops > 4)
+    return false;
 
+  emit_insn (seq);
   return true;
 }
 
diff --git a/gcc/testsuite/gcc.target/aarch64/memset-corner-cases-2.c b/gcc/testsuite/gcc.target/aarch64/memset-corner-cases-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..9ee96f33255e1841be516734892c1fbb20772a3d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/memset-corner-cases-2.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-Os" } */
+/* { dg-require-effective-target lp64 } */
+
+#include <stdint.h>
+
+/* 127 bytes should use libcall for size.
+**set127byte:
+**	mov	x2, 127
+**	b	memset
+*/
+void __attribute__((__noinline__))
+set127byte (int64_t *src, int c)
+{
+  __builtin_memset (src, c, 127);
+}
+
+/* 128 bytes should use libcall for size.
+**set128byte:
+**	mov	x2, 128
+**	b	memset
+*/
+void __attribute__((__noinline__))
+set128byte (int64_t *src, int c)
+{
+  __builtin_memset (src, c, 128);
+}
+
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/memset-strict-align-1.c b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-1.c
index 5cdc8a449684055b040056063084dfce484d827e..664d43aee1300819022691ac337d6b8c03d6b8dd 100644
--- a/gcc/testsuite/gcc.target/aarch64/memset-strict-align-1.c
+++ b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-1.c
@@ -1,14 +1,14 @@
 /* { dg-do compile } */
-/* { dg-options "-Os -mstrict-align" } */
+/* { dg-options "-O2 -mstrict-align" } */
 
-struct s { char x[95]; };
+struct s { char x[255]; };
 void foo (struct s *);
 void bar (void) { struct s s1 = {}; foo (&s1); }
 
-/* memset (s1 = {}, sizeof = 95) should be expanded out
+/* memset (s1 = {}, sizeof = 255) should be expanded out
    such that there are no overlap stores when -mstrict-align
    is in use.
-   so 2 pair 16 bytes stores (64 bytes).
+   so 7 pairs of 16 bytes stores (224 bytes).
    1 16 byte stores
    1 8 byte store
    1 4 byte store
@@ -16,7 +16,7 @@ void bar (void) { struct s s1 = {}; foo (&s1); }
    1 1 byte store
    */
 
-/* { dg-final { scan-assembler-times "stp\tq" 2 } } */
+/* { dg-final { scan-assembler-times "stp\tq" 7 } } */
 /* { dg-final { scan-assembler-times "str\tq" 1 } } */
 /* { dg-final { scan-assembler-times "str\txzr" 1 } } */
 /* { dg-final { scan-assembler-times "str\twzr" 1 } } */

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2021-09-29  9:59 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-29  9:58 [PATCH] aarch64: Improve size optimisation heuristic for setmem expansion Kyrylo Tkachov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).