public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] aarch64: Improve size heuristic for cpymem expansion
@ 2021-09-29 10:20 Kyrylo Tkachov
  2021-09-30 13:51 ` Christophe LYON
  0 siblings, 1 reply; 3+ messages in thread
From: Kyrylo Tkachov @ 2021-09-29 10:20 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1638 bytes --]

Hi all,

Similar to my previous patch for setmem this one does the same for the cpymem expansion.
We count the number of ops emitted and compare it against the alternative of just calling
the library function when optimising for size.
For the code:
void
cpy_127 (char *out, char *in)
{
  __builtin_memcpy (out, in, 127);
}

void
cpy_128 (char *out, char *in)
{
  __builtin_memcpy (out, in, 128);
}

we now emit a call to memcpy (with an extra MOV-immediate instruction for the size) instead of:
cpy_127(char*, char*):
        ldp     q0, q1, [x1]
        stp     q0, q1, [x0]
        ldp     q0, q1, [x1, 32]
        stp     q0, q1, [x0, 32]
        ldp     q0, q1, [x1, 64]
        stp     q0, q1, [x0, 64]
        ldr     q0, [x1, 96]
        str     q0, [x0, 96]
        ldr     q0, [x1, 111]
        str     q0, [x0, 111]
        ret
cpy_128(char*, char*):
        ldp     q0, q1, [x1]
        stp     q0, q1, [x0]
        ldp     q0, q1, [x1, 32]
        stp     q0, q1, [x0, 32]
        ldp     q0, q1, [x1, 64]
        stp     q0, q1, [x0, 64]
        ldp     q0, q1, [x1, 96]
        stp     q0, q1, [x0, 96]
        ret

which is a clear code size win. Speed optimisation heuristics remain unchanged.
Bootstrapped and tested on aarch64-none-linux-gnu.
Pushing to trunk.

Thanks,
Kyrill

2021-09-29  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

	* config/aarch64/aarch64.c (aarch64_expand_cpymem): Count number of
	emitted operations and adjust heuristic for code size.
    
2021-09-29  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

	* gcc.target/aarch64/cpymem-size.c: New test.

[-- Attachment #2: cpmem-s.patch --]
[-- Type: application/octet-stream, Size: 3319 bytes --]

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index ac17c1c88fb42dc1a8543b64934a624d49f1779d..a9a1800af53b18306465e382e9dd149d0e335b09 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -23390,7 +23390,8 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
 }
 
 /* Expand cpymem, as if from a __builtin_memcpy.  Return true if
-   we succeed, otherwise return false.  */
+   we succeed, otherwise return false, indicating that a libcall to
+   memcpy should be emitted.  */
 
 bool
 aarch64_expand_cpymem (rtx *operands)
@@ -23407,11 +23408,13 @@ aarch64_expand_cpymem (rtx *operands)
 
   unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
 
-  /* Inline up to 256 bytes when optimizing for speed.  */
+  /* Try to inline up to 256 bytes.  */
   unsigned HOST_WIDE_INT max_copy_size = 256;
 
-  if (optimize_function_for_size_p (cfun))
-    max_copy_size = 128;
+  bool size_p = optimize_function_for_size_p (cfun);
+
+  if (size > max_copy_size)
+    return false;
 
   int copy_bits = 256;
 
@@ -23421,13 +23424,14 @@ aarch64_expand_cpymem (rtx *operands)
       || !TARGET_SIMD
       || (aarch64_tune_params.extra_tuning_flags
 	  & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
-    {
-      copy_bits = 128;
-      max_copy_size = max_copy_size / 2;
-    }
+    copy_bits = 128;
 
-  if (size > max_copy_size)
-    return false;
+  /* Emit an inline load+store sequence and count the number of operations
+     involved.  We use a simple count of just the loads and stores emitted
+     rather than rtx_insn count as all the pointer adjustments and reg copying
+     in this function will get optimized away later in the pipeline.  */
+  start_sequence ();
+  unsigned nops = 0;
 
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
@@ -23456,7 +23460,8 @@ aarch64_expand_cpymem (rtx *operands)
 	cur_mode = V4SImode;
 
       aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
-
+      /* A single block copy is 1 load + 1 store.  */
+      nops += 2;
       n -= mode_bits;
 
       /* Emit trailing copies using overlapping unaligned accesses - this is
@@ -23471,7 +23476,16 @@ aarch64_expand_cpymem (rtx *operands)
 	  n = n_bits;
 	}
     }
+  rtx_insn *seq = get_insns ();
+  end_sequence ();
+
+  /* A memcpy libcall in the worst case takes 3 instructions to prepare the
+     arguments + 1 for the call.  */
+  unsigned libcall_cost = 4;
+  if (size_p && libcall_cost < nops)
+    return false;
 
+  emit_insn (seq);
   return true;
 }
 
diff --git a/gcc/testsuite/gcc.target/aarch64/cpymem-size.c b/gcc/testsuite/gcc.target/aarch64/cpymem-size.c
new file mode 100644
index 0000000000000000000000000000000000000000..4d488b74301dd6bf6321372a0985771b0e4d5c07
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/cpymem-size.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-Os" } */
+
+#include <stdlib.h>
+
+/*
+** cpy_127:
+**      mov	x2, 127
+**      b	memcpy
+*/
+void
+cpy_127 (char *out, char *in)
+{
+  __builtin_memcpy (out, in, 127);
+}
+
+/*
+** cpy_128:
+**      mov	x2, 128
+**      b	memcpy
+*/
+void
+cpy_128 (char *out, char *in)
+{
+  __builtin_memcpy (out, in, 128);
+}
+
+/* { dg-final { check-function-bodies "**" "" "" } } */
+

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] aarch64: Improve size heuristic for cpymem expansion
  2021-09-29 10:20 [PATCH] aarch64: Improve size heuristic for cpymem expansion Kyrylo Tkachov
@ 2021-09-30 13:51 ` Christophe LYON
  2021-10-01 11:19   ` Kyrylo Tkachov
  0 siblings, 1 reply; 3+ messages in thread
From: Christophe LYON @ 2021-09-30 13:51 UTC (permalink / raw)
  To: gcc-patches


On 29/09/2021 12:20, Kyrylo Tkachov via Gcc-patches wrote:
> Hi all,
>
> Similar to my previous patch for setmem this one does the same for the cpymem expansion.
> We count the number of ops emitted and compare it against the alternative of just calling
> the library function when optimising for size.
> For the code:
> void
> cpy_127 (char *out, char *in)
> {
>    __builtin_memcpy (out, in, 127);
> }
>
> void
> cpy_128 (char *out, char *in)
> {
>    __builtin_memcpy (out, in, 128);
> }
>
> we now emit a call to memcpy (with an extra MOV-immediate instruction for the size) instead of:
> cpy_127(char*, char*):
>          ldp     q0, q1, [x1]
>          stp     q0, q1, [x0]
>          ldp     q0, q1, [x1, 32]
>          stp     q0, q1, [x0, 32]
>          ldp     q0, q1, [x1, 64]
>          stp     q0, q1, [x0, 64]
>          ldr     q0, [x1, 96]
>          str     q0, [x0, 96]
>          ldr     q0, [x1, 111]
>          str     q0, [x0, 111]
>          ret
> cpy_128(char*, char*):
>          ldp     q0, q1, [x1]
>          stp     q0, q1, [x0]
>          ldp     q0, q1, [x1, 32]
>          stp     q0, q1, [x0, 32]
>          ldp     q0, q1, [x1, 64]
>          stp     q0, q1, [x0, 64]
>          ldp     q0, q1, [x1, 96]
>          stp     q0, q1, [x0, 96]
>          ret
>
> which is a clear code size win. Speed optimisation heuristics remain unchanged.
> Bootstrapped and tested on aarch64-none-linux-gnu.
> Pushing to trunk.
>
> Thanks,
> Kyrill
>
> 2021-09-29  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>
> 	* config/aarch64/aarch64.c (aarch64_expand_cpymem): Count number of
> 	emitted operations and adjust heuristic for code size.
>      
> 2021-09-29  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>
> 	* gcc.target/aarch64/cpymem-size.c: New test.


Hi Kyrill,

Just to mention that the new test fails with -mabi=ilp32...


Thanks,


Christophe




^ permalink raw reply	[flat|nested] 3+ messages in thread

* RE: [PATCH] aarch64: Improve size heuristic for cpymem expansion
  2021-09-30 13:51 ` Christophe LYON
@ 2021-10-01 11:19   ` Kyrylo Tkachov
  0 siblings, 0 replies; 3+ messages in thread
From: Kyrylo Tkachov @ 2021-10-01 11:19 UTC (permalink / raw)
  To: Christophe LYON; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2569 bytes --]



> -----Original Message-----
> From: Gcc-patches <gcc-patches-
> bounces+kyrylo.tkachov=arm.com@gcc.gnu.org> On Behalf Of Christophe
> LYON via Gcc-patches
> Sent: Thursday, September 30, 2021 2:51 PM
> To: gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH] aarch64: Improve size heuristic for cpymem expansion
> 
> 
> On 29/09/2021 12:20, Kyrylo Tkachov via Gcc-patches wrote:
> > Hi all,
> >
> > Similar to my previous patch for setmem this one does the same for the
> cpymem expansion.
> > We count the number of ops emitted and compare it against the
> alternative of just calling
> > the library function when optimising for size.
> > For the code:
> > void
> > cpy_127 (char *out, char *in)
> > {
> >    __builtin_memcpy (out, in, 127);
> > }
> >
> > void
> > cpy_128 (char *out, char *in)
> > {
> >    __builtin_memcpy (out, in, 128);
> > }
> >
> > we now emit a call to memcpy (with an extra MOV-immediate instruction
> for the size) instead of:
> > cpy_127(char*, char*):
> >          ldp     q0, q1, [x1]
> >          stp     q0, q1, [x0]
> >          ldp     q0, q1, [x1, 32]
> >          stp     q0, q1, [x0, 32]
> >          ldp     q0, q1, [x1, 64]
> >          stp     q0, q1, [x0, 64]
> >          ldr     q0, [x1, 96]
> >          str     q0, [x0, 96]
> >          ldr     q0, [x1, 111]
> >          str     q0, [x0, 111]
> >          ret
> > cpy_128(char*, char*):
> >          ldp     q0, q1, [x1]
> >          stp     q0, q1, [x0]
> >          ldp     q0, q1, [x1, 32]
> >          stp     q0, q1, [x0, 32]
> >          ldp     q0, q1, [x1, 64]
> >          stp     q0, q1, [x0, 64]
> >          ldp     q0, q1, [x1, 96]
> >          stp     q0, q1, [x0, 96]
> >          ret
> >
> > which is a clear code size win. Speed optimisation heuristics remain
> unchanged.
> > Bootstrapped and tested on aarch64-none-linux-gnu.
> > Pushing to trunk.
> >
> > Thanks,
> > Kyrill
> >
> > 2021-09-29  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
> >
> > 	* config/aarch64/aarch64.c (aarch64_expand_cpymem): Count
> number of
> > 	emitted operations and adjust heuristic for code size.
> >
> > 2021-09-29  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
> >
> > 	* gcc.target/aarch64/cpymem-size.c: New test.
> 
> 
> Hi Kyrill,
> 
> Just to mention that the new test fails with -mabi=ilp32...

Drat, thanks for letting me know.
Pushing fix to trunk.
Kyrill

gcc/testsuite/

	* gcc.target/aarch64/cpymem-size.c: Adjust scan for ilp32.

> 
> 
> Thanks,
> 
> 
> Christophe
> 
> 


[-- Attachment #2: test.patch --]
[-- Type: application/octet-stream, Size: 685 bytes --]

commit 1057a1e4a68253ecb4185cbe56c3e242031aa408
Author: Kyrylo Tkachov <kyrylo.tkachov@arm.com>
Date:   Fri Oct 1 12:08:42 2021 +0100

    aarch64: Fix cpymem-size.c test for ILP32

diff --git a/gcc/testsuite/gcc.target/aarch64/cpymem-size.c b/gcc/testsuite/gcc.target/aarch64/cpymem-size.c
index 4d488b74301..4a6f2495d22 100644
--- a/gcc/testsuite/gcc.target/aarch64/cpymem-size.c
+++ b/gcc/testsuite/gcc.target/aarch64/cpymem-size.c
@@ -5,7 +5,7 @@
 
 /*
 ** cpy_127:
-**      mov	x2, 127
+**      mov	(w|x)2, 127
 **      b	memcpy
 */
 void
@@ -16,7 +16,7 @@ cpy_127 (char *out, char *in)
 
 /*
 ** cpy_128:
-**      mov	x2, 128
+**      mov	(w|x)2, 128
 **      b	memcpy
 */
 void

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2021-10-01 11:19 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-29 10:20 [PATCH] aarch64: Improve size heuristic for cpymem expansion Kyrylo Tkachov
2021-09-30 13:51 ` Christophe LYON
2021-10-01 11:19   ` Kyrylo Tkachov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).