public inbox for gcc-patches@gcc.gnu.org
* [PATCH] AArch64: Fix strict-align cpymem/setmem [PR103100]
@ 2023-09-20 13:50 Wilco Dijkstra
  2023-09-20 15:52 ` Richard Earnshaw (lists)
  0 siblings, 1 reply; 8+ messages in thread
From: Wilco Dijkstra @ 2023-09-20 13:50 UTC (permalink / raw)
  To: GCC Patches; +Cc: Richard Sandiford


The cpymemdi/setmemdi implementation doesn't fully support strict alignment.
Block the expansion if the alignment is less than 16 with STRICT_ALIGNMENT.
Clean up the conditions for when to use MOPS.
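
For reference, the kind of code this affects looks roughly like the sketch below
(hypothetical functions, not the testcase from the PR), compiled with
-O2 -mstrict-align:

/* dst/src are only known to be 1-byte aligned, so with STRICT_ALIGNMENT the
   inline cpymem/setmem expansion must not use wider, potentially unaligned
   accesses.  With this patch such calls should go via the MOPS expansion
   (or a library call without +mops) instead.  */
void
copy32 (char *dst, const char *src)
{
  __builtin_memcpy (dst, src, 32);
}

void
zero32 (char *dst)
{
  __builtin_memset (dst, 0, 32);
}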
    
Passes regress/bootstrap, OK for commit?
    
gcc/ChangeLog/
        PR target/103100
        * config/aarch64/aarch64.md (cpymemdi): Remove pattern condition.
        (setmemdi): Likewise.
        * config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support
        strict-align.  Cleanup condition for using MOPS.
        (aarch64_expand_setmem): Likewise.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index dd6874d13a75f20d10a244578afc355b25c73da2..8f3bfb91c0f4ec43f37fe9289a66092a29a47e4d 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25261,27 +25261,23 @@ aarch64_expand_cpymem (rtx *operands)
   int mode_bits;
   rtx dst = operands[0];
   rtx src = operands[1];
+  unsigned align = INTVAL (operands[3]);
   rtx base;
   machine_mode cur_mode = BLKmode;
+  bool size_p = optimize_function_for_size_p (cfun);
 
-  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
-  if (!CONST_INT_P (operands[2]))
+  /* Variable-sized or strict-align copies may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
     return aarch64_expand_cpymem_mops (operands);
 
   unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
 
-  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
-  unsigned HOST_WIDE_INT max_copy_size
-    = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
-
-  bool size_p = optimize_function_for_size_p (cfun);
+  /* Try to inline up to 256 bytes.  */
+  unsigned max_copy_size = 256;
+  unsigned max_mops_size = aarch64_mops_memcpy_size_threshold;
 
-  /* Large constant-sized cpymem should go through MOPS when possible.
-     It should be a win even for size optimization in the general case.
-     For speed optimization the choice between MOPS and the SIMD sequence
-     depends on the size of the copy, rather than number of instructions,
-     alignment etc.  */
-  if (size > max_copy_size)
+  /* Large copies use MOPS when available or a library call.  */
+  if (size > max_copy_size || (TARGET_MOPS && size > max_mops_size))
     return aarch64_expand_cpymem_mops (operands);
 
   int copy_bits = 256;
@@ -25445,12 +25441,13 @@ aarch64_expand_setmem (rtx *operands)
   unsigned HOST_WIDE_INT len;
   rtx dst = operands[0];
   rtx val = operands[2], src;
+  unsigned align = INTVAL (operands[3]);
   rtx base;
   machine_mode cur_mode = BLKmode, next_mode;
 
-  /* If we don't have SIMD registers or the size is variable use the MOPS
-     inlined sequence if possible.  */
-  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
+  /* Variable-sized or strict-align memset may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
+      || (STRICT_ALIGNMENT && align < 16))
     return aarch64_expand_setmem_mops (operands);
 
   bool size_p = optimize_function_for_size_p (cfun);
@@ -25458,10 +25455,13 @@ aarch64_expand_setmem (rtx *operands)
   /* Default the maximum to 256-bytes when considering only libcall vs
      SIMD broadcast sequence.  */
   unsigned max_set_size = 256;
+  unsigned max_mops_size = aarch64_mops_memset_size_threshold;
 
   len = INTVAL (operands[1]);
-  if (len > max_set_size && !TARGET_MOPS)
-    return false;
+
+  /* Large memset uses MOPS when available or a library call.  */
+  if (len > max_set_size || (TARGET_MOPS && len > max_mops_size))
+    return aarch64_expand_setmem_mops (operands);
 
   int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
   /* The MOPS sequence takes:
@@ -25474,12 +25474,6 @@ aarch64_expand_setmem (rtx *operands)
      the arguments + 1 for the call.  */
   unsigned libcall_cost = 4;
 
-  /* Upper bound check.  For large constant-sized setmem use the MOPS sequence
-     when available.  */
-  if (TARGET_MOPS
-      && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
-    return aarch64_expand_setmem_mops (operands);
-
   /* Attempt a sequence with a vector broadcast followed by stores.
      Count the number of operations involved to see if it's worth it
      against the alternatives.  A simple counter simd_ops on the
@@ -25521,10 +25515,8 @@ aarch64_expand_setmem (rtx *operands)
       simd_ops++;
       n -= mode_bits;
 
-      /* Do certain trailing copies as overlapping if it's going to be
-	 cheaper.  i.e. less instructions to do so.  For instance doing a 15
-	 byte copy it's more efficient to do two overlapping 8 byte copies than
-	 8 + 4 + 2 + 1.  Only do this when -mstrict-align is not supplied.  */
+      /* Emit trailing writes using overlapping unaligned accesses
+	(when !STRICT_ALIGNMENT) - this is smaller and faster.  */
       if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
 	{
 	  next_mode = smallest_mode_for_size (n, MODE_INT);
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 6d0f072a9dd6d094e8764a513222a9129d8296fa..96508a2580876d1fdbdfa6c67d1a3d02608c1d24 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1627,7 +1627,7 @@ (define_expand "cpymemdi"
    (match_operand:BLK 1 "memory_operand")
    (match_operand:DI 2 "general_operand")
    (match_operand:DI 3 "immediate_operand")]
-   "!STRICT_ALIGNMENT || TARGET_MOPS"
+   ""
 {
   if (aarch64_expand_cpymem (operands))
     DONE;
@@ -1724,7 +1724,7 @@ (define_expand "setmemdi"
         (match_operand:QI  2 "nonmemory_operand")) ;; Value
    (use (match_operand:DI  1 "general_operand")) ;; Length
    (match_operand          3 "immediate_operand")] ;; Align
- "TARGET_SIMD || TARGET_MOPS"
+ ""
  {
   if (aarch64_expand_setmem (operands))
     DONE;




* Re: [PATCH] AArch64: Fix strict-align cpymem/setmem [PR103100]
  2023-09-20 13:50 [PATCH] AArch64: Fix strict-align cpymem/setmem [PR103100] Wilco Dijkstra
@ 2023-09-20 15:52 ` Richard Earnshaw (lists)
  2023-09-20 17:08   ` Wilco Dijkstra
  2023-09-21 14:24   ` [PATCH v2] " Wilco Dijkstra
  0 siblings, 2 replies; 8+ messages in thread
From: Richard Earnshaw (lists) @ 2023-09-20 15:52 UTC (permalink / raw)
  To: Wilco Dijkstra, GCC Patches; +Cc: Richard Sandiford

On 20/09/2023 14:50, Wilco Dijkstra wrote:
> 
> The cpymemdi/setmemdi implementation doesn't fully support strict alignment.
> Block the expansion if the alignment is less than 16 with STRICT_ALIGNMENT.
> Clean up the condition when to use MOPS.
>     
> Passes regress/bootstrap, OK for commit?
>     
> gcc/ChangeLog/
>         PR target/103100
>         * config/aarch64/aarch64.md (cpymemdi): Remove pattern condition.

Shouldn't this be a separate patch?  It's not immediately obvious that this is a necessary part of this change.

>         (setmemdi): Likewise.
>         * config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support
>         strict-align.  Cleanup condition for using MOPS.
>         (aarch64_expand_setmem): Likewise.
> 
> ---
> 
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index dd6874d13a75f20d10a244578afc355b25c73da2..8f3bfb91c0f4ec43f37fe9289a66092a29a47e4d 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -25261,27 +25261,23 @@ aarch64_expand_cpymem (rtx *operands)
>    int mode_bits;
>    rtx dst = operands[0];
>    rtx src = operands[1];
> +  unsigned align = INTVAL (operands[3]);

This should read the value with UINTVAL.  Given the useful range of the alignment, it should be OK that we're not using unsigned HWI.

>    rtx base;
>    machine_mode cur_mode = BLKmode;
> +  bool size_p = optimize_function_for_size_p (cfun);
>  
> -  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
> -  if (!CONST_INT_P (operands[2]))
> +  /* Variable-sized or strict-align copies may use the MOPS expansion.  */
> +  if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
>      return aarch64_expand_cpymem_mops (operands);

So what about align=4 and copying, for example, 8 or 12 bytes; wouldn't we want a sequence of LDR/STR in that case?  Doesn't this fall back to MOPS too eagerly?


>  
>    unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
>  
> -  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
> -  unsigned HOST_WIDE_INT max_copy_size
> -    = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
> -
> -  bool size_p = optimize_function_for_size_p (cfun);
> +  /* Try to inline up to 256 bytes.  */
> +  unsigned max_copy_size = 256;
> +  unsigned max_mops_size = aarch64_mops_memcpy_size_threshold;

I find this name slightly confusing.  Surely it's min_mops_size (since above that we want to use MOPS rather than inlined loads/stores).  But why not just use aarch64_mops_memcpy_size_threshold directly in the one place it's used?

>  
> -  /* Large constant-sized cpymem should go through MOPS when possible.
> -     It should be a win even for size optimization in the general case.
> -     For speed optimization the choice between MOPS and the SIMD sequence
> -     depends on the size of the copy, rather than number of instructions,
> -     alignment etc.  */
> -  if (size > max_copy_size)
> +  /* Large copies use MOPS when available or a library call.  */
> +  if (size > max_copy_size || (TARGET_MOPS && size > max_mops_size))
>      return aarch64_expand_cpymem_mops (operands);
>  
>    int copy_bits = 256;
> @@ -25445,12 +25441,13 @@ aarch64_expand_setmem (rtx *operands)

Similar comments apply to this code as well.

>    unsigned HOST_WIDE_INT len;
>    rtx dst = operands[0];
>    rtx val = operands[2], src;
> +  unsigned align = INTVAL (operands[3]);
>    rtx base;
>    machine_mode cur_mode = BLKmode, next_mode;
>  
> -  /* If we don't have SIMD registers or the size is variable use the MOPS
> -     inlined sequence if possible.  */
> -  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
> +  /* Variable-sized or strict-align memset may use the MOPS expansion.  */
> +  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
> +      || (STRICT_ALIGNMENT && align < 16))
>      return aarch64_expand_setmem_mops (operands);
>  
>    bool size_p = optimize_function_for_size_p (cfun);
> @@ -25458,10 +25455,13 @@ aarch64_expand_setmem (rtx *operands)

And here.

>    /* Default the maximum to 256-bytes when considering only libcall vs
>       SIMD broadcast sequence.  */
>    unsigned max_set_size = 256;
> +  unsigned max_mops_size = aarch64_mops_memset_size_threshold;
>  
>    len = INTVAL (operands[1]);
> -  if (len > max_set_size && !TARGET_MOPS)
> -    return false;
> +
> +  /* Large memset uses MOPS when available or a library call.  */
> +  if (len > max_set_size || (TARGET_MOPS && len > max_mops_size))
> +    return aarch64_expand_setmem_mops (operands);
>  
>    int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
>    /* The MOPS sequence takes:
> @@ -25474,12 +25474,6 @@ aarch64_expand_setmem (rtx *operands)
>       the arguments + 1 for the call.  */
>    unsigned libcall_cost = 4;
>  
> -  /* Upper bound check.  For large constant-sized setmem use the MOPS sequence
> -     when available.  */
> -  if (TARGET_MOPS
> -      && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
> -    return aarch64_expand_setmem_mops (operands);
> -
>    /* Attempt a sequence with a vector broadcast followed by stores.
>       Count the number of operations involved to see if it's worth it
>       against the alternatives.  A simple counter simd_ops on the
> @@ -25521,10 +25515,8 @@ aarch64_expand_setmem (rtx *operands)
>        simd_ops++;
>        n -= mode_bits;
>  
> -      /* Do certain trailing copies as overlapping if it's going to be
> -	 cheaper.  i.e. less instructions to do so.  For instance doing a 15
> -	 byte copy it's more efficient to do two overlapping 8 byte copies than
> -	 8 + 4 + 2 + 1.  Only do this when -mstrict-align is not supplied.  */
> +      /* Emit trailing writes using overlapping unaligned accesses
> +	(when !STRICT_ALIGNMENT) - this is smaller and faster.  */
>        if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
>  	{
>  	  next_mode = smallest_mode_for_size (n, MODE_INT);
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 6d0f072a9dd6d094e8764a513222a9129d8296fa..96508a2580876d1fdbdfa6c67d1a3d02608c1d24 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1627,7 +1627,7 @@ (define_expand "cpymemdi"
>     (match_operand:BLK 1 "memory_operand")
>     (match_operand:DI 2 "general_operand")
>     (match_operand:DI 3 "immediate_operand")]
> -   "!STRICT_ALIGNMENT || TARGET_MOPS"
> +   ""
>  {
>    if (aarch64_expand_cpymem (operands))
>      DONE;
> @@ -1724,7 +1724,7 @@ (define_expand "setmemdi"
>          (match_operand:QI  2 "nonmemory_operand")) ;; Value
>     (use (match_operand:DI  1 "general_operand")) ;; Length
>     (match_operand          3 "immediate_operand")] ;; Align
> - "TARGET_SIMD || TARGET_MOPS"
> + ""
>   {
>    if (aarch64_expand_setmem (operands))
>      DONE;
> 
> 

Are there any additional tests for this?
R.


* Re: [PATCH] AArch64: Fix strict-align cpymem/setmem [PR103100]
  2023-09-20 15:52 ` Richard Earnshaw (lists)
@ 2023-09-20 17:08   ` Wilco Dijkstra
  2023-09-21 14:24   ` [PATCH v2] " Wilco Dijkstra
  1 sibling, 0 replies; 8+ messages in thread
From: Wilco Dijkstra @ 2023-09-20 17:08 UTC (permalink / raw)
  To: Richard Earnshaw, GCC Patches; +Cc: Richard Sandiford

Hi Richard,

>         * config/aarch64/aarch64.md (cpymemdi): Remove pattern condition.

> Shouldn't this be a separate patch?  It's not immediately obvious that this is a necessary part of this change.

You mean this?

@@ -1627,7 +1627,7 @@ (define_expand "cpymemdi"
    (match_operand:BLK 1 "memory_operand")
    (match_operand:DI 2 "general_operand")
    (match_operand:DI 3 "immediate_operand")]
-   "!STRICT_ALIGNMENT || TARGET_MOPS"
+   ""

Yes, that's necessary since that is the bug.

> +  unsigned align = INTVAL (operands[3]);
>
>This should read the value with UINTVAL.  Given the useful range of the alignment, it should be OK that we're not using unsigned HWI.

I'll fix that.

> +  if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
>      return aarch64_expand_cpymem_mops (operands);
>
> So what about align=4 and copying, for example, 8 or 12 bytes; wouldn't we want a sequence of LDR/STR in that case?  Doesn't this fall back to MOPS too eagerly?

The goal was to fix the issue in a way that is both obvious and easy to backport.
Further improvements can be made to handle other alignments, but that is
slightly tricky (e.g. align == 4 won't emit LDP/STP directly with the current code
and thus would need additional work to generalize the LDP path).
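
As a concrete sketch (hypothetical example, not from the PR): with -O2
-mstrict-align, a copy like the one below now takes the MOPS/libcall path
because the alignment is below 16, even though a short sequence of 32-bit
loads and stores would be possible:

/* 12-byte copy with only 4-byte alignment.  */
typedef struct { int a, b, c; } triple;

void
copy_triple (triple *d, const triple *s)
{
  __builtin_memcpy (d, s, sizeof (triple));
}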
  
>> +  unsigned max_mops_size = aarch64_mops_memcpy_size_threshold;
>
>I find this name slightly confusing.  Surely it's min_mops_size (since above that we want to use MOPS rather than inlined loads/stores).  But why not just use aarch64_mops_memcpy_size_threshold directly in the one place it's used?

The reason is that in a follow-on patch I check aarch64_mops_memcpy_size_threshold
too, so for now this acts as a shortcut for the ridiculously long name.

> Are there any additional tests for this?

There are existing tests that check the expansion, and they fail if you
completely block the expansion with STRICT_ALIGNMENT.

Cheers,
Wilco


* [PATCH v2] AArch64: Fix strict-align cpymem/setmem [PR103100]
  2023-09-20 15:52 ` Richard Earnshaw (lists)
  2023-09-20 17:08   ` Wilco Dijkstra
@ 2023-09-21 14:24   ` Wilco Dijkstra
  2023-10-16 12:48     ` Wilco Dijkstra
  2023-11-29 18:09     ` Richard Sandiford
  1 sibling, 2 replies; 8+ messages in thread
From: Wilco Dijkstra @ 2023-09-21 14:24 UTC (permalink / raw)
  To: Richard Earnshaw, GCC Patches; +Cc: Richard Sandiford

v2: Use UINTVAL, rename max_mops_size.

The cpymemdi/setmemdi implementation doesn't fully support strict alignment.
Block the expansion if the alignment is less than 16 with STRICT_ALIGNMENT.
Clean up the conditions for when to use MOPS.
    
Passes regress/bootstrap, OK for commit?
    
gcc/ChangeLog/
        PR target/103100
        * config/aarch64/aarch64.md (cpymemdi): Remove pattern condition.
        (setmemdi): Likewise.
        * config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support
        strict-align.  Cleanup condition for using MOPS.
        (aarch64_expand_setmem): Likewise.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index dd6874d13a75f20d10a244578afc355b25c73da2..8a12894d6b80de1031d6e7d02dca680c57bce136 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25261,27 +25261,23 @@ aarch64_expand_cpymem (rtx *operands)
   int mode_bits;
   rtx dst = operands[0];
   rtx src = operands[1];
+  unsigned align = UINTVAL (operands[3]);
   rtx base;
   machine_mode cur_mode = BLKmode;
+  bool size_p = optimize_function_for_size_p (cfun);
 
-  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
-  if (!CONST_INT_P (operands[2]))
+  /* Variable-sized or strict-align copies may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
     return aarch64_expand_cpymem_mops (operands);
 
-  unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
-
-  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
-  unsigned HOST_WIDE_INT max_copy_size
-    = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
+  unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
 
-  bool size_p = optimize_function_for_size_p (cfun);
+  /* Try to inline up to 256 bytes.  */
+  unsigned max_copy_size = 256;
+  unsigned mops_threshold = aarch64_mops_memcpy_size_threshold;
 
-  /* Large constant-sized cpymem should go through MOPS when possible.
-     It should be a win even for size optimization in the general case.
-     For speed optimization the choice between MOPS and the SIMD sequence
-     depends on the size of the copy, rather than number of instructions,
-     alignment etc.  */
-  if (size > max_copy_size)
+  /* Large copies use MOPS when available or a library call.  */
+  if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
     return aarch64_expand_cpymem_mops (operands);
 
   int copy_bits = 256;
@@ -25445,12 +25441,13 @@ aarch64_expand_setmem (rtx *operands)
   unsigned HOST_WIDE_INT len;
   rtx dst = operands[0];
   rtx val = operands[2], src;
+  unsigned align = UINTVAL (operands[3]);
   rtx base;
   machine_mode cur_mode = BLKmode, next_mode;
 
-  /* If we don't have SIMD registers or the size is variable use the MOPS
-     inlined sequence if possible.  */
-  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
+  /* Variable-sized or strict-align memset may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
+      || (STRICT_ALIGNMENT && align < 16))
     return aarch64_expand_setmem_mops (operands);
 
   bool size_p = optimize_function_for_size_p (cfun);
@@ -25458,10 +25455,13 @@ aarch64_expand_setmem (rtx *operands)
   /* Default the maximum to 256-bytes when considering only libcall vs
      SIMD broadcast sequence.  */
   unsigned max_set_size = 256;
+  unsigned mops_threshold = aarch64_mops_memset_size_threshold;
 
-  len = INTVAL (operands[1]);
-  if (len > max_set_size && !TARGET_MOPS)
-    return false;
+  len = UINTVAL (operands[1]);
+
+  /* Large memset uses MOPS when available or a library call.  */
+  if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
+    return aarch64_expand_setmem_mops (operands);
 
   int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
   /* The MOPS sequence takes:
@@ -25474,12 +25474,6 @@ aarch64_expand_setmem (rtx *operands)
      the arguments + 1 for the call.  */
   unsigned libcall_cost = 4;
 
-  /* Upper bound check.  For large constant-sized setmem use the MOPS sequence
-     when available.  */
-  if (TARGET_MOPS
-      && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
-    return aarch64_expand_setmem_mops (operands);
-
   /* Attempt a sequence with a vector broadcast followed by stores.
      Count the number of operations involved to see if it's worth it
      against the alternatives.  A simple counter simd_ops on the
@@ -25521,10 +25515,8 @@ aarch64_expand_setmem (rtx *operands)
       simd_ops++;
       n -= mode_bits;
 
-      /* Do certain trailing copies as overlapping if it's going to be
-	 cheaper.  i.e. less instructions to do so.  For instance doing a 15
-	 byte copy it's more efficient to do two overlapping 8 byte copies than
-	 8 + 4 + 2 + 1.  Only do this when -mstrict-align is not supplied.  */
+      /* Emit trailing writes using overlapping unaligned accesses
+	(when !STRICT_ALIGNMENT) - this is smaller and faster.  */
       if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
 	{
 	  next_mode = smallest_mode_for_size (n, MODE_INT);
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 6d0f072a9dd6d094e8764a513222a9129d8296fa..96508a2580876d1fdbdfa6c67d1a3d02608c1d24 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1627,7 +1627,7 @@ (define_expand "cpymemdi"
    (match_operand:BLK 1 "memory_operand")
    (match_operand:DI 2 "general_operand")
    (match_operand:DI 3 "immediate_operand")]
-   "!STRICT_ALIGNMENT || TARGET_MOPS"
+   ""
 {
   if (aarch64_expand_cpymem (operands))
     DONE;
@@ -1724,7 +1724,7 @@ (define_expand "setmemdi"
         (match_operand:QI  2 "nonmemory_operand")) ;; Value
    (use (match_operand:DI  1 "general_operand")) ;; Length
    (match_operand          3 "immediate_operand")] ;; Align
- "TARGET_SIMD || TARGET_MOPS"
+ ""
  {
   if (aarch64_expand_setmem (operands))
     DONE;



* Re: [PATCH v2] AArch64: Fix strict-align cpymem/setmem [PR103100]
  2023-09-21 14:24   ` [PATCH v2] " Wilco Dijkstra
@ 2023-10-16 12:48     ` Wilco Dijkstra
  2023-11-06 12:09       ` Wilco Dijkstra
  2023-11-29 18:09     ` Richard Sandiford
  1 sibling, 1 reply; 8+ messages in thread
From: Wilco Dijkstra @ 2023-10-16 12:48 UTC (permalink / raw)
  To: Richard Earnshaw, GCC Patches; +Cc: Richard Sandiford

ping
 
v2: Use UINTVAL, rename max_mops_size.

The cpymemdi/setmemdi implementation doesn't fully support strict alignment.
Block the expansion if the alignment is less than 16 with STRICT_ALIGNMENT.
Clean up the conditions for when to use MOPS.
    
Passes regress/bootstrap, OK for commit?
    
gcc/ChangeLog/
        PR target/103100
        * config/aarch64/aarch64.md (cpymemdi): Remove pattern condition.
        (setmemdi): Likewise.
        * config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support
        strict-align.  Cleanup condition for using MOPS.
        (aarch64_expand_setmem): Likewise.


* Re: [PATCH v2] AArch64: Fix strict-align cpymem/setmem [PR103100]
  2023-10-16 12:48     ` Wilco Dijkstra
@ 2023-11-06 12:09       ` Wilco Dijkstra
  0 siblings, 0 replies; 8+ messages in thread
From: Wilco Dijkstra @ 2023-11-06 12:09 UTC (permalink / raw)
  To: Richard Earnshaw, GCC Patches; +Cc: Richard Sandiford


ping
 
v2: Use UINTVAL, rename max_mops_size.

The cpymemdi/setmemdi implementation doesn't fully support strict alignment.
Block the expansion if the alignment is less than 16 with STRICT_ALIGNMENT.
Clean up the conditions for when to use MOPS.
    
Passes regress/bootstrap, OK for commit?
    
gcc/ChangeLog/
        PR target/103100
        * config/aarch64/aarch64.md (cpymemdi): Remove pattern condition.
        (setmemdi): Likewise.
        * config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support
        strict-align.  Cleanup condition for using MOPS.
        (aarch64_expand_setmem): Likewise.


* Re: [PATCH v2] AArch64: Fix strict-align cpymem/setmem [PR103100]
  2023-09-21 14:24   ` [PATCH v2] " Wilco Dijkstra
  2023-10-16 12:48     ` Wilco Dijkstra
@ 2023-11-29 18:09     ` Richard Sandiford
  2023-11-30 12:03       ` Richard Earnshaw
  1 sibling, 1 reply; 8+ messages in thread
From: Richard Sandiford @ 2023-11-29 18:09 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: Richard Earnshaw, GCC Patches

Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> v2: Use UINTVAL, rename max_mops_size.
>
> The cpymemdi/setmemdi implementation doesn't fully support strict alignment.
> Block the expansion if the alignment is less than 16 with STRICT_ALIGNMENT.
> Clean up the condition when to use MOPS.
>
> Passes regress/bootstrap, OK for commit?
>
> gcc/ChangeLog/
>         PR target/103100
>         * config/aarch64/aarch64.md (cpymemdi): Remove pattern condition.
>         (setmemdi): Likewise.
>         * config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support
>         strict-align.  Cleanup condition for using MOPS.
>         (aarch64_expand_setmem): Likewise.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index dd6874d13a75f20d10a244578afc355b25c73da2..8a12894d6b80de1031d6e7d02dca680c57bce136 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -25261,27 +25261,23 @@ aarch64_expand_cpymem (rtx *operands)
>    int mode_bits;
>    rtx dst = operands[0];
>    rtx src = operands[1];
> +  unsigned align = UINTVAL (operands[3]);
>    rtx base;
>    machine_mode cur_mode = BLKmode;
> +  bool size_p = optimize_function_for_size_p (cfun);
>
> -  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
> -  if (!CONST_INT_P (operands[2]))
> +  /* Variable-sized or strict-align copies may use the MOPS expansion.  */
> +  if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
>      return aarch64_expand_cpymem_mops (operands);
>
> -  unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
> -
> -  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
> -  unsigned HOST_WIDE_INT max_copy_size
> -    = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
> +  unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
>
> -  bool size_p = optimize_function_for_size_p (cfun);
> +  /* Try to inline up to 256 bytes.  */
> +  unsigned max_copy_size = 256;
> +  unsigned mops_threshold = aarch64_mops_memcpy_size_threshold;
>
> -  /* Large constant-sized cpymem should go through MOPS when possible.
> -     It should be a win even for size optimization in the general case.
> -     For speed optimization the choice between MOPS and the SIMD sequence
> -     depends on the size of the copy, rather than number of instructions,
> -     alignment etc.  */
> -  if (size > max_copy_size)
> +  /* Large copies use MOPS when available or a library call.  */
> +  if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
>      return aarch64_expand_cpymem_mops (operands);

It feels a little unintuitive to be calling aarch64_expand_cpymem_mops
for !TARGET_MOPS, but that's pre-existing, and I can see there are
arguments both ways.

Although !TARGET_SIMD is a niche interest on current trunk, it becomes
important for streaming-compatible mode.  So we might want to look
again at the different handling of !TARGET_SIMD in this function (where
we lower the copy size but not the threshold) and aarch64_expand_setmem
(where we bail out early).  That's not something for this patch though,
just mentioning it.

The patch is OK with me, but please give Richard E a day to object.

Thanks,
Richard


* Re: [PATCH v2] AArch64: Fix strict-align cpymem/setmem [PR103100]
  2023-11-29 18:09     ` Richard Sandiford
@ 2023-11-30 12:03       ` Richard Earnshaw
  0 siblings, 0 replies; 8+ messages in thread
From: Richard Earnshaw @ 2023-11-30 12:03 UTC (permalink / raw)
  To: Wilco Dijkstra, Richard Earnshaw, GCC Patches, richard.sandiford



On 29/11/2023 18:09, Richard Sandiford wrote:
> Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
>> v2: Use UINTVAL, rename max_mops_size.
>>
>> The cpymemdi/setmemdi implementation doesn't fully support strict alignment.
>> Block the expansion if the alignment is less than 16 with STRICT_ALIGNMENT.
>> Clean up the condition when to use MOPS.
>>
>> Passes regress/bootstrap, OK for commit?
>>
>> gcc/ChangeLog/
>>          PR target/103100
>>          * config/aarch64/aarch64.md (cpymemdi): Remove pattern condition.
>>          (setmemdi): Likewise.
>>          * config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support
>>          strict-align.  Cleanup condition for using MOPS.
>>          (aarch64_expand_setmem): Likewise.
>>
>> ---
>>
>> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
>> index dd6874d13a75f20d10a244578afc355b25c73da2..8a12894d6b80de1031d6e7d02dca680c57bce136 100644
>> --- a/gcc/config/aarch64/aarch64.cc
>> +++ b/gcc/config/aarch64/aarch64.cc
>> @@ -25261,27 +25261,23 @@ aarch64_expand_cpymem (rtx *operands)
>>     int mode_bits;
>>     rtx dst = operands[0];
>>     rtx src = operands[1];
>> +  unsigned align = UINTVAL (operands[3]);
>>     rtx base;
>>     machine_mode cur_mode = BLKmode;
>> +  bool size_p = optimize_function_for_size_p (cfun);
>>
>> -  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
>> -  if (!CONST_INT_P (operands[2]))
>> +  /* Variable-sized or strict-align copies may use the MOPS expansion.  */
>> +  if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
>>       return aarch64_expand_cpymem_mops (operands);
>>
>> -  unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
>> -
>> -  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
>> -  unsigned HOST_WIDE_INT max_copy_size
>> -    = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
>> +  unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
>>
>> -  bool size_p = optimize_function_for_size_p (cfun);
>> +  /* Try to inline up to 256 bytes.  */
>> +  unsigned max_copy_size = 256;
>> +  unsigned mops_threshold = aarch64_mops_memcpy_size_threshold;
>>
>> -  /* Large constant-sized cpymem should go through MOPS when possible.
>> -     It should be a win even for size optimization in the general case.
>> -     For speed optimization the choice between MOPS and the SIMD sequence
>> -     depends on the size of the copy, rather than number of instructions,
>> -     alignment etc.  */
>> -  if (size > max_copy_size)
>> +  /* Large copies use MOPS when available or a library call.  */
>> +  if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
>>       return aarch64_expand_cpymem_mops (operands);
> 
> It feels a little unintuitive to be calling aarch64_expand_cpymem_mops
> for !TARGET_MOPS, but that's pre-existing, and I can see there are
> arguments both ways.
> 
> Although !TARGET_SIMD is a niche interest on current trunk, it becomes
> important for streaming-compatible mode.  So we might want to look
> again at the different handling of !TARGET_SIMD in this function (where
> we lower the copy size but not the threshold) and aarch64_expand_setmem
> (where we bail out early).  That's not something for this patch though,
> just mentioning it.
> 
> The patch is OK with me, but please give Richard E a day to object.

This is fine by me.

R.

> 
> Thanks,
> Richard
> 
