public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [AArch64, PATCH] Improve Neon store of zero
@ 2017-08-10 13:38 Jackson Woodruff
  2017-08-11 15:16 ` Richard Earnshaw (lists)
  0 siblings, 1 reply; 10+ messages in thread
From: Jackson Woodruff @ 2017-08-10 13:38 UTC (permalink / raw)
  To: gcc-patches; +Cc: James Greenhalgh, richard.earnshaw

[-- Attachment #1: Type: text/plain, Size: 715 bytes --]

Hi all,

This patch changes patterns in aarch64-simd.md to replace

     movi    v0.4s, 0
     str    q0, [x0, 16]

With:

     stp xzr, xzr, [x0, 16]

When we are storing zeros to vectors like this:

     void f(uint32x4_t *p) {
       uint32x4_t x = { 0, 0, 0, 0};
       p[1] = x;
     }

Bootstrapped and regtested on aarch64 with no regressions.
OK for trunk?

Jackson

gcc/

2017-08-09  Jackson Woodruff  <jackson.woodruff@arm.com>

	* config/aarch64/aarch64-simd.md (mov<mode>): No longer force zero
	immediate into register.
	(*aarch64_simd_mov<mode>): Add new case for stp
	using zero immediate.


gcc/testsuite/

2017-08-09  Jackson Woodruff  <jackson.woodruff@arm.com>

	* gcc.target/aarch64/simd/neon_str_zero.c: New.


[-- Attachment #2: patchfile --]
[-- Type: text/plain, Size: 4374 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 74de9b8c89dd5e4e3d87504594c969de0e0128ce..0149a742d34ae4fd5b3fd705b03c845f94aa1d59 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -23,7 +23,10 @@
 	(match_operand:VALL_F16 1 "general_operand" ""))]
   "TARGET_SIMD"
   "
-    if (GET_CODE (operands[0]) == MEM)
+    if (GET_CODE (operands[0]) == MEM
+	    && !(aarch64_simd_imm_zero (operands[1], <MODE>mode)
+		 && aarch64_legitimate_address_p (<MODE>mode, operands[0],
+						  PARALLEL, 1)))
       operands[1] = force_reg (<MODE>mode, operands[1]);
   "
 )
@@ -94,63 +97,70 @@
 
 (define_insn "*aarch64_simd_mov<mode>"
   [(set (match_operand:VD 0 "nonimmediate_operand"
-		"=w, m,  w, ?r, ?w, ?r, w")
+		"=w, m,  m,  w, ?r, ?w, ?r, w")
 	(match_operand:VD 1 "general_operand"
-		"m,  w,  w,  w,  r,  r, Dn"))]
+		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
-   && (register_operand (operands[0], <MODE>mode)
-       || register_operand (operands[1], <MODE>mode))"
+   && ((register_operand (operands[0], <MODE>mode)
+       || register_operand (operands[1], <MODE>mode))
+      || (memory_operand (operands[0], <MODE>mode)
+	  && immediate_operand (operands[1], <MODE>mode)))"
 {
    switch (which_alternative)
      {
      case 0: return "ldr\\t%d0, %1";
-     case 1: return "str\\t%d1, %0";
-     case 2: return "mov\t%0.<Vbtype>, %1.<Vbtype>";
-     case 3: return "umov\t%0, %1.d[0]";
-     case 4: return "fmov\t%d0, %1";
-     case 5: return "mov\t%0, %1";
-     case 6:
+     case 1: return "str\\txzr, %0";
+     case 2: return "str\\t%d1, %0";
+     case 3: return "mov\t%0.<Vbtype>, %1.<Vbtype>";
+     case 4: return "umov\t%0, %1.d[0]";
+     case 5: return "fmov\t%d0, %1";
+     case 6: return "mov\t%0, %1";
+     case 7:
 	return aarch64_output_simd_mov_immediate (operands[1],
 						  <MODE>mode, 64);
      default: gcc_unreachable ();
      }
 }
-  [(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
+  [(set_attr "type" "neon_load1_1reg<q>, neon_stp, neon_store1_1reg<q>,\
 		     neon_logic<q>, neon_to_gp<q>, f_mcr,\
 		     mov_reg, neon_move<q>")]
 )
 
 (define_insn "*aarch64_simd_mov<mode>"
   [(set (match_operand:VQ 0 "nonimmediate_operand"
-		"=w, m,  w, ?r, ?w, ?r, w")
+		"=w, Ump,  m,  w, ?r, ?w, ?r, w")
 	(match_operand:VQ 1 "general_operand"
-		"m,  w,  w,  w,  r,  r, Dn"))]
+		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
-   && (register_operand (operands[0], <MODE>mode)
-       || register_operand (operands[1], <MODE>mode))"
+   && ((register_operand (operands[0], <MODE>mode)
+	|| register_operand (operands[1], <MODE>mode))
+       || (memory_operand (operands[0], <MODE>mode)
+	   && immediate_operand (operands[1], <MODE>mode)))"
 {
   switch (which_alternative)
     {
     case 0:
 	return "ldr\\t%q0, %1";
     case 1:
-	return "str\\t%q1, %0";
+	return "stp\\txzr, xzr, %0";
     case 2:
-	return "mov\t%0.<Vbtype>, %1.<Vbtype>";
+	return "str\\t%q1, %0";
     case 3:
+	return "mov\t%0.<Vbtype>, %1.<Vbtype>";
     case 4:
     case 5:
-	return "#";
     case 6:
+	return "#";
+    case 7:
 	return aarch64_output_simd_mov_immediate (operands[1], <MODE>mode, 128);
     default:
 	gcc_unreachable ();
     }
 }
   [(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
-                     neon_logic<q>, multiple, multiple, multiple,\
-                     neon_move<q>")
-   (set_attr "length" "4,4,4,8,8,8,4")]
+		     neon_stp, neon_logic<q>, multiple, multiple,\
+		     multiple, neon_move<q>")
+   (set_attr "length" "4,4,4,4,8,8,8,4")]
 )
 
 ;; When storing lane zero we can use the normal STR and its more permissive
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c b/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
new file mode 100644
index 0000000000000000000000000000000000000000..07198de109432b530745cc540790303ae0245efb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O1" } */
+
+#include <arm_neon.h>
+
+void
+f (uint32x4_t *p)
+{
+  uint32x4_t x = { 0, 0, 0, 0};
+  p[1] = x;
+
+  /* { dg-final { scan-assembler "stp\txzr, xzr," } } */
+}
+
+void
+g (float32x2_t *p)
+{
+  float32x2_t x = {0.0, 0.0};
+  p[0] = x;
+
+  /* { dg-final { scan-assembler "str\txzr, " } } */
+}

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [AArch64, PATCH] Improve Neon store of zero
  2017-08-10 13:38 [AArch64, PATCH] Improve Neon store of zero Jackson Woodruff
@ 2017-08-11 15:16 ` Richard Earnshaw (lists)
  2017-08-16 16:01   ` Jackson Woodruff
  0 siblings, 1 reply; 10+ messages in thread
From: Richard Earnshaw (lists) @ 2017-08-11 15:16 UTC (permalink / raw)
  To: Jackson Woodruff, gcc-patches; +Cc: James Greenhalgh

On 10/08/17 14:12, Jackson Woodruff wrote:
> Hi all,
> 
> This patch changes patterns in aarch64-simd.md to replace
> 
>     movi    v0.4s, 0
>     str    q0, [x0, 16]
> 
> With:
> 
>     stp xzr, xzr, [x0, 16]
> 
> When we are storing zeros to vectors like this:
> 
>     void f(uint32x4_t *p) {
>       uint32x4_t x = { 0, 0, 0, 0};
>       p[1] = x;
>     }
> 
> Bootstrapped and regtested on aarch64 with no regressions.
> OK for trunk?
> 
> Jackson
> 
> gcc/
> 
> 2017-08-09  Jackson Woodruff  <jackson.woodruff@arm.com>
> 
>     * aarch64-simd.md (mov<mode>): No longer force zero
>     immediate into register.
>     (*aarch64_simd_mov<mode>): Add new case for stp
>     using zero immediate.
> 
> 
> gcc/testsuite
> 
> 2017-08-09  Jackson Woodruff  <jackson.woodruff@arm.com>
> 
>     * gcc.target/aarch64/simd/neon_str_zero.c: New.
> 
> 
> patchfile
> 
> 
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 74de9b8c89dd5e4e3d87504594c969de0e0128ce..0149a742d34ae4fd5b3fd705b03c845f94aa1d59 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -23,7 +23,10 @@
>  	(match_operand:VALL_F16 1 "general_operand" ""))]
>    "TARGET_SIMD"
>    "
> -    if (GET_CODE (operands[0]) == MEM)
> +    if (GET_CODE (operands[0]) == MEM
> +	    && !(aarch64_simd_imm_zero (operands[1], <MODE>mode)
> +		 && aarch64_legitimate_address_p (<MODE>mode, operands[0],
> +						  PARALLEL, 1)))
>        operands[1] = force_reg (<MODE>mode, operands[1]);
>    "
>  )
> @@ -94,63 +97,70 @@
>  
>  (define_insn "*aarch64_simd_mov<mode>"
>    [(set (match_operand:VD 0 "nonimmediate_operand"
> -		"=w, m,  w, ?r, ?w, ?r, w")
> +		"=w, m,  m,  w, ?r, ?w, ?r, w")
>  	(match_operand:VD 1 "general_operand"
> -		"m,  w,  w,  w,  r,  r, Dn"))]
> +		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
>    "TARGET_SIMD
> -   && (register_operand (operands[0], <MODE>mode)
> -       || register_operand (operands[1], <MODE>mode))"
> +   && ((register_operand (operands[0], <MODE>mode)
> +       || register_operand (operands[1], <MODE>mode))
> +      || (memory_operand (operands[0], <MODE>mode)
> +	  && immediate_operand (operands[1], <MODE>mode)))"

Allowing any immediate here seems too lax - it allows any immediate
value which then could cause reload operations to be inserted (that in
turn might cause register pressure calculations to be incorrect).
Wouldn't it be better to use something like aarch64_simd_reg_or_zero?
Similarly below.

R.

>  {
>     switch (which_alternative)
>       {
>       case 0: return "ldr\\t%d0, %1";
> -     case 1: return "str\\t%d1, %0";
> -     case 2: return "mov\t%0.<Vbtype>, %1.<Vbtype>";
> -     case 3: return "umov\t%0, %1.d[0]";
> -     case 4: return "fmov\t%d0, %1";
> -     case 5: return "mov\t%0, %1";
> -     case 6:
> +     case 1: return "str\\txzr, %0";
> +     case 2: return "str\\t%d1, %0";
> +     case 3: return "mov\t%0.<Vbtype>, %1.<Vbtype>";
> +     case 4: return "umov\t%0, %1.d[0]";
> +     case 5: return "fmov\t%d0, %1";
> +     case 6: return "mov\t%0, %1";
> +     case 7:
>  	return aarch64_output_simd_mov_immediate (operands[1],
>  						  <MODE>mode, 64);
>       default: gcc_unreachable ();
>       }
>  }
> -  [(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
> +  [(set_attr "type" "neon_load1_1reg<q>, neon_stp, neon_store1_1reg<q>,\
>  		     neon_logic<q>, neon_to_gp<q>, f_mcr,\
>  		     mov_reg, neon_move<q>")]
>  )
>  
>  (define_insn "*aarch64_simd_mov<mode>"
>    [(set (match_operand:VQ 0 "nonimmediate_operand"
> -		"=w, m,  w, ?r, ?w, ?r, w")
> +		"=w, Ump,  m,  w, ?r, ?w, ?r, w")
>  	(match_operand:VQ 1 "general_operand"
> -		"m,  w,  w,  w,  r,  r, Dn"))]
> +		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
>    "TARGET_SIMD
> -   && (register_operand (operands[0], <MODE>mode)
> -       || register_operand (operands[1], <MODE>mode))"
> +   && ((register_operand (operands[0], <MODE>mode)
> +	|| register_operand (operands[1], <MODE>mode))
> +       || (memory_operand (operands[0], <MODE>mode)
> +	   && immediate_operand (operands[1], <MODE>mode)))"
>  {
>    switch (which_alternative)
>      {
>      case 0:
>  	return "ldr\\t%q0, %1";
>      case 1:
> -	return "str\\t%q1, %0";
> +	return "stp\\txzr, xzr, %0";
>      case 2:
> -	return "mov\t%0.<Vbtype>, %1.<Vbtype>";
> +	return "str\\t%q1, %0";
>      case 3:
> +	return "mov\t%0.<Vbtype>, %1.<Vbtype>";
>      case 4:
>      case 5:
> -	return "#";
>      case 6:
> +	return "#";
> +    case 7:
>  	return aarch64_output_simd_mov_immediate (operands[1], <MODE>mode, 128);
>      default:
>  	gcc_unreachable ();
>      }
>  }
>    [(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
> -                     neon_logic<q>, multiple, multiple, multiple,\
> -                     neon_move<q>")
> -   (set_attr "length" "4,4,4,8,8,8,4")]
> +		     neon_stp, neon_logic<q>, multiple, multiple,\
> +		     multiple, neon_move<q>")
> +   (set_attr "length" "4,4,4,4,8,8,8,4")]
>  )
>  
>  ;; When storing lane zero we can use the normal STR and its more permissive
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c b/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..07198de109432b530745cc540790303ae0245efb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O1" } */
> +
> +#include <arm_neon.h>
> +
> +void
> +f (uint32x4_t *p)
> +{
> +  uint32x4_t x = { 0, 0, 0, 0};
> +  p[1] = x;
> +
> +  /* { dg-final { scan-assembler "stp\txzr, xzr," } } */
> +}
> +
> +void
> +g (float32x2_t *p)
> +{
> +  float32x2_t x = {0.0, 0.0};
> +  p[0] = x;
> +
> +  /* { dg-final { scan-assembler "str\txzr, " } } */
> +}
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [AArch64, PATCH] Improve Neon store of zero
  2017-08-11 15:16 ` Richard Earnshaw (lists)
@ 2017-08-16 16:01   ` Jackson Woodruff
  2017-08-17 13:56     ` Richard Earnshaw (lists)
  2017-08-23 14:46     ` Richard Sandiford
  0 siblings, 2 replies; 10+ messages in thread
From: Jackson Woodruff @ 2017-08-16 16:01 UTC (permalink / raw)
  To: Richard Earnshaw (lists), gcc-patches; +Cc: James Greenhalgh

[-- Attachment #1: Type: text/plain, Size: 6194 bytes --]

Hi Richard,

I have changed the condition as you suggest below. OK for trunk?

Jackson.

On 08/11/2017 02:56 PM, Richard Earnshaw (lists) wrote:

> On 10/08/17 14:12, Jackson Woodruff wrote:
>> Hi all,
>>
>> This patch changes patterns in aarch64-simd.md to replace
>>
>>      movi    v0.4s, 0
>>      str    q0, [x0, 16]
>>
>> With:
>>
>>      stp xzr, xzr, [x0, 16]
>>
>> When we are storing zeros to vectors like this:
>>
>>      void f(uint32x4_t *p) {
>>        uint32x4_t x = { 0, 0, 0, 0};
>>        p[1] = x;
>>      }
>>
>> Bootstrapped and regtested on aarch64 with no regressions.
>> OK for trunk?
>>
>> Jackson
>>
>> gcc/
>>
>> 2017-08-09  Jackson Woodruff  <jackson.woodruff@arm.com>
>>
>>      * aarch64-simd.md (mov<mode>): No longer force zero
>>      immediate into register.
>>      (*aarch64_simd_mov<mode>): Add new case for stp
>>      using zero immediate.
>>
>>
>> gcc/testsuite
>>
>> 2017-08-09  Jackson Woodruff  <jackson.woodruff@arm.com>
>>
>>      * gcc.target/aarch64/simd/neon_str_zero.c: New.
>>
>>
>> patchfile
>>
>>
>> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
>> index 74de9b8c89dd5e4e3d87504594c969de0e0128ce..0149a742d34ae4fd5b3fd705b03c845f94aa1d59 100644
>> --- a/gcc/config/aarch64/aarch64-simd.md
>> +++ b/gcc/config/aarch64/aarch64-simd.md
>> @@ -23,7 +23,10 @@
>>   	(match_operand:VALL_F16 1 "general_operand" ""))]
>>     "TARGET_SIMD"
>>     "
>> -    if (GET_CODE (operands[0]) == MEM)
>> +    if (GET_CODE (operands[0]) == MEM
>> +	    && !(aarch64_simd_imm_zero (operands[1], <MODE>mode)
>> +		 && aarch64_legitimate_address_p (<MODE>mode, operands[0],
>> +						  PARALLEL, 1)))
>>         operands[1] = force_reg (<MODE>mode, operands[1]);
>>     "
>>   )
>> @@ -94,63 +97,70 @@
>>   
>>   (define_insn "*aarch64_simd_mov<mode>"
>>     [(set (match_operand:VD 0 "nonimmediate_operand"
>> -		"=w, m,  w, ?r, ?w, ?r, w")
>> +		"=w, m,  m,  w, ?r, ?w, ?r, w")
>>   	(match_operand:VD 1 "general_operand"
>> -		"m,  w,  w,  w,  r,  r, Dn"))]
>> +		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
>>     "TARGET_SIMD
>> -   && (register_operand (operands[0], <MODE>mode)
>> -       || register_operand (operands[1], <MODE>mode))"
>> +   && ((register_operand (operands[0], <MODE>mode)
>> +       || register_operand (operands[1], <MODE>mode))
>> +      || (memory_operand (operands[0], <MODE>mode)
>> +	  && immediate_operand (operands[1], <MODE>mode)))"
> Allowing any immediate here seems too lax - it allows any immediate
> value which then could cause reload operations to be inserted (that in
> turn might cause register pressure calculations to be incorrect).
> Wouldn't it be better to use something like aarch64_simd_reg_or_zero?
> Similarly below.
>
> R.
>
>>   {
>>      switch (which_alternative)
>>        {
>>        case 0: return "ldr\\t%d0, %1";
>> -     case 1: return "str\\t%d1, %0";
>> -     case 2: return "mov\t%0.<Vbtype>, %1.<Vbtype>";
>> -     case 3: return "umov\t%0, %1.d[0]";
>> -     case 4: return "fmov\t%d0, %1";
>> -     case 5: return "mov\t%0, %1";
>> -     case 6:
>> +     case 1: return "str\\txzr, %0";
>> +     case 2: return "str\\t%d1, %0";
>> +     case 3: return "mov\t%0.<Vbtype>, %1.<Vbtype>";
>> +     case 4: return "umov\t%0, %1.d[0]";
>> +     case 5: return "fmov\t%d0, %1";
>> +     case 6: return "mov\t%0, %1";
>> +     case 7:
>>   	return aarch64_output_simd_mov_immediate (operands[1],
>>   						  <MODE>mode, 64);
>>        default: gcc_unreachable ();
>>        }
>>   }
>> -  [(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
>> +  [(set_attr "type" "neon_load1_1reg<q>, neon_stp, neon_store1_1reg<q>,\
>>   		     neon_logic<q>, neon_to_gp<q>, f_mcr,\
>>   		     mov_reg, neon_move<q>")]
>>   )
>>   
>>   (define_insn "*aarch64_simd_mov<mode>"
>>     [(set (match_operand:VQ 0 "nonimmediate_operand"
>> -		"=w, m,  w, ?r, ?w, ?r, w")
>> +		"=w, Ump,  m,  w, ?r, ?w, ?r, w")
>>   	(match_operand:VQ 1 "general_operand"
>> -		"m,  w,  w,  w,  r,  r, Dn"))]
>> +		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
>>     "TARGET_SIMD
>> -   && (register_operand (operands[0], <MODE>mode)
>> -       || register_operand (operands[1], <MODE>mode))"
>> +   && ((register_operand (operands[0], <MODE>mode)
>> +	|| register_operand (operands[1], <MODE>mode))
>> +       || (memory_operand (operands[0], <MODE>mode)
>> +	   && immediate_operand (operands[1], <MODE>mode)))"
>>   {
>>     switch (which_alternative)
>>       {
>>       case 0:
>>   	return "ldr\\t%q0, %1";
>>       case 1:
>> -	return "str\\t%q1, %0";
>> +	return "stp\\txzr, xzr, %0";
>>       case 2:
>> -	return "mov\t%0.<Vbtype>, %1.<Vbtype>";
>> +	return "str\\t%q1, %0";
>>       case 3:
>> +	return "mov\t%0.<Vbtype>, %1.<Vbtype>";
>>       case 4:
>>       case 5:
>> -	return "#";
>>       case 6:
>> +	return "#";
>> +    case 7:
>>   	return aarch64_output_simd_mov_immediate (operands[1], <MODE>mode, 128);
>>       default:
>>   	gcc_unreachable ();
>>       }
>>   }
>>     [(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
>> -                     neon_logic<q>, multiple, multiple, multiple,\
>> -                     neon_move<q>")
>> -   (set_attr "length" "4,4,4,8,8,8,4")]
>> +		     neon_stp, neon_logic<q>, multiple, multiple,\
>> +		     multiple, neon_move<q>")
>> +   (set_attr "length" "4,4,4,4,8,8,8,4")]
>>   )
>>   
>>   ;; When storing lane zero we can use the normal STR and its more permissive
>> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c b/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..07198de109432b530745cc540790303ae0245efb
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
>> @@ -0,0 +1,22 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O1" } */
>> +
>> +#include <arm_neon.h>
>> +
>> +void
>> +f (uint32x4_t *p)
>> +{
>> +  uint32x4_t x = { 0, 0, 0, 0};
>> +  p[1] = x;
>> +
>> +  /* { dg-final { scan-assembler "stp\txzr, xzr," } } */
>> +}
>> +
>> +void
>> +g (float32x2_t *p)
>> +{
>> +  float32x2_t x = {0.0, 0.0};
>> +  p[0] = x;
>> +
>> +  /* { dg-final { scan-assembler "str\txzr, " } } */
>> +}
>>


[-- Attachment #2: patchfile --]
[-- Type: text/plain, Size: 4140 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 74de9b8c89dd5e4e3d87504594c969de0e0128ce..ce1b981fc005edf48a401a456def2a37cf9d9022 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -23,7 +23,10 @@
 	(match_operand:VALL_F16 1 "general_operand" ""))]
   "TARGET_SIMD"
   "
-    if (GET_CODE (operands[0]) == MEM)
+    if (GET_CODE (operands[0]) == MEM
+	&& !(aarch64_simd_imm_zero (operands[1], <MODE>mode)
+	     && aarch64_legitimate_address_p (<MODE>mode, operands[0],
+					      PARALLEL, 1)))
       operands[1] = force_reg (<MODE>mode, operands[1]);
   "
 )
@@ -94,63 +97,66 @@
 
 (define_insn "*aarch64_simd_mov<mode>"
   [(set (match_operand:VD 0 "nonimmediate_operand"
-		"=w, m,  w, ?r, ?w, ?r, w")
+		"=w, m,  m,  w, ?r, ?w, ?r, w")
 	(match_operand:VD 1 "general_operand"
-		"m,  w,  w,  w,  r,  r, Dn"))]
+		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
    && (register_operand (operands[0], <MODE>mode)
-       || register_operand (operands[1], <MODE>mode))"
+       || aarch64_simd_reg_or_zero (operands[1], <MODE>mode))"
 {
    switch (which_alternative)
      {
-     case 0: return "ldr\\t%d0, %1";
-     case 1: return "str\\t%d1, %0";
-     case 2: return "mov\t%0.<Vbtype>, %1.<Vbtype>";
-     case 3: return "umov\t%0, %1.d[0]";
-     case 4: return "fmov\t%d0, %1";
-     case 5: return "mov\t%0, %1";
-     case 6:
+     case 0: return "ldr\t%d0, %1";
+     case 1: return "str\txzr, %0";
+     case 2: return "str\t%d1, %0";
+     case 3: return "mov\t%0.<Vbtype>, %1.<Vbtype>";
+     case 4: return "umov\t%0, %1.d[0]";
+     case 5: return "fmov\t%d0, %1";
+     case 6: return "mov\t%0, %1";
+     case 7:
 	return aarch64_output_simd_mov_immediate (operands[1],
 						  <MODE>mode, 64);
      default: gcc_unreachable ();
      }
 }
-  [(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
+  [(set_attr "type" "neon_load1_1reg<q>, neon_stp, neon_store1_1reg<q>,\
 		     neon_logic<q>, neon_to_gp<q>, f_mcr,\
 		     mov_reg, neon_move<q>")]
 )
 
 (define_insn "*aarch64_simd_mov<mode>"
   [(set (match_operand:VQ 0 "nonimmediate_operand"
-		"=w, m,  w, ?r, ?w, ?r, w")
+		"=w, Ump,  m,  w, ?r, ?w, ?r, w")
 	(match_operand:VQ 1 "general_operand"
-		"m,  w,  w,  w,  r,  r, Dn"))]
+		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
    && (register_operand (operands[0], <MODE>mode)
-       || register_operand (operands[1], <MODE>mode))"
+       || aarch64_simd_reg_or_zero (operands[1], <MODE>mode))"
 {
   switch (which_alternative)
     {
     case 0:
-	return "ldr\\t%q0, %1";
+	return "ldr\t%q0, %1";
     case 1:
-	return "str\\t%q1, %0";
+	return "stp\txzr, xzr, %0";
     case 2:
-	return "mov\t%0.<Vbtype>, %1.<Vbtype>";
+	return "str\t%q1, %0";
     case 3:
+	return "mov\t%0.<Vbtype>, %1.<Vbtype>";
     case 4:
     case 5:
-	return "#";
     case 6:
+	return "#";
+    case 7:
 	return aarch64_output_simd_mov_immediate (operands[1], <MODE>mode, 128);
     default:
 	gcc_unreachable ();
     }
 }
   [(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
-                     neon_logic<q>, multiple, multiple, multiple,\
-                     neon_move<q>")
-   (set_attr "length" "4,4,4,8,8,8,4")]
+		     neon_stp, neon_logic<q>, multiple, multiple,\
+		     multiple, neon_move<q>")
+   (set_attr "length" "4,4,4,4,8,8,8,4")]
 )
 
 ;; When storing lane zero we can use the normal STR and its more permissive
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c b/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
new file mode 100644
index 0000000000000000000000000000000000000000..07198de109432b530745cc540790303ae0245efb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O1" } */
+
+#include <arm_neon.h>
+
+void
+f (uint32x4_t *p)
+{
+  uint32x4_t x = { 0, 0, 0, 0};
+  p[1] = x;
+
+  /* { dg-final { scan-assembler "stp\txzr, xzr," } } */
+}
+
+void
+g (float32x2_t *p)
+{
+  float32x2_t x = {0.0, 0.0};
+  p[0] = x;
+
+  /* { dg-final { scan-assembler "str\txzr, " } } */
+}

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [AArch64, PATCH] Improve Neon store of zero
  2017-08-16 16:01   ` Jackson Woodruff
@ 2017-08-17 13:56     ` Richard Earnshaw (lists)
  2017-08-23 14:46     ` Richard Sandiford
  1 sibling, 0 replies; 10+ messages in thread
From: Richard Earnshaw (lists) @ 2017-08-17 13:56 UTC (permalink / raw)
  To: Jackson Woodruff, gcc-patches; +Cc: James Greenhalgh

On 16/08/17 16:19, Jackson Woodruff wrote:
> Hi Richard,
> 
> I have changed the condition as you suggest below. OK for trunk?
> 
> Jackson.
> 

I renamed the testcase to vect_str_zero.c, as that seems to more closely
match the naming style, and checked this in.

Thanks for the patch.

R.

> On 08/11/2017 02:56 PM, Richard Earnshaw (lists) wrote:
> 
>> On 10/08/17 14:12, Jackson Woodruff wrote:
>>> Hi all,
>>>
>>> This patch changes patterns in aarch64-simd.md to replace
>>>
>>>      movi    v0.4s, 0
>>>      str    q0, [x0, 16]
>>>
>>> With:
>>>
>>>      stp xzr, xzr, [x0, 16]
>>>
>>> When we are storing zeros to vectors like this:
>>>
>>>      void f(uint32x4_t *p) {
>>>        uint32x4_t x = { 0, 0, 0, 0};
>>>        p[1] = x;
>>>      }
>>>
>>> Bootstrapped and regtested on aarch64 with no regressions.
>>> OK for trunk?
>>>
>>> Jackson
>>>
>>> gcc/
>>>
>>> 2017-08-09  Jackson Woodruff  <jackson.woodruff@arm.com>
>>>
>>>      * aarch64-simd.md (mov<mode>): No longer force zero
>>>      immediate into register.
>>>      (*aarch64_simd_mov<mode>): Add new case for stp
>>>      using zero immediate.
>>>
>>>
>>> gcc/testsuite
>>>
>>> 2017-08-09  Jackson Woodruff  <jackson.woodruff@arm.com>
>>>
>>>      * gcc.target/aarch64/simd/neon_str_zero.c: New.
>>>
>>>
>>> patchfile
>>>
>>>
>>> diff --git a/gcc/config/aarch64/aarch64-simd.md
>>> b/gcc/config/aarch64/aarch64-simd.md
>>> index
>>> 74de9b8c89dd5e4e3d87504594c969de0e0128ce..0149a742d34ae4fd5b3fd705b03c845f94aa1d59
>>> 100644
>>> --- a/gcc/config/aarch64/aarch64-simd.md
>>> +++ b/gcc/config/aarch64/aarch64-simd.md
>>> @@ -23,7 +23,10 @@
>>>       (match_operand:VALL_F16 1 "general_operand" ""))]
>>>     "TARGET_SIMD"
>>>     "
>>> -    if (GET_CODE (operands[0]) == MEM)
>>> +    if (GET_CODE (operands[0]) == MEM
>>> +        && !(aarch64_simd_imm_zero (operands[1], <MODE>mode)
>>> +         && aarch64_legitimate_address_p (<MODE>mode, operands[0],
>>> +                          PARALLEL, 1)))
>>>         operands[1] = force_reg (<MODE>mode, operands[1]);
>>>     "
>>>   )
>>> @@ -94,63 +97,70 @@
>>>     (define_insn "*aarch64_simd_mov<mode>"
>>>     [(set (match_operand:VD 0 "nonimmediate_operand"
>>> -        "=w, m,  w, ?r, ?w, ?r, w")
>>> +        "=w, m,  m,  w, ?r, ?w, ?r, w")
>>>       (match_operand:VD 1 "general_operand"
>>> -        "m,  w,  w,  w,  r,  r, Dn"))]
>>> +        "m,  Dz, w,  w,  w,  r,  r, Dn"))]
>>>     "TARGET_SIMD
>>> -   && (register_operand (operands[0], <MODE>mode)
>>> -       || register_operand (operands[1], <MODE>mode))"
>>> +   && ((register_operand (operands[0], <MODE>mode)
>>> +       || register_operand (operands[1], <MODE>mode))
>>> +      || (memory_operand (operands[0], <MODE>mode)
>>> +      && immediate_operand (operands[1], <MODE>mode)))"
>> Allowing any immediate here seems too lax - it allows any immediate
>> value which then could cause reload operations to be inserted (that in
>> turn might cause register pressure calculations to be incorrect).
>> Wouldn't it be better to use something like aarch64_simd_reg_or_zero?
>> Similarly below.
>>
>> R.
>>
>>>   {
>>>      switch (which_alternative)
>>>        {
>>>        case 0: return "ldr\\t%d0, %1";
>>> -     case 1: return "str\\t%d1, %0";
>>> -     case 2: return "mov\t%0.<Vbtype>, %1.<Vbtype>";
>>> -     case 3: return "umov\t%0, %1.d[0]";
>>> -     case 4: return "fmov\t%d0, %1";
>>> -     case 5: return "mov\t%0, %1";
>>> -     case 6:
>>> +     case 1: return "str\\txzr, %0";
>>> +     case 2: return "str\\t%d1, %0";
>>> +     case 3: return "mov\t%0.<Vbtype>, %1.<Vbtype>";
>>> +     case 4: return "umov\t%0, %1.d[0]";
>>> +     case 5: return "fmov\t%d0, %1";
>>> +     case 6: return "mov\t%0, %1";
>>> +     case 7:
>>>       return aarch64_output_simd_mov_immediate (operands[1],
>>>                             <MODE>mode, 64);
>>>        default: gcc_unreachable ();
>>>        }
>>>   }
>>> -  [(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
>>> +  [(set_attr "type" "neon_load1_1reg<q>, neon_stp,
>>> neon_store1_1reg<q>,\
>>>                neon_logic<q>, neon_to_gp<q>, f_mcr,\
>>>                mov_reg, neon_move<q>")]
>>>   )
>>>     (define_insn "*aarch64_simd_mov<mode>"
>>>     [(set (match_operand:VQ 0 "nonimmediate_operand"
>>> -        "=w, m,  w, ?r, ?w, ?r, w")
>>> +        "=w, Ump,  m,  w, ?r, ?w, ?r, w")
>>>       (match_operand:VQ 1 "general_operand"
>>> -        "m,  w,  w,  w,  r,  r, Dn"))]
>>> +        "m,  Dz, w,  w,  w,  r,  r, Dn"))]
>>>     "TARGET_SIMD
>>> -   && (register_operand (operands[0], <MODE>mode)
>>> -       || register_operand (operands[1], <MODE>mode))"
>>> +   && ((register_operand (operands[0], <MODE>mode)
>>> +    || register_operand (operands[1], <MODE>mode))
>>> +       || (memory_operand (operands[0], <MODE>mode)
>>> +       && immediate_operand (operands[1], <MODE>mode)))"
>>>   {
>>>     switch (which_alternative)
>>>       {
>>>       case 0:
>>>       return "ldr\\t%q0, %1";
>>>       case 1:
>>> -    return "str\\t%q1, %0";
>>> +    return "stp\\txzr, xzr, %0";
>>>       case 2:
>>> -    return "mov\t%0.<Vbtype>, %1.<Vbtype>";
>>> +    return "str\\t%q1, %0";
>>>       case 3:
>>> +    return "mov\t%0.<Vbtype>, %1.<Vbtype>";
>>>       case 4:
>>>       case 5:
>>> -    return "#";
>>>       case 6:
>>> +    return "#";
>>> +    case 7:
>>>       return aarch64_output_simd_mov_immediate (operands[1],
>>> <MODE>mode, 128);
>>>       default:
>>>       gcc_unreachable ();
>>>       }
>>>   }
>>>     [(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
>>> -                     neon_logic<q>, multiple, multiple, multiple,\
>>> -                     neon_move<q>")
>>> -   (set_attr "length" "4,4,4,8,8,8,4")]
>>> +             neon_stp, neon_logic<q>, multiple, multiple,\
>>> +             multiple, neon_move<q>")
>>> +   (set_attr "length" "4,4,4,4,8,8,8,4")]
>>>   )
>>>     ;; When storing lane zero we can use the normal STR and its more
>>> permissive
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
>>> b/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
>>> new file mode 100644
>>> index
>>> 0000000000000000000000000000000000000000..07198de109432b530745cc540790303ae0245efb
>>>
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
>>> @@ -0,0 +1,22 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-O1" } */
>>> +
>>> +#include <arm_neon.h>
>>> +
>>> +void
>>> +f (uint32x4_t *p)
>>> +{
>>> +  uint32x4_t x = { 0, 0, 0, 0};
>>> +  p[1] = x;
>>> +
>>> +  /* { dg-final { scan-assembler "stp\txzr, xzr," } } */
>>> +}
>>> +
>>> +void
>>> +g (float32x2_t *p)
>>> +{
>>> +  float32x2_t x = {0.0, 0.0};
>>> +  p[0] = x;
>>> +
>>> +  /* { dg-final { scan-assembler "str\txzr, " } } */
>>> +}
>>>
> 
> 
> patchfile
> 
> 
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 74de9b8c89dd5e4e3d87504594c969de0e0128ce..ce1b981fc005edf48a401a456def2a37cf9d9022 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -23,7 +23,10 @@
>  	(match_operand:VALL_F16 1 "general_operand" ""))]
>    "TARGET_SIMD"
>    "
> -    if (GET_CODE (operands[0]) == MEM)
> +    if (GET_CODE (operands[0]) == MEM
> +	&& !(aarch64_simd_imm_zero (operands[1], <MODE>mode)
> +	     && aarch64_legitimate_address_p (<MODE>mode, operands[0],
> +					      PARALLEL, 1)))
>        operands[1] = force_reg (<MODE>mode, operands[1]);
>    "
>  )
> @@ -94,63 +97,66 @@
>  
>  (define_insn "*aarch64_simd_mov<mode>"
>    [(set (match_operand:VD 0 "nonimmediate_operand"
> -		"=w, m,  w, ?r, ?w, ?r, w")
> +		"=w, m,  m,  w, ?r, ?w, ?r, w")
>  	(match_operand:VD 1 "general_operand"
> -		"m,  w,  w,  w,  r,  r, Dn"))]
> +		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
>    "TARGET_SIMD
>     && (register_operand (operands[0], <MODE>mode)
> -       || register_operand (operands[1], <MODE>mode))"
> +       || aarch64_simd_reg_or_zero (operands[1], <MODE>mode))"
>  {
>     switch (which_alternative)
>       {
> -     case 0: return "ldr\\t%d0, %1";
> -     case 1: return "str\\t%d1, %0";
> -     case 2: return "mov\t%0.<Vbtype>, %1.<Vbtype>";
> -     case 3: return "umov\t%0, %1.d[0]";
> -     case 4: return "fmov\t%d0, %1";
> -     case 5: return "mov\t%0, %1";
> -     case 6:
> +     case 0: return "ldr\t%d0, %1";
> +     case 1: return "str\txzr, %0";
> +     case 2: return "str\t%d1, %0";
> +     case 3: return "mov\t%0.<Vbtype>, %1.<Vbtype>";
> +     case 4: return "umov\t%0, %1.d[0]";
> +     case 5: return "fmov\t%d0, %1";
> +     case 6: return "mov\t%0, %1";
> +     case 7:
>  	return aarch64_output_simd_mov_immediate (operands[1],
>  						  <MODE>mode, 64);
>       default: gcc_unreachable ();
>       }
>  }
> -  [(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
> +  [(set_attr "type" "neon_load1_1reg<q>, neon_stp, neon_store1_1reg<q>,\
>  		     neon_logic<q>, neon_to_gp<q>, f_mcr,\
>  		     mov_reg, neon_move<q>")]
>  )
>  
>  (define_insn "*aarch64_simd_mov<mode>"
>    [(set (match_operand:VQ 0 "nonimmediate_operand"
> -		"=w, m,  w, ?r, ?w, ?r, w")
> +		"=w, Ump,  m,  w, ?r, ?w, ?r, w")
>  	(match_operand:VQ 1 "general_operand"
> -		"m,  w,  w,  w,  r,  r, Dn"))]
> +		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
>    "TARGET_SIMD
>     && (register_operand (operands[0], <MODE>mode)
> -       || register_operand (operands[1], <MODE>mode))"
> +       || aarch64_simd_reg_or_zero (operands[1], <MODE>mode))"
>  {
>    switch (which_alternative)
>      {
>      case 0:
> -	return "ldr\\t%q0, %1";
> +	return "ldr\t%q0, %1";
>      case 1:
> -	return "str\\t%q1, %0";
> +	return "stp\txzr, xzr, %0";
>      case 2:
> -	return "mov\t%0.<Vbtype>, %1.<Vbtype>";
> +	return "str\t%q1, %0";
>      case 3:
> +	return "mov\t%0.<Vbtype>, %1.<Vbtype>";
>      case 4:
>      case 5:
> -	return "#";
>      case 6:
> +	return "#";
> +    case 7:
>  	return aarch64_output_simd_mov_immediate (operands[1], <MODE>mode, 128);
>      default:
>  	gcc_unreachable ();
>      }
>  }
>    [(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
> -                     neon_logic<q>, multiple, multiple, multiple,\
> -                     neon_move<q>")
> -   (set_attr "length" "4,4,4,8,8,8,4")]
> +		     neon_stp, neon_logic<q>, multiple, multiple,\
> +		     multiple, neon_move<q>")
> +   (set_attr "length" "4,4,4,4,8,8,8,4")]
>  )
>  
>  ;; When storing lane zero we can use the normal STR and its more permissive
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c b/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..07198de109432b530745cc540790303ae0245efb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O1" } */
> +
> +#include <arm_neon.h>
> +
> +void
> +f (uint32x4_t *p)
> +{
> +  uint32x4_t x = { 0, 0, 0, 0};
> +  p[1] = x;
> +
> +  /* { dg-final { scan-assembler "stp\txzr, xzr," } } */
> +}
> +
> +void
> +g (float32x2_t *p)
> +{
> +  float32x2_t x = {0.0, 0.0};
> +  p[0] = x;
> +
> +  /* { dg-final { scan-assembler "str\txzr, " } } */
> +}
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [AArch64, PATCH] Improve Neon store of zero
  2017-08-16 16:01   ` Jackson Woodruff
  2017-08-17 13:56     ` Richard Earnshaw (lists)
@ 2017-08-23 14:46     ` Richard Sandiford
  1 sibling, 0 replies; 10+ messages in thread
From: Richard Sandiford @ 2017-08-23 14:46 UTC (permalink / raw)
  To: Jackson Woodruff; +Cc: Richard Earnshaw (lists), gcc-patches, James Greenhalgh

Jackson Woodruff <jackson.woodruff@foss.arm.com> writes:
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 74de9b8c89dd5e4e3d87504594c969de0e0128ce..ce1b981fc005edf48a401a456def2a37cf9d9022 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -23,7 +23,10 @@
>  	(match_operand:VALL_F16 1 "general_operand" ""))]
>    "TARGET_SIMD"
>    "
> -    if (GET_CODE (operands[0]) == MEM)
> +    if (GET_CODE (operands[0]) == MEM
> +	&& !(aarch64_simd_imm_zero (operands[1], <MODE>mode)
> +	     && aarch64_legitimate_address_p (<MODE>mode, operands[0],
> +					      PARALLEL, 1)))
>        operands[1] = force_reg (<MODE>mode, operands[1]);

Sorry for only noticing now, but the call to aarch64_legitimate_address_p
is asking whether the MEM itself is a legitimate LDP/STP address.  Also,
it might be better to pass false for strict_p, since this can be called
before RA.  So maybe:

    if (GET_CODE (operands[0]) == MEM
	&& !(aarch64_simd_imm_zero (operands[1], <MODE>mode)
	     && aarch64_mem_pair_operand (operands[0], <MODE>mode)))

?

Thanks,
Richard

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [AArch64, PATCH] Improve Neon store of zero
  2017-09-13 16:35     ` Jackson Woodruff
@ 2017-09-13 16:51       ` James Greenhalgh
  0 siblings, 0 replies; 10+ messages in thread
From: James Greenhalgh @ 2017-09-13 16:51 UTC (permalink / raw)
  To: Jackson Woodruff
  Cc: Wilco Dijkstra, GCC Patches, richard.sandiford, nd, Richard Earnshaw

On Wed, Sep 13, 2017 at 05:34:56PM +0100, Jackson Woodruff wrote:
> Hi,
> 
> I have addressed the issues you raised below.
> 
> Is the amended patch OK for trunk?

Yes, thanks.

Committed as revision 252387.

Cheers,
James

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [AArch64, PATCH] Improve Neon store of zero
  2017-09-12 16:28   ` James Greenhalgh
@ 2017-09-13 16:35     ` Jackson Woodruff
  2017-09-13 16:51       ` James Greenhalgh
  0 siblings, 1 reply; 10+ messages in thread
From: Jackson Woodruff @ 2017-09-13 16:35 UTC (permalink / raw)
  To: James Greenhalgh
  Cc: Wilco Dijkstra, GCC Patches, richard.sandiford, nd, Richard Earnshaw

[-- Attachment #1: Type: text/plain, Size: 3495 bytes --]

Hi,

I have addressed the issues you raised below.

Is the amended patch OK for trunk?

Thanks,

Jackson.

On 09/12/2017 05:28 PM, James Greenhalgh wrote:
> On Wed, Sep 06, 2017 at 10:02:52AM +0100, Jackson Woodruff wrote:
>> Hi all,
>>
>> I've attached a new patch that addresses some of the issues raised with
>> my original patch.
>>
>> On 08/23/2017 03:35 PM, Wilco Dijkstra wrote:
>>> Richard Sandiford wrote:
>>>>
>>>> Sorry for only noticing now, but the call to aarch64_legitimate_address_p
>>>> is asking whether the MEM itself is a legitimate LDP/STP address.  Also,
>>>> it might be better to pass false for strict_p, since this can be called
>>>> before RA.  So maybe:
>>>>
>>>>      if (GET_CODE (operands[0]) == MEM
>>>> 	&& !(aarch64_simd_imm_zero (operands[1], <MODE>mode)
>>>> 	     && aarch64_mem_pair_operand (operands[0], <MODE>mode)))
>>
>> There were also some issues with the choice of mode for the call to
>> aarch64_mem_pair_operand.
>>
>> For a 128-bit wide mode, we want to check `aarch64_mem_pair_operand
>> (operands[0], DImode)` since that's what the stp will be.
>>
>> For a 64-bit wide mode, we don't need to do that check because a normal
>> `str` can be issued.
>>
>> I've updated the condition as such.
>>
>>>
>>> Is there any reason for doing this check at all (or at least this early during
>>> expand)?
>>
>> Not doing this check means that the zero is forced into a register, so
>> we then carry around a bit more RTL and rely on combine to merge things.
>>
>>>
>>> There is a similar issue with this part:
>>>
>>>    (define_insn "*aarch64_simd_mov<mode>"
>>>      [(set (match_operand:VQ 0 "nonimmediate_operand"
>>> -		"=w, m,  w, ?r, ?w, ?r, w")
>>> +		"=w, Ump,  m,  w, ?r, ?w, ?r, w")
>>>
>>> The Ump causes the instruction to always split off the address offset. Ump
>>> cannot be used in patterns that are generated before register allocation as it
>>> also calls aarch64_legitimate_address_p with strict_p set to true.
>>
>> I've changed the constraint to a new constraint 'Umq', that acts the
>> same as Ump, but calls aarch64_legitimate_address_p with strict_p set to
>> false and uses DImode for the mode to pass.
> 
> This looks mostly OK to me, but this conditional:
> 
>> +  if (GET_CODE (operands[0]) == MEM
>> +      && !(aarch64_simd_imm_zero (operands[1], <MODE>mode)
>> +	   && ((GET_MODE_SIZE (<MODE>mode) == 16
>> +		&& aarch64_mem_pair_operand (operands[0], DImode))
>> +	       || GET_MODE_SIZE (<MODE>mode) == 8)))
> 
> Has grown a bit too big in such a general pattern to live without a comment
> explaining what is going on.
> 
>> +(define_memory_constraint "Umq"
>> +  "@internal
>> +   A memory address which uses a base register with an offset small enough for
>> +   a load/store pair operation in DI mode."
>> +   (and (match_code "mem")
>> +	(match_test "aarch64_legitimate_address_p (DImode, XEXP (op, 0),
>> +						   PARALLEL, 0)")))
> 
> And here you want 'false' rather than '0'.
> 
> I'll happily merge the patch with those changes, please send an update.
> 
> Thanks,
> James
> 
> 
>>
>> ChangeLog:
>>
>> gcc/
>>
>> 2017-08-29  Jackson Woodruff  <jackson.woodruff@arm.com>
>>
>> 	* config/aarch64/constraints.md (Umq): New constraint.
>> 	* config/aarch64/aarch64-simd.md (*aarch64_simd_mov<mode>):
>> 	Change to use Umq.
>> 	(mov<mode>): Update condition.
>>
>> gcc/testsuite
>>
>> 2017-08-29  Jackson Woodruff  <jackson.woodruff@arm.com>
>>
>> 	* gcc.target/aarch64/simd/vect_str_zero.c:
>> 	Update testcase.
> 

[-- Attachment #2: patchfile --]
[-- Type: text/x-patch, Size: 2974 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index f3e084f8778d70c82823b92fa80ff96021ad26db..c20e513f59a35f3410eae3eb0fdc2fc86352a9fc 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -23,10 +23,17 @@
 	(match_operand:VALL_F16 1 "general_operand" ""))]
   "TARGET_SIMD"
   "
-    if (GET_CODE (operands[0]) == MEM
-	&& !(aarch64_simd_imm_zero (operands[1], <MODE>mode)
-	     && aarch64_legitimate_address_p (<MODE>mode, operands[0],
-					      PARALLEL, 1)))
+  /* Force the operand into a register if it is not an
+     immediate whose use can be replaced with xzr.
+     If the mode is 16 bytes wide, then we will be doing
+     a stp in DI mode, so we check the validity of that.
+     If the mode is 8 bytes wide, then we will be doing a
+     normal str, so the check need not apply.  */
+  if (GET_CODE (operands[0]) == MEM
+      && !(aarch64_simd_imm_zero (operands[1], <MODE>mode)
+	   && ((GET_MODE_SIZE (<MODE>mode) == 16
+		&& aarch64_mem_pair_operand (operands[0], DImode))
+	       || GET_MODE_SIZE (<MODE>mode) == 8)))
       operands[1] = force_reg (<MODE>mode, operands[1]);
   "
 )
@@ -126,7 +133,7 @@
 
 (define_insn "*aarch64_simd_mov<mode>"
   [(set (match_operand:VQ 0 "nonimmediate_operand"
-		"=w, Ump,  m,  w, ?r, ?w, ?r, w")
+		"=w, Umq,  m,  w, ?r, ?w, ?r, w")
 	(match_operand:VQ 1 "general_operand"
 		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 9ce3d4efaf31a301dfb7c1772a6b685fb2cbd2ee..3649fb48a33454c208a6b81e051fdd316c495710 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -156,6 +156,14 @@
  (and (match_code "mem")
       (match_test "REG_P (XEXP (op, 0))")))
 
+(define_memory_constraint "Umq"
+  "@internal
+   A memory address which uses a base register with an offset small enough for
+   a load/store pair operation in DI mode."
+   (and (match_code "mem")
+	(match_test "aarch64_legitimate_address_p (DImode, XEXP (op, 0),
+						   PARALLEL, false)")))
+
 (define_memory_constraint "Ump"
   "@internal
   A memory address suitable for a load/store pair operation."
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vect_str_zero.c b/gcc/testsuite/gcc.target/aarch64/simd/vect_str_zero.c
index 07198de109432b530745cc540790303ae0245efb..00cbf20a0b293e71ed713f0c08d89d8a525fa785 100644
--- a/gcc/testsuite/gcc.target/aarch64/simd/vect_str_zero.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vect_str_zero.c
@@ -7,7 +7,7 @@ void
 f (uint32x4_t *p)
 {
   uint32x4_t x = { 0, 0, 0, 0};
-  p[1] = x;
+  p[4] = x;
 
   /* { dg-final { scan-assembler "stp\txzr, xzr," } } */
 }
@@ -16,7 +16,9 @@ void
 g (float32x2_t *p)
 {
   float32x2_t x = {0.0, 0.0};
-  p[0] = x;
+  p[400] = x;
 
   /* { dg-final { scan-assembler "str\txzr, " } } */
 }
+
+/* { dg-final { scan-assembler-not "add\tx\[0-9\]\+, x0, \[0-9\]+" } } */

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [AArch64, PATCH] Improve Neon store of zero
  2017-09-06  9:03 ` Jackson Woodruff
@ 2017-09-12 16:28   ` James Greenhalgh
  2017-09-13 16:35     ` Jackson Woodruff
  0 siblings, 1 reply; 10+ messages in thread
From: James Greenhalgh @ 2017-09-12 16:28 UTC (permalink / raw)
  To: Jackson Woodruff
  Cc: Wilco Dijkstra, GCC Patches, richard.sandiford, nd, Richard Earnshaw

On Wed, Sep 06, 2017 at 10:02:52AM +0100, Jackson Woodruff wrote:
> Hi all,
> 
> I've attached a new patch that addresses some of the issues raised with 
> my original patch.
> 
> On 08/23/2017 03:35 PM, Wilco Dijkstra wrote:
> > Richard Sandiford wrote:
> >>
> >> Sorry for only noticing now, but the call to aarch64_legitimate_address_p
> >> is asking whether the MEM itself is a legitimate LDP/STP address.  Also,
> >> it might be better to pass false for strict_p, since this can be called
> >> before RA.  So maybe:
> >>
> >>     if (GET_CODE (operands[0]) == MEM
> >> 	&& !(aarch64_simd_imm_zero (operands[1], <MODE>mode)
> >> 	     && aarch64_mem_pair_operand (operands[0], <MODE>mode)))
> 
> There were also some issues with the choice of mode for the call to 
> aarch64_mem_pair_operand.
> 
> For a 128-bit wide mode, we want to check `aarch64_mem_pair_operand 
> (operands[0], DImode)` since that's what the stp will be.
> 
> For a 64-bit wide mode, we don't need to do that check because a normal
> `str` can be issued.
> 
> I've updated the condition as such.
> 
> > 
> > Is there any reason for doing this check at all (or at least this early during
> > expand)?
> 
> Not doing this check means that the zero is forced into a register, so 
> we then carry around a bit more RTL and rely on combine to merge things.
> 
> > 
> > There is a similar issue with this part:
> > 
> >   (define_insn "*aarch64_simd_mov<mode>"
> >     [(set (match_operand:VQ 0 "nonimmediate_operand"
> > -		"=w, m,  w, ?r, ?w, ?r, w")
> > +		"=w, Ump,  m,  w, ?r, ?w, ?r, w")
> > 
> > The Ump causes the instruction to always split off the address offset. Ump
> > cannot be used in patterns that are generated before register allocation as it
> > also calls aarch64_legitimate_address_p with strict_p set to true.
> 
> I've changed the constraint to a new constraint 'Umq', that acts the 
> same as Ump, but calls aarch64_legitimate_address_p with strict_p set to 
> false and uses DImode for the mode to pass.

This looks mostly OK to me, but this conditional:

> +  if (GET_CODE (operands[0]) == MEM
> +      && !(aarch64_simd_imm_zero (operands[1], <MODE>mode)
> +	   && ((GET_MODE_SIZE (<MODE>mode) == 16
> +		&& aarch64_mem_pair_operand (operands[0], DImode))
> +	       || GET_MODE_SIZE (<MODE>mode) == 8)))

Has grown a bit too big in such a general pattern to live without a comment
explaining what is going on.

> +(define_memory_constraint "Umq"
> +  "@internal
> +   A memory address which uses a base register with an offset small enough for
> +   a load/store pair operation in DI mode."
> +   (and (match_code "mem")
> +	(match_test "aarch64_legitimate_address_p (DImode, XEXP (op, 0),
> +						   PARALLEL, 0)")))

And here you want 'false' rather than '0'.

I'll happily merge the patch with those changes, please send an update.

Thanks,
James


> 
> ChangeLog:
> 
> gcc/
> 
> 2017-08-29  Jackson Woodruff  <jackson.woodruff@arm.com>
> 
> 	* config/aarch64/constraints.md (Umq): New constraint.
> 	* config/aarch64/aarch64-simd.md (*aarch64_simd_mov<mode>):
> 	Change to use Umq.
> 	(mov<mode>): Update condition.
> 
> gcc/testsuite
> 
> 2017-08-29  Jackson Woodruff  <jackson.woodruff@arm.com>
> 
> 	* gcc.target/aarch64/simd/vect_str_zero.c:
> 	Update testcase.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [AArch64, PATCH] Improve Neon store of zero
  2017-08-23 15:06 Wilco Dijkstra
@ 2017-09-06  9:03 ` Jackson Woodruff
  2017-09-12 16:28   ` James Greenhalgh
  0 siblings, 1 reply; 10+ messages in thread
From: Jackson Woodruff @ 2017-09-06  9:03 UTC (permalink / raw)
  To: Wilco Dijkstra, GCC Patches, richard.sandiford
  Cc: nd, James Greenhalgh, Richard Earnshaw

[-- Attachment #1: Type: text/plain, Size: 2257 bytes --]

Hi all,

I've attached a new patch that addresses some of the issues raised with 
my original patch.

On 08/23/2017 03:35 PM, Wilco Dijkstra wrote:
> Richard Sandiford wrote:
>>
>> Sorry for only noticing now, but the call to aarch64_legitimate_address_p
>> is asking whether the MEM itself is a legitimate LDP/STP address.  Also,
>> it might be better to pass false for strict_p, since this can be called
>> before RA.  So maybe:
>>
>>     if (GET_CODE (operands[0]) == MEM
>> 	&& !(aarch64_simd_imm_zero (operands[1], <MODE>mode)
>> 	     && aarch64_mem_pair_operand (operands[0], <MODE>mode)))

There were also some issues with the choice of mode for the call to 
aarch64_mem_pair_operand.

For a 128-bit wide mode, we want to check `aarch64_mem_pair_operand 
(operands[0], DImode)` since that's what the stp will be.

For a 64-bit wide mode, we don't need to do that check because a normal
`str` can be issued.

I've updated the condition as such.

> 
> Is there any reason for doing this check at all (or at least this early during
> expand)?

Not doing this check means that the zero is forced into a register, so 
we then carry around a bit more RTL and rely on combine to merge things.

> 
> There is a similar issue with this part:
> 
>   (define_insn "*aarch64_simd_mov<mode>"
>     [(set (match_operand:VQ 0 "nonimmediate_operand"
> -		"=w, m,  w, ?r, ?w, ?r, w")
> +		"=w, Ump,  m,  w, ?r, ?w, ?r, w")
> 
> The Ump causes the instruction to always split off the address offset. Ump
> cannot be used in patterns that are generated before register allocation as it
> also calls aarch64_legitimate_address_p with strict_p set to true.

I've changed the constraint to a new constraint 'Umq', that acts the 
same as Ump, but calls aarch64_legitimate_address_p with strict_p set to 
false and uses DImode for the mode to pass.


OK for trunk?

Jackson

> 
> Wilco
> 

ChangeLog:

gcc/

2017-08-29  Jackson Woodruff  <jackson.woodruff@arm.com>

	* config/aarch64/constraints.md (Umq): New constraint.
	* config/aarch64/aarch64-simd.md (*aarch64_simd_mov<mode>):
	Change to use Umq.
	(mov<mode>): Update condition.

gcc/testsuite

2017-08-29  Jackson Woodruff  <jackson.woodruff@arm.com>

	* gcc.target/aarch64/simd/vect_str_zero.c:
	Update testcase.

[-- Attachment #2: patchfile --]
[-- Type: text/plain, Size: 2636 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index f3e084f8778d70c82823b92fa80ff96021ad26db..a044a1306a897b169ff3bfa06532c692aaf023c8 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -23,10 +23,11 @@
 	(match_operand:VALL_F16 1 "general_operand" ""))]
   "TARGET_SIMD"
   "
-    if (GET_CODE (operands[0]) == MEM
-	&& !(aarch64_simd_imm_zero (operands[1], <MODE>mode)
-	     && aarch64_legitimate_address_p (<MODE>mode, operands[0],
-					      PARALLEL, 1)))
+  if (GET_CODE (operands[0]) == MEM
+      && !(aarch64_simd_imm_zero (operands[1], <MODE>mode)
+	   && ((GET_MODE_SIZE (<MODE>mode) == 16
+		&& aarch64_mem_pair_operand (operands[0], DImode))
+	       || GET_MODE_SIZE (<MODE>mode) == 8)))
       operands[1] = force_reg (<MODE>mode, operands[1]);
   "
 )
@@ -126,7 +127,7 @@
 
 (define_insn "*aarch64_simd_mov<mode>"
   [(set (match_operand:VQ 0 "nonimmediate_operand"
-		"=w, Ump,  m,  w, ?r, ?w, ?r, w")
+		"=w, Umq,  m,  w, ?r, ?w, ?r, w")
 	(match_operand:VQ 1 "general_operand"
 		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 9ce3d4efaf31a301dfb7c1772a6b685fb2cbd2ee..4b926bf80558532e87a1dc4cacc85ff008dd80aa 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -156,6 +156,14 @@
  (and (match_code "mem")
       (match_test "REG_P (XEXP (op, 0))")))
 
+(define_memory_constraint "Umq"
+  "@internal
+   A memory address which uses a base register with an offset small enough for
+   a load/store pair operation in DI mode."
+   (and (match_code "mem")
+	(match_test "aarch64_legitimate_address_p (DImode, XEXP (op, 0),
+						   PARALLEL, 0)")))
+
 (define_memory_constraint "Ump"
   "@internal
   A memory address suitable for a load/store pair operation."
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vect_str_zero.c b/gcc/testsuite/gcc.target/aarch64/simd/vect_str_zero.c
index 07198de109432b530745cc540790303ae0245efb..00cbf20a0b293e71ed713f0c08d89d8a525fa785 100644
--- a/gcc/testsuite/gcc.target/aarch64/simd/vect_str_zero.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vect_str_zero.c
@@ -7,7 +7,7 @@ void
 f (uint32x4_t *p)
 {
   uint32x4_t x = { 0, 0, 0, 0};
-  p[1] = x;
+  p[4] = x;
 
   /* { dg-final { scan-assembler "stp\txzr, xzr," } } */
 }
@@ -16,7 +16,9 @@ void
 g (float32x2_t *p)
 {
   float32x2_t x = {0.0, 0.0};
-  p[0] = x;
+  p[400] = x;
 
   /* { dg-final { scan-assembler "str\txzr, " } } */
 }
+
+/* { dg-final { scan-assembler-not "add\tx\[0-9\]\+, x0, \[0-9\]+" } } */

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [AArch64, PATCH] Improve Neon store of zero
@ 2017-08-23 15:06 Wilco Dijkstra
  2017-09-06  9:03 ` Jackson Woodruff
  0 siblings, 1 reply; 10+ messages in thread
From: Wilco Dijkstra @ 2017-08-23 15:06 UTC (permalink / raw)
  To: GCC Patches, richard.sandiford, Jackson Woodruff
  Cc: nd, James Greenhalgh, Richard Earnshaw

Richard Sandiford wrote:
>
> Sorry for only noticing now, but the call to aarch64_legitimate_address_p
> is asking whether the MEM itself is a legitimate LDP/STP address.  Also,
> it might be better to pass false for strict_p, since this can be called
> before RA.  So maybe:
>
>    if (GET_CODE (operands[0]) == MEM
>	&& !(aarch64_simd_imm_zero (operands[1], <MODE>mode)
>	     && aarch64_mem_pair_operand (operands[0], <MODE>mode)))

Is there any reason for doing this check at all (or at least this early during
expand)?

There is a similar issue with this part:

 (define_insn "*aarch64_simd_mov<mode>"
   [(set (match_operand:VQ 0 "nonimmediate_operand"
-		"=w, m,  w, ?r, ?w, ?r, w")
+		"=w, Ump,  m,  w, ?r, ?w, ?r, w")

The Ump causes the instruction to always split off the address offset. Ump
cannot be used in patterns that are generated before register allocation as it
also calls aarch64_legitimate_address_p with strict_p set to true.

Wilco

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2017-09-13 16:51 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-08-10 13:38 [AArch64, PATCH] Improve Neon store of zero Jackson Woodruff
2017-08-11 15:16 ` Richard Earnshaw (lists)
2017-08-16 16:01   ` Jackson Woodruff
2017-08-17 13:56     ` Richard Earnshaw (lists)
2017-08-23 14:46     ` Richard Sandiford
2017-08-23 15:06 Wilco Dijkstra
2017-09-06  9:03 ` Jackson Woodruff
2017-09-12 16:28   ` James Greenhalgh
2017-09-13 16:35     ` Jackson Woodruff
2017-09-13 16:51       ` James Greenhalgh

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).