public inbox for gcc-patches@gcc.gnu.org
* [PATCH V2] aarch64: Use Q-reg loads/stores in movmem expansion
@ 2020-07-28 13:28 Sudakshina Das
  2020-07-31 15:13 ` Richard Sandiford
  2020-08-05 12:32 ` Andreas Schwab
  0 siblings, 2 replies; 6+ messages in thread
From: Sudakshina Das @ 2020-07-28 13:28 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 6774 bytes --]

Hi

This is my attempt at reviving the old patch https://gcc.gnu.org/pipermail/gcc-patches/2019-January/514632.html

I have followed up on Kyrill's comment upstream in the link above and am using the recommended option iii that he mentioned.
"1) Adjust the copy_limit to 256 bits after checking AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS in the tuning.
 2) Adjust aarch64_copy_one_block_and_progress_pointers to handle 256-bit moves by option iii:
   iii) Emit explicit V4SI (or any other 128-bit vector mode) pairs ldp/stps. This wouldn't need any adjustments to
        MD patterns, but would make aarch64_copy_one_block_and_progress_pointers more complex as it would now have
        two paths, where one handles two adjacent memory addresses in one call."

With this patch the following test

#define N 8
extern int src[N], dst[N];

void
foo (void)
{
  __builtin_memcpy (dst, src, N * sizeof (int));
}

which was originally giving
foo:
        adrp    x1, src
        add     x1, x1, :lo12:src
        ldp     x4, x5, [x1]
        adrp    x0, dst
        add     x0, x0, :lo12:dst
        ldp     x2, x3, [x1, 16]
        stp     x4, x5, [x0]
        stp     x2, x3, [x0, 16]
        ret


changes to the following
foo:
        adrp    x1, src
        add     x1, x1, :lo12:src
        adrp    x0, dst
        add     x0, x0, :lo12:dst
        ldp     q1, q0, [x1]
        stp     q1, q0, [x0]
        ret
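
For reference, the assembly above comes from a plain -O2 compile of the test; something
like the following should reproduce it (the cross-compiler name here is only illustrative):

        aarch64-none-linux-gnu-gcc -O2 -S foo.c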

This gives about a 1.3% improvement on 523.xalancbmk_r in SPEC2017 and an overall code size reduction on most
SPEC2017 Int benchmarks on Neoverse N1, due to greater use of LDP/STP Q register pairs.

Bootstrapped and regression tested on aarch64-none-linux-gnu.

Is this ok for trunk?

Thanks
Sudi

gcc/ChangeLog:

2020-07-23  Sudakshina Das  <sudi.das@arm.com>
	    Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

	* config/aarch64/aarch64.c (aarch64_gen_store_pair): Add case
	for E_V4SImode.
	(aarch64_gen_load_pair): Likewise.
	(aarch64_copy_one_block_and_progress_pointers): Handle 256-bit copy.
	(aarch64_expand_cpymem): Expand copy_limit to 256 bits where
	appropriate.

gcc/testsuite/ChangeLog:

2020-07-23  Sudakshina Das  <sudi.das@arm.com>
	    Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

	* gcc.target/aarch64/cpymem-q-reg_1.c: New test.
	* gcc.target/aarch64/large_struct_copy_2.c: Update for ldp q regs.

****************************** Attachment inlined  **********************************

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 3fe1feaa80ccb0a287ee1c7ea1056e8f0a830532..a38ff39c4d5d53f056bbba3114ebaf8f0414c037 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -6920,6 +6920,9 @@ aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
     case E_TFmode:
       return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
 
+    case E_V4SImode:
+      return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
+
     default:
       gcc_unreachable ();
     }
@@ -6943,6 +6946,9 @@ aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
     case E_TFmode:
       return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
 
+    case E_V4SImode:
+      return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2);
+
     default:
       gcc_unreachable ();
     }
@@ -21097,6 +21103,27 @@ static void
 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
 					      machine_mode mode)
 {
+  /* Handle 256-bit memcpy separately.  We do this by making 2 adjacent memory
+     address copies using V4SImode so that we can use Q registers.  */
+  if (known_eq (GET_MODE_BITSIZE (mode), 256))
+    {
+      mode = V4SImode;
+      rtx reg1 = gen_reg_rtx (mode);
+      rtx reg2 = gen_reg_rtx (mode);
+      /* "Cast" the pointers to the correct mode.  */
+      *src = adjust_address (*src, mode, 0);
+      *dst = adjust_address (*dst, mode, 0);
+      /* Emit the memcpy.  */
+      emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
+					aarch64_progress_pointer (*src)));
+      emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
+					 aarch64_progress_pointer (*dst), reg2));
+      /* Move the pointers forward.  */
+      *src = aarch64_move_pointer (*src, 32);
+      *dst = aarch64_move_pointer (*dst, 32);
+      return;
+    }
+
   rtx reg = gen_reg_rtx (mode);
 
   /* "Cast" the pointers to the correct mode.  */
@@ -21150,9 +21177,12 @@ aarch64_expand_cpymem (rtx *operands)
   /* Convert n to bits to make the rest of the code simpler.  */
   n = n * BITS_PER_UNIT;
 
-  /* Maximum amount to copy in one go.  The AArch64 back-end has integer modes
-     larger than TImode, but we should not use them for loads/stores here.  */
-  const int copy_limit = GET_MODE_BITSIZE (TImode);
+  /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
+     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter and TARGET_SIMD.  */
+  const int copy_limit = ((aarch64_tune_params.extra_tuning_flags
+			   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
+			  || !TARGET_SIMD)
+			 ? GET_MODE_BITSIZE (TImode) :  256;
 
   while (n > 0)
     {
diff --git a/gcc/testsuite/gcc.target/aarch64/cpymem-q-reg_1.c b/gcc/testsuite/gcc.target/aarch64/cpymem-q-reg_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..df5f67e425bd6a52362bc912afbcd1ca3725d449
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/cpymem-q-reg_1.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define N 8
+extern int src[N], dst[N];
+
+void
+foo (void)
+{
+  __builtin_memcpy (dst, src, N * sizeof (int));
+}
+
+/* { dg-final { scan-assembler {ldp\tq[0-9]*} } } */
+/* { dg-final { scan-assembler-not {ldp\tx[0-9]*} } } */
+/* { dg-final { scan-assembler {stp\tq[0-9]*} } } */
+/* { dg-final { scan-assembler-not {stp\tx[0-9]*} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/large_struct_copy_2.c b/gcc/testsuite/gcc.target/aarch64/large_struct_copy_2.c
index 565434244e8296749a99bebc4e095065945d825e..8ee0a9fae92f805573b763ec679f2a8045afb479 100644
--- a/gcc/testsuite/gcc.target/aarch64/large_struct_copy_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/large_struct_copy_2.c
@@ -16,11 +16,10 @@ struct S2 {
   struct S0 f3;
 };
 
-void fn1 () {
-  struct S2 b = {0, 1, 7, 4073709551611, 4, 8, 7};
+void fn1 (struct S2 b) {
   a = b.f3;
 }
 
-/* { dg-final { scan-assembler-times {ldp\s+x[0-9]+} 2 } } */
-/* { dg-final { scan-assembler-times {stp\s+x[0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {ldp\s+q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {stp\s+q[0-9]+} 1 } } */
 /* { dg-final { scan-assembler-not {ld[1-3]} } } */


* Re: [PATCH V2] aarch64: Use Q-reg loads/stores in movmem expansion
  2020-07-28 13:28 [PATCH V2] aarch64: Use Q-reg loads/stores in movmem expansion Sudakshina Das
@ 2020-07-31 15:13 ` Richard Sandiford
  2020-08-04 11:13   ` Sudakshina Das
  2020-08-05 12:32 ` Andreas Schwab
  1 sibling, 1 reply; 6+ messages in thread
From: Richard Sandiford @ 2020-07-31 15:13 UTC (permalink / raw)
  To: Sudakshina Das; +Cc: gcc-patches, Kyrylo Tkachov

Sudakshina Das <Sudi.Das@arm.com> writes:
> Hi
>
> This is my attempt at reviving the old patch https://gcc.gnu.org/pipermail/gcc-patches/2019-January/514632.html
>
> I have followed on Kyrill's comment upstream on the link above and I am using the recommended option iii that he mentioned.
> "1) Adjust the copy_limit to 256 bits after checking AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS in the tuning.
>  2) Adjust aarch64_copy_one_block_and_progress_pointers to handle 256-bit moves. by iii:
>    iii) Emit explicit V4SI (or any other 128-bit vector mode) pairs ldp/stps. This wouldn't need any adjustments to
>         MD patterns, but would make aarch64_copy_one_block_and_progress_pointers more complex as it would now have
>         two paths, where one handles two adjacent memory addresses in one calls."
>
> With this patch the following test
>
> #define N 8
> extern int src[N], dst[N];
>
> void
> foo (void)
> {
>   __builtin_memcpy (dst, src, N * sizeof (int));
> }
>
> which was originally giving
> foo:
>         adrp    x1, src
>         add     x1, x1, :lo12:src
>         ldp     x4, x5, [x1]
>         adrp    x0, dst
>         add     x0, x0, :lo12:dst
>         ldp     x2, x3, [x1, 16]
>         stp     x4, x5, [x0]
>         stp     x2, x3, [x0, 16]
>         ret
>
>
> changes to the following
> foo:
>         adrp    x1, src
>         add     x1, x1, :lo12:src
>         adrp    x0, dst
>         add     x0, x0, :lo12:dst
>         ldp     q1, q0, [x1]
>         stp     q1, q0, [x0]
>         ret
>
> This gives about 1.3% improvement on 523.xalancbmk_r in SPEC2017 and an overall code size reduction on most
> SPEC2017 Int benchmarks on Neoverse N1 due to more LDP/STP Q pair registers.

Sorry for the slow review.  LGTM with a very minor nit (sorry)…

> @@ -21150,9 +21177,12 @@ aarch64_expand_cpymem (rtx *operands)
>    /* Convert n to bits to make the rest of the code simpler.  */
>    n = n * BITS_PER_UNIT;
>  
> -  /* Maximum amount to copy in one go.  The AArch64 back-end has integer modes
> -     larger than TImode, but we should not use them for loads/stores here.  */
> -  const int copy_limit = GET_MODE_BITSIZE (TImode);
> +  /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
> +     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter and TARGET_SIMD.  */
> +  const int copy_limit = ((aarch64_tune_params.extra_tuning_flags
> +			   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
> +			  || !TARGET_SIMD)
> +			 ? GET_MODE_BITSIZE (TImode) :  256;

Should only be one space before “256”.

I guess at some point we should consider handling fixed-length SVE too,
but that's only worth it for -msve-vector-bits=512 and higher.
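
As a rough illustration (purely a sketch, not something this patch generates): with
-msve-vector-bits=512, a 64-byte block copy could in principle become a single
Z-register load/store, along the lines of:

        ldr     z0, [x1]
        str     z0, [x0]
        ret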

Thanks,
Richard


* RE: [PATCH V2] aarch64: Use Q-reg loads/stores in movmem expansion
  2020-07-31 15:13 ` Richard Sandiford
@ 2020-08-04 11:13   ` Sudakshina Das
  0 siblings, 0 replies; 6+ messages in thread
From: Sudakshina Das @ 2020-08-04 11:13 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: gcc-patches, Kyrylo Tkachov

Hi Richard

> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: 31 July 2020 16:14
> To: Sudakshina Das <Sudi.Das@arm.com>
> Cc: gcc-patches@gcc.gnu.org; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: Re: [PATCH V2] aarch64: Use Q-reg loads/stores in movmem
> expansion
> 
> Sudakshina Das <Sudi.Das@arm.com> writes:
> > Hi
> >
> > This is my attempt at reviving the old patch
> > https://gcc.gnu.org/pipermail/gcc-patches/2019-January/514632.html
> >
> > I have followed on Kyrill's comment upstream on the link above and I am
> using the recommended option iii that he mentioned.
> > "1) Adjust the copy_limit to 256 bits after checking
> AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS in the tuning.
> >  2) Adjust aarch64_copy_one_block_and_progress_pointers to handle 256-
> bit moves. by iii:
> >    iii) Emit explicit V4SI (or any other 128-bit vector mode) pairs ldp/stps. This
> wouldn't need any adjustments to
> >         MD patterns, but would make
> aarch64_copy_one_block_and_progress_pointers more complex as it would
> now have
> >         two paths, where one handles two adjacent memory addresses in one
> calls."
> >
> > With this patch the following test
> >
> > #define N 8
> > extern int src[N], dst[N];
> >
> > void
> > foo (void)
> > {
> >   __builtin_memcpy (dst, src, N * sizeof (int)); }
> >
> > which was originally giving
> > foo:
> >         adrp    x1, src
> >         add     x1, x1, :lo12:src
> >         ldp     x4, x5, [x1]
> >         adrp    x0, dst
> >         add     x0, x0, :lo12:dst
> >         ldp     x2, x3, [x1, 16]
> >         stp     x4, x5, [x0]
> >         stp     x2, x3, [x0, 16]
> >         ret
> >
> >
> > changes to the following
> > foo:
> >         adrp    x1, src
> >         add     x1, x1, :lo12:src
> >         adrp    x0, dst
> >         add     x0, x0, :lo12:dst
> >         ldp     q1, q0, [x1]
> >         stp     q1, q0, [x0]
> >         ret
> >
> > This gives about 1.3% improvement on 523.xalancbmk_r in SPEC2017 and
> > an overall code size reduction on most
> > SPEC2017 Int benchmarks on Neoverse N1 due to more LDP/STP Q pair
> registers.
> 
> Sorry for the slow review.  LGTM with a very minor nit (sorry)…

Thanks. Committed with the change.
> 
> > @@ -21150,9 +21177,12 @@ aarch64_expand_cpymem (rtx *operands)
> >    /* Convert n to bits to make the rest of the code simpler.  */
> >    n = n * BITS_PER_UNIT;
> >
> > -  /* Maximum amount to copy in one go.  The AArch64 back-end has
> integer modes
> > -     larger than TImode, but we should not use them for loads/stores here.
> */
> > -  const int copy_limit = GET_MODE_BITSIZE (TImode);
> > +  /* Maximum amount to copy in one go.  We allow 256-bit chunks based
> on the
> > +     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter and
> > +TARGET_SIMD.  */
> > +  const int copy_limit = ((aarch64_tune_params.extra_tuning_flags
> > +			   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
> > +			  || !TARGET_SIMD)
> > +			 ? GET_MODE_BITSIZE (TImode) :  256;
> 
> Should only be one space before “256”.
> 
> I guess at some point we should consider handling fixed-length SVE too, but
> that's only worth it for -msve-vector-bits=512 and higher.

Yes, sure, I will add this to my backlog for future work.
> 
> Thanks,
> Richard


* Re: [PATCH V2] aarch64: Use Q-reg loads/stores in movmem expansion
  2020-07-28 13:28 [PATCH V2] aarch64: Use Q-reg loads/stores in movmem expansion Sudakshina Das
  2020-07-31 15:13 ` Richard Sandiford
@ 2020-08-05 12:32 ` Andreas Schwab
  2020-08-05 13:51   ` Richard Sandiford
  1 sibling, 1 reply; 6+ messages in thread
From: Andreas Schwab @ 2020-08-05 12:32 UTC (permalink / raw)
  To: Sudakshina Das; +Cc: gcc-patches

This breaks bootstrap.

during RTL pass: final
../../gcc/timevar.c: In member function ‘void timer::push_internal(timer::timevar_def*)’:
../../gcc/timevar.c:373:1: internal compiler error: output_operand: invalid expression as operand
  373 | }
      | ^
0xbabdff output_operand_lossage(char const*, ...)
        ../../gcc/final.c:3609
0xbac5bb output_addr_const(_IO_FILE*, rtx_def*)
        ../../gcc/final.c:4166
0xbac04f output_address(machine_mode, rtx_def*)
        ../../gcc/final.c:4067
0xbabf8b output_operand(rtx_def*, int)
        ../../gcc/final.c:4051
0xbac9ff output_asm_insn(char const*, rtx_def**)
        ../../gcc/final.c:3963
0xbb09b7 output_asm_insn(char const*, rtx_def**)
        ../../gcc/final.c:3840
0xbb09b7 final_scan_insn_1
        ../../gcc/final.c:3106
0xbb1163 final_scan_insn(rtx_insn*, _IO_FILE*, int, int, int*)
        ../../gcc/final.c:3152
0xbb127b final_1
        ../../gcc/final.c:2020
0xbb1f2b rest_of_handle_final
        ../../gcc/final.c:4658
0xbb1f2b execute
        ../../gcc/final.c:4736

Andreas.

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
"And now for something completely different."


* Re: [PATCH V2] aarch64: Use Q-reg loads/stores in movmem expansion
  2020-08-05 12:32 ` Andreas Schwab
@ 2020-08-05 13:51   ` Richard Sandiford
  2020-08-05 15:23     ` Sudakshina Das
  0 siblings, 1 reply; 6+ messages in thread
From: Richard Sandiford @ 2020-08-05 13:51 UTC (permalink / raw)
  To: Andreas Schwab; +Cc: Sudakshina Das, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 180 bytes --]

Andreas Schwab <schwab@linux-m68k.org> writes:
> This breaks bootstrap.

I've pushed the below to fix this after bootstrapping & regression
testing on aarch64-linux-gnu.

Richard


[-- Attachment #2: 0001-aarch64-Add-missing-z-prefixes-to-LDP-STP-patterns.patch --]
[-- Type: text/plain, Size: 5046 bytes --]

From 4af98a21e10547679a643eed85d51aa5d7d2510b Mon Sep 17 00:00:00 2001
From: Richard Sandiford <richard.sandiford@arm.com>
Date: Wed, 5 Aug 2020 12:56:41 +0100
Subject: [PATCH] aarch64: Add missing %z prefixes to LDP/STP patterns

For LDP/STP Q, the memory operand might not be valid for "m",
so we need to use %z<N> instead of %<N> in the asm template.
This patch does that for all Ump LDP/STP patterns, regardless
of whether it's strictly needed.

This is needed to unbreak bootstrap.

2020-08-05  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
	* config/aarch64/aarch64.md (load_pair_sw_<SX:mode><SX2:mode>)
	(load_pair_dw_<DX:mode><DX2:mode>, load_pair_dw_tftf)
	(store_pair_sw_<SX:mode><SX2:mode>)
	(store_pair_dw_<DX:mode><DX2:mode>, store_pair_dw_tftf)
	(*load_pair_extendsidi2_aarch64)
	(*load_pair_zero_extendsidi2_aarch64): Use %z for the memory operand.
	* config/aarch64/aarch64-simd.md (load_pair<DREG:mode><DREG2:mode>)
	(vec_store_pair<DREG:mode><DREG2:mode>, load_pair<VQ:mode><VQ2:mode>)
	(vec_store_pair<VQ:mode><VQ2:mode>): Likewise.
---
 gcc/config/aarch64/aarch64-simd.md |  8 ++++----
 gcc/config/aarch64/aarch64.md      | 26 +++++++++++++-------------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 11ebf5b93c4..381a702eba0 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -187,7 +187,7 @@ (define_insn "load_pair<DREG:mode><DREG2:mode>"
 		   plus_constant (Pmode,
 				  XEXP (operands[1], 0),
 				  GET_MODE_SIZE (<DREG:MODE>mode)))"
-  "ldp\\t%d0, %d2, %1"
+  "ldp\\t%d0, %d2, %z1"
   [(set_attr "type" "neon_ldp")]
 )
 
@@ -201,7 +201,7 @@ (define_insn "vec_store_pair<DREG:mode><DREG2:mode>"
 		   plus_constant (Pmode,
 				  XEXP (operands[0], 0),
 				  GET_MODE_SIZE (<DREG:MODE>mode)))"
-  "stp\\t%d1, %d3, %0"
+  "stp\\t%d1, %d3, %z0"
   [(set_attr "type" "neon_stp")]
 )
 
@@ -215,7 +215,7 @@ (define_insn "load_pair<VQ:mode><VQ2:mode>"
 		    plus_constant (Pmode,
 			       XEXP (operands[1], 0),
 			       GET_MODE_SIZE (<VQ:MODE>mode)))"
-  "ldp\\t%q0, %q2, %1"
+  "ldp\\t%q0, %q2, %z1"
   [(set_attr "type" "neon_ldp_q")]
 )
 
@@ -228,7 +228,7 @@ (define_insn "vec_store_pair<VQ:mode><VQ2:mode>"
 		plus_constant (Pmode,
 			       XEXP (operands[0], 0),
 			       GET_MODE_SIZE (<VQ:MODE>mode)))"
-  "stp\\t%q1, %q3, %0"
+  "stp\\t%q1, %q3, %z0"
   [(set_attr "type" "neon_stp_q")]
 )
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index df780b86370..25d77256b96 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1574,8 +1574,8 @@ (define_insn "load_pair_sw_<SX:mode><SX2:mode>"
 				XEXP (operands[1], 0),
 				GET_MODE_SIZE (<SX:MODE>mode)))"
   "@
-   ldp\\t%w0, %w2, %1
-   ldp\\t%s0, %s2, %1"
+   ldp\\t%w0, %w2, %z1
+   ldp\\t%s0, %s2, %z1"
   [(set_attr "type" "load_8,neon_load1_2reg")
    (set_attr "arch" "*,fp")]
 )
@@ -1591,8 +1591,8 @@ (define_insn "load_pair_dw_<DX:mode><DX2:mode>"
 				XEXP (operands[1], 0),
 				GET_MODE_SIZE (<DX:MODE>mode)))"
   "@
-   ldp\\t%x0, %x2, %1
-   ldp\\t%d0, %d2, %1"
+   ldp\\t%x0, %x2, %z1
+   ldp\\t%d0, %d2, %z1"
   [(set_attr "type" "load_16,neon_load1_2reg")
    (set_attr "arch" "*,fp")]
 )
@@ -1607,7 +1607,7 @@ (define_insn "load_pair_dw_tftf"
 		    plus_constant (Pmode,
 				   XEXP (operands[1], 0),
 				   GET_MODE_SIZE (TFmode)))"
-  "ldp\\t%q0, %q2, %1"
+  "ldp\\t%q0, %q2, %z1"
   [(set_attr "type" "neon_ldp_q")
    (set_attr "fp" "yes")]
 )
@@ -1624,8 +1624,8 @@ (define_insn "store_pair_sw_<SX:mode><SX2:mode>"
 				XEXP (operands[0], 0),
 				GET_MODE_SIZE (<SX:MODE>mode)))"
   "@
-   stp\\t%w1, %w3, %0
-   stp\\t%s1, %s3, %0"
+   stp\\t%w1, %w3, %z0
+   stp\\t%s1, %s3, %z0"
   [(set_attr "type" "store_8,neon_store1_2reg")
    (set_attr "arch" "*,fp")]
 )
@@ -1641,8 +1641,8 @@ (define_insn "store_pair_dw_<DX:mode><DX2:mode>"
 				XEXP (operands[0], 0),
 				GET_MODE_SIZE (<DX:MODE>mode)))"
   "@
-   stp\\t%x1, %x3, %0
-   stp\\t%d1, %d3, %0"
+   stp\\t%x1, %x3, %z0
+   stp\\t%d1, %d3, %z0"
   [(set_attr "type" "store_16,neon_store1_2reg")
    (set_attr "arch" "*,fp")]
 )
@@ -1657,7 +1657,7 @@ (define_insn "store_pair_dw_tftf"
 		 plus_constant (Pmode,
 				XEXP (operands[0], 0),
 				GET_MODE_SIZE (TFmode)))"
-  "stp\\t%q1, %q3, %0"
+  "stp\\t%q1, %q3, %z0"
   [(set_attr "type" "neon_stp_q")
    (set_attr "fp" "yes")]
 )
@@ -1790,7 +1790,7 @@ (define_insn "*load_pair_extendsidi2_aarch64"
 		plus_constant (Pmode,
 			       XEXP (operands[1], 0),
 			       GET_MODE_SIZE (SImode)))"
-  "ldpsw\\t%0, %2, %1"
+  "ldpsw\\t%0, %2, %z1"
   [(set_attr "type" "load_8")]
 )
 
@@ -1819,8 +1819,8 @@ (define_insn "*load_pair_zero_extendsidi2_aarch64"
 			       XEXP (operands[1], 0),
 			       GET_MODE_SIZE (SImode)))"
   "@
-   ldp\t%w0, %w2, %1
-   ldp\t%s0, %s2, %1"
+   ldp\t%w0, %w2, %z1
+   ldp\t%s0, %s2, %z1"
   [(set_attr "type" "load_8,neon_load1_2reg")
    (set_attr "arch" "*,fp")]
 )
-- 
2.17.1



* RE: [PATCH V2] aarch64: Use Q-reg loads/stores in movmem expansion
  2020-08-05 13:51   ` Richard Sandiford
@ 2020-08-05 15:23     ` Sudakshina Das
  0 siblings, 0 replies; 6+ messages in thread
From: Sudakshina Das @ 2020-08-05 15:23 UTC (permalink / raw)
  To: Richard Sandiford, Andreas Schwab; +Cc: gcc-patches

Hi Richard

Thank you for fixing this. I apologise for the trouble. I ran bootstrap only on an
earlier version of the patch when I should have run it again on the final one! ☹
I will be more careful in the future.

Thanks
Sudi


> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: 05 August 2020 14:52
> To: Andreas Schwab <schwab@linux-m68k.org>
> Cc: Sudakshina Das <Sudi.Das@arm.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH V2] aarch64: Use Q-reg loads/stores in movmem
> expansion
> 
> Andreas Schwab <schwab@linux-m68k.org> writes:
> > This breaks bootstrap.
> 
> I've pushed the below to fix this after bootstrapping & regression testing on
> aarch64-linux-gnu.
> 
> Richard



