* [PATCH V2] aarch64: Use Q-reg loads/stores in movmem expansion
@ 2020-07-28 13:28 Sudakshina Das
2020-07-31 15:13 ` Richard Sandiford
2020-08-05 12:32 ` Andreas Schwab
0 siblings, 2 replies; 6+ messages in thread
From: Sudakshina Das @ 2020-07-28 13:28 UTC (permalink / raw)
To: gcc-patches
[-- Attachment #1: Type: text/plain, Size: 6774 bytes --]
Hi
This is my attempt at reviving the old patch https://gcc.gnu.org/pipermail/gcc-patches/2019-January/514632.html
I have followed on Kyrill's comment upstream on the link above and I am using the recommended option iii that he mentioned.
"1) Adjust the copy_limit to 256 bits after checking AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS in the tuning.
2) Adjust aarch64_copy_one_block_and_progress_pointers to handle 256-bit moves. by iii:
iii) Emit explicit V4SI (or any other 128-bit vector mode) pairs ldp/stps. This wouldn't need any adjustments to
MD patterns, but would make aarch64_copy_one_block_and_progress_pointers more complex as it would now have
two paths, where one handles two adjacent memory addresses in one calls."
With this patch the following test
#define N 8
extern int src[N], dst[N];
void
foo (void)
{
__builtin_memcpy (dst, src, N * sizeof (int));
}
which was originally giving
foo:
adrp x1, src
add x1, x1, :lo12:src
ldp x4, x5, [x1]
adrp x0, dst
add x0, x0, :lo12:dst
ldp x2, x3, [x1, 16]
stp x4, x5, [x0]
stp x2, x3, [x0, 16]
ret
changes to the following
foo:
adrp x1, src
add x1, x1, :lo12:src
adrp x0, dst
add x0, x0, :lo12:dst
ldp q1, q0, [x1]
stp q1, q0, [x0]
ret
This gives about 1.3% improvement on 523.xalancbmk_r in SPEC2017 and an overall code size reduction on most
SPEC2017 Int benchmarks on Neoverse N1 due to more use of LDP/STP Q register pairs.
Bootstrapped and regression tested on aarch64-none-linux-gnu.
Is this ok for trunk?
Thanks
Sudi
gcc/ChangeLog:
2020-07-23 Sudakshina Das <sudi.das@arm.com>
Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* config/aarch64/aarch64.c (aarch64_gen_store_pair): Add case
for E_V4SImode.
(aarch64_gen_load_pair): Likewise.
(aarch64_copy_one_block_and_progress_pointers): Handle 256 bit copy.
(aarch64_expand_cpymem): Expand copy_limit to 256bits where
appropriate.
gcc/testsuite/ChangeLog:
2020-07-23 Sudakshina Das <sudi.das@arm.com>
Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* gcc.target/aarch64/cpymem-q-reg_1.c: New test.
* gcc.target/aarch64/large_struct_copy_2.c: Update for ldp q regs.
****************************** Attachment inlined **********************************
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 3fe1feaa80ccb0a287ee1c7ea1056e8f0a830532..a38ff39c4d5d53f056bbba3114ebaf8f0414c037 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -6920,6 +6920,9 @@ aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
case E_TFmode:
return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
+ case E_V4SImode:
+ return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
+
default:
gcc_unreachable ();
}
@@ -6943,6 +6946,9 @@ aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
case E_TFmode:
return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
+ case E_V4SImode:
+ return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2);
+
default:
gcc_unreachable ();
}
@@ -21097,6 +21103,27 @@ static void
aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
machine_mode mode)
{
+ /* Handle 256-bit memcpy separately. We do this by making 2 adjacent memory
+ address copies using V4SImode so that we can use Q registers. */
+ if (known_eq (GET_MODE_BITSIZE (mode), 256))
+ {
+ mode = V4SImode;
+ rtx reg1 = gen_reg_rtx (mode);
+ rtx reg2 = gen_reg_rtx (mode);
+ /* "Cast" the pointers to the correct mode. */
+ *src = adjust_address (*src, mode, 0);
+ *dst = adjust_address (*dst, mode, 0);
+ /* Emit the memcpy. */
+ emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
+ aarch64_progress_pointer (*src)));
+ emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
+ aarch64_progress_pointer (*dst), reg2));
+ /* Move the pointers forward. */
+ *src = aarch64_move_pointer (*src, 32);
+ *dst = aarch64_move_pointer (*dst, 32);
+ return;
+ }
+
rtx reg = gen_reg_rtx (mode);
/* "Cast" the pointers to the correct mode. */
@@ -21150,9 +21177,12 @@ aarch64_expand_cpymem (rtx *operands)
/* Convert n to bits to make the rest of the code simpler. */
n = n * BITS_PER_UNIT;
- /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
- larger than TImode, but we should not use them for loads/stores here. */
- const int copy_limit = GET_MODE_BITSIZE (TImode);
+ /* Maximum amount to copy in one go. We allow 256-bit chunks based on the
+ AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter and TARGET_SIMD. */
+ const int copy_limit = ((aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
+ || !TARGET_SIMD)
+ ? GET_MODE_BITSIZE (TImode) : 256;
while (n > 0)
{
diff --git a/gcc/testsuite/gcc.target/aarch64/cpymem-q-reg_1.c b/gcc/testsuite/gcc.target/aarch64/cpymem-q-reg_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..df5f67e425bd6a52362bc912afbcd1ca3725d449
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/cpymem-q-reg_1.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define N 8
+extern int src[N], dst[N];
+
+void
+foo (void)
+{
+ __builtin_memcpy (dst, src, N * sizeof (int));
+}
+
+/* { dg-final { scan-assembler {ldp\tq[0-9]*} } } */
+/* { dg-final { scan-assembler-not {ldp\tx[0-9]*} } } */
+/* { dg-final { scan-assembler {stp\tq[0-9]*} } } */
+/* { dg-final { scan-assembler-not {stp\tx[0-9]*} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/large_struct_copy_2.c b/gcc/testsuite/gcc.target/aarch64/large_struct_copy_2.c
index 565434244e8296749a99bebc4e095065945d825e..8ee0a9fae92f805573b763ec679f2a8045afb479 100644
--- a/gcc/testsuite/gcc.target/aarch64/large_struct_copy_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/large_struct_copy_2.c
@@ -16,11 +16,10 @@ struct S2 {
struct S0 f3;
};
-void fn1 () {
- struct S2 b = {0, 1, 7, 4073709551611, 4, 8, 7};
+void fn1 (struct S2 b) {
a = b.f3;
}
-/* { dg-final { scan-assembler-times {ldp\s+x[0-9]+} 2 } } */
-/* { dg-final { scan-assembler-times {stp\s+x[0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {ldp\s+q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {stp\s+q[0-9]+} 1 } } */
/* { dg-final { scan-assembler-not {ld[1-3]} } } */
[-- Attachment #2: rb13305.patch --]
[-- Type: application/octet-stream, Size: 4207 bytes --]
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 3fe1feaa80ccb0a287ee1c7ea1056e8f0a830532..a38ff39c4d5d53f056bbba3114ebaf8f0414c037 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -6920,6 +6920,9 @@ aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
case E_TFmode:
return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
+ case E_V4SImode:
+ return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
+
default:
gcc_unreachable ();
}
@@ -6943,6 +6946,9 @@ aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
case E_TFmode:
return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
+ case E_V4SImode:
+ return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2);
+
default:
gcc_unreachable ();
}
@@ -21097,6 +21103,27 @@ static void
aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
machine_mode mode)
{
+ /* Handle 256-bit memcpy separately. We do this by making 2 adjacent memory
+ address copies using V4SImode so that we can use Q registers. */
+ if (known_eq (GET_MODE_BITSIZE (mode), 256))
+ {
+ mode = V4SImode;
+ rtx reg1 = gen_reg_rtx (mode);
+ rtx reg2 = gen_reg_rtx (mode);
+ /* "Cast" the pointers to the correct mode. */
+ *src = adjust_address (*src, mode, 0);
+ *dst = adjust_address (*dst, mode, 0);
+ /* Emit the memcpy. */
+ emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
+ aarch64_progress_pointer (*src)));
+ emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
+ aarch64_progress_pointer (*dst), reg2));
+ /* Move the pointers forward. */
+ *src = aarch64_move_pointer (*src, 32);
+ *dst = aarch64_move_pointer (*dst, 32);
+ return;
+ }
+
rtx reg = gen_reg_rtx (mode);
/* "Cast" the pointers to the correct mode. */
@@ -21150,9 +21177,12 @@ aarch64_expand_cpymem (rtx *operands)
/* Convert n to bits to make the rest of the code simpler. */
n = n * BITS_PER_UNIT;
- /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
- larger than TImode, but we should not use them for loads/stores here. */
- const int copy_limit = GET_MODE_BITSIZE (TImode);
+ /* Maximum amount to copy in one go. We allow 256-bit chunks based on the
+ AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter and TARGET_SIMD. */
+ const int copy_limit = ((aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
+ || !TARGET_SIMD)
+ ? GET_MODE_BITSIZE (TImode) : 256;
while (n > 0)
{
diff --git a/gcc/testsuite/gcc.target/aarch64/cpymem-q-reg_1.c b/gcc/testsuite/gcc.target/aarch64/cpymem-q-reg_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..df5f67e425bd6a52362bc912afbcd1ca3725d449
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/cpymem-q-reg_1.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define N 8
+extern int src[N], dst[N];
+
+void
+foo (void)
+{
+ __builtin_memcpy (dst, src, N * sizeof (int));
+}
+
+/* { dg-final { scan-assembler {ldp\tq[0-9]*} } } */
+/* { dg-final { scan-assembler-not {ldp\tx[0-9]*} } } */
+/* { dg-final { scan-assembler {stp\tq[0-9]*} } } */
+/* { dg-final { scan-assembler-not {stp\tx[0-9]*} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/large_struct_copy_2.c b/gcc/testsuite/gcc.target/aarch64/large_struct_copy_2.c
index 565434244e8296749a99bebc4e095065945d825e..8ee0a9fae92f805573b763ec679f2a8045afb479 100644
--- a/gcc/testsuite/gcc.target/aarch64/large_struct_copy_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/large_struct_copy_2.c
@@ -16,11 +16,10 @@ struct S2 {
struct S0 f3;
};
-void fn1 () {
- struct S2 b = {0, 1, 7, 4073709551611, 4, 8, 7};
+void fn1 (struct S2 b) {
a = b.f3;
}
-/* { dg-final { scan-assembler-times {ldp\s+x[0-9]+} 2 } } */
-/* { dg-final { scan-assembler-times {stp\s+x[0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {ldp\s+q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {stp\s+q[0-9]+} 1 } } */
/* { dg-final { scan-assembler-not {ld[1-3]} } } */
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH V2] aarch64: Use Q-reg loads/stores in movmem expansion
2020-07-28 13:28 [PATCH V2] aarch64: Use Q-reg loads/stores in movmem expansion Sudakshina Das
@ 2020-07-31 15:13 ` Richard Sandiford
2020-08-04 11:13 ` Sudakshina Das
2020-08-05 12:32 ` Andreas Schwab
1 sibling, 1 reply; 6+ messages in thread
From: Richard Sandiford @ 2020-07-31 15:13 UTC (permalink / raw)
To: Sudakshina Das; +Cc: gcc-patches, Kyrylo Tkachov
Sudakshina Das <Sudi.Das@arm.com> writes:
> Hi
>
> This is my attempt at reviving the old patch https://gcc.gnu.org/pipermail/gcc-patches/2019-January/514632.html
>
> I have followed on Kyrill's comment upstream on the link above and I am using the recommended option iii that he mentioned.
> "1) Adjust the copy_limit to 256 bits after checking AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS in the tuning.
> 2) Adjust aarch64_copy_one_block_and_progress_pointers to handle 256-bit moves. by iii:
> iii) Emit explicit V4SI (or any other 128-bit vector mode) pairs ldp/stps. This wouldn't need any adjustments to
> MD patterns, but would make aarch64_copy_one_block_and_progress_pointers more complex as it would now have
> two paths, where one handles two adjacent memory addresses in one calls."
>
> With this patch the following test
>
> #define N 8
> extern int src[N], dst[N];
>
> void
> foo (void)
> {
> __builtin_memcpy (dst, src, N * sizeof (int));
> }
>
> which was originally giving
> foo:
> adrp x1, src
> add x1, x1, :lo12:src
> ldp x4, x5, [x1]
> adrp x0, dst
> add x0, x0, :lo12:dst
> ldp x2, x3, [x1, 16]
> stp x4, x5, [x0]
> stp x2, x3, [x0, 16]
> ret
>
>
> changes to the following
> foo:
> adrp x1, src
> add x1, x1, :lo12:src
> adrp x0, dst
> add x0, x0, :lo12:dst
> ldp q1, q0, [x1]
> stp q1, q0, [x0]
> ret
>
> This gives about 1.3% improvement on 523.xalancbmk_r in SPEC2017 and an overall code size reduction on most
> SPEC2017 Int benchmarks on Neoverse N1 due to more LDP/STP Q pair registers.
Sorry for the slow review. LGTM with a very minor nit (sorry)…
> @@ -21150,9 +21177,12 @@ aarch64_expand_cpymem (rtx *operands)
> /* Convert n to bits to make the rest of the code simpler. */
> n = n * BITS_PER_UNIT;
>
> - /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
> - larger than TImode, but we should not use them for loads/stores here. */
> - const int copy_limit = GET_MODE_BITSIZE (TImode);
> + /* Maximum amount to copy in one go. We allow 256-bit chunks based on the
> + AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter and TARGET_SIMD. */
> + const int copy_limit = ((aarch64_tune_params.extra_tuning_flags
> + & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
> + || !TARGET_SIMD)
> + ? GET_MODE_BITSIZE (TImode) : 256;
Should only be one space before “256”.
I guess at some point we should consider handling fixed-length SVE too,
but that's only worth it for -msve-vector-bits=512 and higher.
Thanks,
Richard
^ permalink raw reply [flat|nested] 6+ messages in thread
* RE: [PATCH V2] aarch64: Use Q-reg loads/stores in movmem expansion
2020-07-31 15:13 ` Richard Sandiford
@ 2020-08-04 11:13 ` Sudakshina Das
0 siblings, 0 replies; 6+ messages in thread
From: Sudakshina Das @ 2020-08-04 11:13 UTC (permalink / raw)
To: Richard Sandiford; +Cc: gcc-patches, Kyrylo Tkachov
Hi Richard
> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: 31 July 2020 16:14
> To: Sudakshina Das <Sudi.Das@arm.com>
> Cc: gcc-patches@gcc.gnu.org; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: Re: [PATCH V2] aarch64: Use Q-reg loads/stores in movmem
> expansion
>
> Sudakshina Das <Sudi.Das@arm.com> writes:
> > Hi
> >
> > This is my attempt at reviving the old patch
> > https://gcc.gnu.org/pipermail/gcc-patches/2019-January/514632.html
> >
> > I have followed on Kyrill's comment upstream on the link above and I am
> using the recommended option iii that he mentioned.
> > "1) Adjust the copy_limit to 256 bits after checking
> AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS in the tuning.
> > 2) Adjust aarch64_copy_one_block_and_progress_pointers to handle 256-
> bit moves. by iii:
> > iii) Emit explicit V4SI (or any other 128-bit vector mode) pairs ldp/stps. This
> wouldn't need any adjustments to
> > MD patterns, but would make
> aarch64_copy_one_block_and_progress_pointers more complex as it would
> now have
> > two paths, where one handles two adjacent memory addresses in one
> calls."
> >
> > With this patch the following test
> >
> > #define N 8
> > extern int src[N], dst[N];
> >
> > void
> > foo (void)
> > {
> > __builtin_memcpy (dst, src, N * sizeof (int)); }
> >
> > which was originally giving
> > foo:
> > adrp x1, src
> > add x1, x1, :lo12:src
> > ldp x4, x5, [x1]
> > adrp x0, dst
> > add x0, x0, :lo12:dst
> > ldp x2, x3, [x1, 16]
> > stp x4, x5, [x0]
> > stp x2, x3, [x0, 16]
> > ret
> >
> >
> > changes to the following
> > foo:
> > adrp x1, src
> > add x1, x1, :lo12:src
> > adrp x0, dst
> > add x0, x0, :lo12:dst
> > ldp q1, q0, [x1]
> > stp q1, q0, [x0]
> > ret
> >
> > This gives about 1.3% improvement on 523.xalancbmk_r in SPEC2017 and
> > an overall code size reduction on most
> > SPEC2017 Int benchmarks on Neoverse N1 due to more LDP/STP Q pair
> registers.
>
> Sorry for the slow review. LGTM with a very minor nit (sorry)…
Thanks. Committed with the change.
>
> > @@ -21150,9 +21177,12 @@ aarch64_expand_cpymem (rtx *operands)
> > /* Convert n to bits to make the rest of the code simpler. */
> > n = n * BITS_PER_UNIT;
> >
> > - /* Maximum amount to copy in one go. The AArch64 back-end has
> integer modes
> > - larger than TImode, but we should not use them for loads/stores here.
> */
> > - const int copy_limit = GET_MODE_BITSIZE (TImode);
> > + /* Maximum amount to copy in one go. We allow 256-bit chunks based
> on the
> > + AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter and
> > +TARGET_SIMD. */
> > + const int copy_limit = ((aarch64_tune_params.extra_tuning_flags
> > + & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
> > + || !TARGET_SIMD)
> > + ? GET_MODE_BITSIZE (TImode) : 256;
>
> Should only be one space before “256”.
>
> I guess at some point we should consider handling fixed-length SVE too, but
> that's only worth it for -msve-vector-bits=512 and higher.
Yes sure I will add this for future backlog.
>
> Thanks,
> Richard
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH V2] aarch64: Use Q-reg loads/stores in movmem expansion
2020-07-28 13:28 [PATCH V2] aarch64: Use Q-reg loads/stores in movmem expansion Sudakshina Das
2020-07-31 15:13 ` Richard Sandiford
@ 2020-08-05 12:32 ` Andreas Schwab
2020-08-05 13:51 ` Richard Sandiford
1 sibling, 1 reply; 6+ messages in thread
From: Andreas Schwab @ 2020-08-05 12:32 UTC (permalink / raw)
To: Sudakshina Das; +Cc: gcc-patches
This breaks bootstrap.
during RTL pass: final
../../gcc/timevar.c: In member function ‘void timer::push_internal(timer::timevar_def*)’:
../../gcc/timevar.c:373:1: internal compiler error: output_operand: invalid expression as operand
373 | }
| ^
0xbabdff output_operand_lossage(char const*, ...)
../../gcc/final.c:3609
0xbac5bb output_addr_const(_IO_FILE*, rtx_def*)
../../gcc/final.c:4166
0xbac04f output_address(machine_mode, rtx_def*)
../../gcc/final.c:4067
0xbabf8b output_operand(rtx_def*, int)
../../gcc/final.c:4051
0xbac9ff output_asm_insn(char const*, rtx_def**)
../../gcc/final.c:3963
0xbb09b7 output_asm_insn(char const*, rtx_def**)
../../gcc/final.c:3840
0xbb09b7 final_scan_insn_1
../../gcc/final.c:3106
0xbb1163 final_scan_insn(rtx_insn*, _IO_FILE*, int, int, int*)
../../gcc/final.c:3152
0xbb127b final_1
../../gcc/final.c:2020
0xbb1f2b rest_of_handle_final
../../gcc/final.c:4658
0xbb1f2b execute
../../gcc/final.c:4736
Andreas.
--
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510 2552 DF73 E780 A9DA AEC1
"And now for something completely different."
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH V2] aarch64: Use Q-reg loads/stores in movmem expansion
2020-08-05 12:32 ` Andreas Schwab
@ 2020-08-05 13:51 ` Richard Sandiford
2020-08-05 15:23 ` Sudakshina Das
0 siblings, 1 reply; 6+ messages in thread
From: Richard Sandiford @ 2020-08-05 13:51 UTC (permalink / raw)
To: Andreas Schwab; +Cc: Sudakshina Das, gcc-patches
[-- Attachment #1: Type: text/plain, Size: 180 bytes --]
Andreas Schwab <schwab@linux-m68k.org> writes:
> This breaks bootstrap.
I've pushed the below to fix this after bootstrapping & regression
testing on aarch64-linux-gnu.
Richard
[-- Attachment #2: 0001-aarch64-Add-missing-z-prefixes-to-LDP-STP-patterns.patch --]
[-- Type: text/plain, Size: 5046 bytes --]
From 4af98a21e10547679a643eed85d51aa5d7d2510b Mon Sep 17 00:00:00 2001
From: Richard Sandiford <richard.sandiford@arm.com>
Date: Wed, 5 Aug 2020 12:56:41 +0100
Subject: [PATCH] aarch64: Add missing %z prefixes to LDP/STP patterns
For LDP/STP Q, the memory operand might not be valid for "m",
so we need to use %z<N> instead of %<N> in the asm template.
This patch does that for all Ump LDP/STP patterns, regardless
of whether it's strictly needed.
This is needed to unbreak bootstrap.
2020-08-05 Richard Sandiford <richard.sandiford@arm.com>
gcc/
* config/aarch64/aarch64.md (load_pair_sw_<SX:mode><SX2:mode>)
(load_pair_dw_<DX:mode><DX2:mode>, load_pair_dw_tftf)
(store_pair_sw_<SX:mode><SX2:mode>)
(store_pair_dw_<DX:mode><DX2:mode>, store_pair_dw_tftf)
(*load_pair_extendsidi2_aarch64)
(*load_pair_zero_extendsidi2_aarch64): Use %z for the memory operand.
* config/aarch64/aarch64-simd.md (load_pair<DREG:mode><DREG2:mode>)
(vec_store_pair<DREG:mode><DREG2:mode>, load_pair<VQ:mode><VQ2:mode>)
(vec_store_pair<VQ:mode><VQ2:mode>): Likewise.
---
gcc/config/aarch64/aarch64-simd.md | 8 ++++----
gcc/config/aarch64/aarch64.md | 26 +++++++++++++-------------
2 files changed, 17 insertions(+), 17 deletions(-)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 11ebf5b93c4..381a702eba0 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -187,7 +187,7 @@ (define_insn "load_pair<DREG:mode><DREG2:mode>"
plus_constant (Pmode,
XEXP (operands[1], 0),
GET_MODE_SIZE (<DREG:MODE>mode)))"
- "ldp\\t%d0, %d2, %1"
+ "ldp\\t%d0, %d2, %z1"
[(set_attr "type" "neon_ldp")]
)
@@ -201,7 +201,7 @@ (define_insn "vec_store_pair<DREG:mode><DREG2:mode>"
plus_constant (Pmode,
XEXP (operands[0], 0),
GET_MODE_SIZE (<DREG:MODE>mode)))"
- "stp\\t%d1, %d3, %0"
+ "stp\\t%d1, %d3, %z0"
[(set_attr "type" "neon_stp")]
)
@@ -215,7 +215,7 @@ (define_insn "load_pair<VQ:mode><VQ2:mode>"
plus_constant (Pmode,
XEXP (operands[1], 0),
GET_MODE_SIZE (<VQ:MODE>mode)))"
- "ldp\\t%q0, %q2, %1"
+ "ldp\\t%q0, %q2, %z1"
[(set_attr "type" "neon_ldp_q")]
)
@@ -228,7 +228,7 @@ (define_insn "vec_store_pair<VQ:mode><VQ2:mode>"
plus_constant (Pmode,
XEXP (operands[0], 0),
GET_MODE_SIZE (<VQ:MODE>mode)))"
- "stp\\t%q1, %q3, %0"
+ "stp\\t%q1, %q3, %z0"
[(set_attr "type" "neon_stp_q")]
)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index df780b86370..25d77256b96 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1574,8 +1574,8 @@ (define_insn "load_pair_sw_<SX:mode><SX2:mode>"
XEXP (operands[1], 0),
GET_MODE_SIZE (<SX:MODE>mode)))"
"@
- ldp\\t%w0, %w2, %1
- ldp\\t%s0, %s2, %1"
+ ldp\\t%w0, %w2, %z1
+ ldp\\t%s0, %s2, %z1"
[(set_attr "type" "load_8,neon_load1_2reg")
(set_attr "arch" "*,fp")]
)
@@ -1591,8 +1591,8 @@ (define_insn "load_pair_dw_<DX:mode><DX2:mode>"
XEXP (operands[1], 0),
GET_MODE_SIZE (<DX:MODE>mode)))"
"@
- ldp\\t%x0, %x2, %1
- ldp\\t%d0, %d2, %1"
+ ldp\\t%x0, %x2, %z1
+ ldp\\t%d0, %d2, %z1"
[(set_attr "type" "load_16,neon_load1_2reg")
(set_attr "arch" "*,fp")]
)
@@ -1607,7 +1607,7 @@ (define_insn "load_pair_dw_tftf"
plus_constant (Pmode,
XEXP (operands[1], 0),
GET_MODE_SIZE (TFmode)))"
- "ldp\\t%q0, %q2, %1"
+ "ldp\\t%q0, %q2, %z1"
[(set_attr "type" "neon_ldp_q")
(set_attr "fp" "yes")]
)
@@ -1624,8 +1624,8 @@ (define_insn "store_pair_sw_<SX:mode><SX2:mode>"
XEXP (operands[0], 0),
GET_MODE_SIZE (<SX:MODE>mode)))"
"@
- stp\\t%w1, %w3, %0
- stp\\t%s1, %s3, %0"
+ stp\\t%w1, %w3, %z0
+ stp\\t%s1, %s3, %z0"
[(set_attr "type" "store_8,neon_store1_2reg")
(set_attr "arch" "*,fp")]
)
@@ -1641,8 +1641,8 @@ (define_insn "store_pair_dw_<DX:mode><DX2:mode>"
XEXP (operands[0], 0),
GET_MODE_SIZE (<DX:MODE>mode)))"
"@
- stp\\t%x1, %x3, %0
- stp\\t%d1, %d3, %0"
+ stp\\t%x1, %x3, %z0
+ stp\\t%d1, %d3, %z0"
[(set_attr "type" "store_16,neon_store1_2reg")
(set_attr "arch" "*,fp")]
)
@@ -1657,7 +1657,7 @@ (define_insn "store_pair_dw_tftf"
plus_constant (Pmode,
XEXP (operands[0], 0),
GET_MODE_SIZE (TFmode)))"
- "stp\\t%q1, %q3, %0"
+ "stp\\t%q1, %q3, %z0"
[(set_attr "type" "neon_stp_q")
(set_attr "fp" "yes")]
)
@@ -1790,7 +1790,7 @@ (define_insn "*load_pair_extendsidi2_aarch64"
plus_constant (Pmode,
XEXP (operands[1], 0),
GET_MODE_SIZE (SImode)))"
- "ldpsw\\t%0, %2, %1"
+ "ldpsw\\t%0, %2, %z1"
[(set_attr "type" "load_8")]
)
@@ -1819,8 +1819,8 @@ (define_insn "*load_pair_zero_extendsidi2_aarch64"
XEXP (operands[1], 0),
GET_MODE_SIZE (SImode)))"
"@
- ldp\t%w0, %w2, %1
- ldp\t%s0, %s2, %1"
+ ldp\t%w0, %w2, %z1
+ ldp\t%s0, %s2, %z1"
[(set_attr "type" "load_8,neon_load1_2reg")
(set_attr "arch" "*,fp")]
)
--
2.17.1
^ permalink raw reply [flat|nested] 6+ messages in thread
* RE: [PATCH V2] aarch64: Use Q-reg loads/stores in movmem expansion
2020-08-05 13:51 ` Richard Sandiford
@ 2020-08-05 15:23 ` Sudakshina Das
0 siblings, 0 replies; 6+ messages in thread
From: Sudakshina Das @ 2020-08-05 15:23 UTC (permalink / raw)
To: Richard Sandiford, Andreas Schwab; +Cc: gcc-patches
Hi Richard
Thank you for fixing this. I apologise for the trouble. I ran bootstrap only on an
earlier version of the patch, where I should have run it again on the final one! ☹
I will be more careful in the future,
Thanks
Sudi
> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: 05 August 2020 14:52
> To: Andreas Schwab <schwab@linux-m68k.org>
> Cc: Sudakshina Das <Sudi.Das@arm.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH V2] aarch64: Use Q-reg loads/stores in movmem
> expansion
>
> Andreas Schwab <schwab@linux-m68k.org> writes:
> > This breaks bootstrap.
>
> I've pushed the below to fix this after bootstrapping & regression testing on
> aarch64-linux-gnu.
>
> Richard
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2020-08-05 15:23 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-07-28 13:28 [PATCH V2] aarch64: Use Q-reg loads/stores in movmem expansion Sudakshina Das
2020-07-31 15:13 ` Richard Sandiford
2020-08-04 11:13 ` Sudakshina Das
2020-08-05 12:32 ` Andreas Schwab
2020-08-05 13:51 ` Richard Sandiford
2020-08-05 15:23 ` Sudakshina Das
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).