public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc(refs/vendors/ARM/heads/morello)] aarch64: Alternative-base support for structure loads and stores
@ 2022-05-05 12:07 Matthew Malcomson
  0 siblings, 0 replies; only message in thread
From: Matthew Malcomson @ 2022-05-05 12:07 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:07070a5388ca6a76e81d17da460f57b30afe7218

commit 07070a5388ca6a76e81d17da460f57b30afe7218
Author: Richard Sandiford <richard.sandiford@arm.com>
Date:   Fri Apr 8 15:08:24 2022 +0100

    aarch64: Alternative-base support for structure loads and stores
    
    OI, CI and XI represent tuples of 2, 3 and 4 vectors respectively.
    They are loaded and stored using multi-register forms of LD1 and ST1.
    However, there are no alternative-base forms of those instructions,
    so we need to split them into individual LDR Qs and STR Qs instead.
    This is similar to what we do for TI and TF, but much simpler,
    since there is no risk of a loaded Q register overlapping the
    addresses.
    
    On trunk, OI, CI and XI have been replaced by real vector modes,
    but this code should scale naturally to that.

Diff:
---
 gcc/config/aarch64/aarch64-simd.md                 |  34 +++++-
 gcc/config/aarch64/aarch64.c                       |  26 ++++-
 .../aarch64/morello/alt-base-load-v256-1.c         | 106 ++++++++++++++++++
 .../aarch64/morello/alt-base-load-v384-1.c         | 115 +++++++++++++++++++
 .../aarch64/morello/alt-base-load-v512-1.c         | 124 +++++++++++++++++++++
 .../aarch64/morello/alt-base-store-v256-1.c        | 106 ++++++++++++++++++
 .../aarch64/morello/alt-base-store-v256-2.c        |  38 +++++++
 .../aarch64/morello/alt-base-store-v384-1.c        | 115 +++++++++++++++++++
 .../aarch64/morello/alt-base-store-v384-2.c        |  38 +++++++
 .../aarch64/morello/alt-base-store-v512-1.c        | 124 +++++++++++++++++++++
 .../aarch64/morello/alt-base-store-v512-2.c        |  38 +++++++
 11 files changed, 853 insertions(+), 11 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index a82c662e867..1b1e5c02894 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -5334,18 +5334,22 @@
 )
 
 (define_insn "*aarch64_mov<mode>"
-  [(set (match_operand:VSTRUCT 0 "aarch64_simd_nonimmediate_operand" "=w,Utv,w")
-	(match_operand:VSTRUCT 1 "aarch64_simd_general_operand" " w,w,Utv"))]
+  [(set (match_operand:VSTRUCT 0 "nonimmediate_operand" "=w,Utv,w,UAa,w")
+	(match_operand:VSTRUCT 1 "general_operand" " w,w,Utv,w,UAa"))]
   "TARGET_SIMD && !BYTES_BIG_ENDIAN
    && (register_operand (operands[0], <MODE>mode)
        || register_operand (operands[1], <MODE>mode))"
   "@
    #
    st1\\t{%S1.16b - %<Vendreg>1.16b}, %0
-   ld1\\t{%S0.16b - %<Vendreg>0.16b}, %1"
-  [(set_attr "type" "multiple,neon_store<nregs>_<nregs>reg_q,\
+   ld1\\t{%S0.16b - %<Vendreg>0.16b}, %1
+   #
+   #"
+  [(set_attr "type" "multiple,neon_store<nregs>_<nregs>reg_q,
+		     neon_load<nregs>_<nregs>reg_q,
+		     neon_store<nregs>_<nregs>reg_q,
 		     neon_load<nregs>_<nregs>reg_q")
-   (set_attr "length" "<insn_count>,4,4")]
+   (set_attr "length" "<insn_count>,4,4,<insn_count>,<insn_count>")]
 )
 
 (define_insn "aarch64_be_ld1<mode>"
@@ -5403,6 +5407,26 @@
    (set_attr "length" "16,4,4")]
 )
 
+(define_split
+  [(set (match_operand:VSTRUCT 0 "nonimmediate_operand")
+	(match_operand:VSTRUCT 1 "general_operand"))]
+  "TARGET_SIMD
+   && reload_completed
+   && (aarch64_alt_base_mem_operand (operands[0], <MODE>mode)
+       || aarch64_alt_base_mem_operand (operands[1], <MODE>mode))"
+  [(const_int 0)]
+{
+  for (unsigned int i = 0; i < <insn_count> / 4; ++i)
+    {
+      machine_mode new_mode = V16QImode;
+      auto byte = i * GET_MODE_SIZE (new_mode);
+      rtx dst = simplify_gen_subreg (new_mode, operands[0], <MODE>mode, byte);
+      rtx src = simplify_gen_subreg (new_mode, operands[1], <MODE>mode, byte);
+      emit_move_insn (dst, src);
+    }
+  DONE;
+})
+
 (define_split
   [(set (match_operand:OI 0 "register_operand")
 	(match_operand:OI 1 "register_operand"))]
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 8d268af60eb..f2a72a0ca2f 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -2604,6 +2604,17 @@ aarch64_mem_addr_mode_p (machine_mode mode)
   return mode == Pmode;
 }
 
+/* Return true if MODE is the mode of "normal" (as opposed to alternative)
+   base registers.  */
+
+static bool
+aarch64_normal_base_mode_p (machine_mode mode)
+{
+  if (!CAPABILITY_MODE_P (Pmode) && mode == VOIDmode)
+    return true;
+  return mode == Pmode;
+}
+
 /* Implement TARGET_PREFERRED_ELSE_VALUE.  For binary operations,
    prefer to use the first arithmetic operand as the else value if
    the else value doesn't matter, since that exactly matches the SVE
@@ -9801,6 +9812,10 @@ aarch64_classify_address (struct aarch64_address_info *info,
 	ldp_stp_mode = DImode;
       ldr_str_mode = mode;
     }
+  /* There are no alternative-base forms of multi-register LD1 and ST1,
+     so we need to split structure moves into individual LDRs and STRs.  */
+  else if (alt_base_p && advsimd_struct_p)
+    split_mode = V16QImode;
   /* On BE, we use load/store pair for multi-vector load/stores.  */
   else if (BYTES_BIG_ENDIAN && advsimd_struct_p)
     {
@@ -9851,9 +9866,9 @@ aarch64_classify_address (struct aarch64_address_info *info,
 	  if (code != POST_INC)
 	    return false;
 	}
-      /* On LE, for AdvSIMD, don't support anything other than POST_INC or
-	 REG addressing.  */
-      else if (!BYTES_BIG_ENDIAN && code != REG)
+      /* Don't support anything other than POST_INC or REG addressing if
+	 we can use LD1 and ST1.  */
+      else if (!alt_base_p && !BYTES_BIG_ENDIAN && code != REG)
 	return false;
     }
 
@@ -19447,7 +19462,7 @@ aarch64_simd_mem_operand_p (rtx op)
   return (MEM_P (op)
 	  && (GET_CODE (XEXP (op, 0)) == POST_INC
 	      || REG_P (XEXP (op, 0)))
-	  && aarch64_mem_addr_mode_p (GET_MODE (XEXP (op, 0))));
+	  && aarch64_normal_base_mode_p (mem_address_mode (op)));
 }
 
 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction.  */
@@ -23783,8 +23798,7 @@ aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load)
 
       /* The address must use a normal rather than an alternative
 	 base register.  */
-      if (TARGET_CAPABILITY_HYBRID
-	  && CAPABILITY_MODE_P (mem_address_mode (mem[i])))
+      if (!aarch64_normal_base_mode_p (mem_address_mode (mem[i])))
 	return false;
 
       /* Check if the addresses are in the form of [base+offset].  */
diff --git a/gcc/testsuite/gcc.target/aarch64/morello/alt-base-load-v256-1.c b/gcc/testsuite/gcc.target/aarch64/morello/alt-base-load-v256-1.c
new file mode 100644
index 00000000000..772400f92ab
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/morello/alt-base-load-v256-1.c
@@ -0,0 +1,106 @@
+/* { dg-do assemble } */
+/* { dg-additional-options "-save-temps" } */
+/* { dg-final { check-function-bodies "**" ""  { {-O[123s]} } } } */
+/* { dg-skip-if "" { *-*-* } { "-mabi=purecap" "-mfake-capability" } { "" } }  */
+
+#define ALT_BASE
+#include "load-store-utils.h"
+
+#include <arm_neon.h>
+
+/*
+** load_q20_int8x16x2_t_m257:
+**	sub	(c[0-9]+), c0, #257
+**	ldr	q20, \[\1\]
+**	ldr	q21, \[\1, #?16\]
+**	ret
+*/
+LOAD_REG_OFFSET (q20, int8x16x2_t, m257)
+
+/*
+** load_q20_int8x16x2_t_m256:
+**	ldr	q20, \[c0, #?-256\]
+**	ldr	q21, \[c0, #?-240\]
+**	ret
+*/
+LOAD_REG_OFFSET (q20, int8x16x2_t, m256)
+
+/*
+** load_q20_int8x16x2_t_m255:
+**	ldr	q20, \[c0, #?-255\]
+**	ldr	q21, \[c0, #?-239\]
+**	ret
+*/
+LOAD_REG_OFFSET (q20, int8x16x2_t, m255)
+
+/*
+** load_q20_int8x16x2_t_m1:
+**	ldr	q20, \[c0, #?-1\]
+**	ldr	q21, \[c0, #?15\]
+**	ret
+*/
+LOAD_REG_OFFSET (q20, int8x16x2_t, m1)
+
+/*
+** load_q20_int8x16x2_t_1:
+**	ldr	q20, \[c0, #?1\]
+**	ldr	q21, \[c0, #?17\]
+**	ret
+*/
+LOAD_REG_OFFSET (q20, int8x16x2_t, 1)
+
+/*
+** load_q20_int8x16x2_t_239:
+**	ldr	q20, \[c0, #?239\]
+**	ldr	q21, \[c0, #?255\]
+**	ret
+*/
+LOAD_REG_OFFSET (q20, int8x16x2_t, 239)
+
+/*
+** load_q20_int8x16x2_t_240:
+**	add	(c[0-9]+), c0, #?240
+**	ldr	q20, \[\1\]
+**	ldr	q21, \[\1, #?16\]
+**	ret
+*/
+LOAD_REG_OFFSET (q20, int8x16x2_t, 240)
+
+/*
+** load_q20_int8x16x2_t_241:
+**	add	(c[0-9]+), c0, #?241
+**	ldr	q20, \[\1\]
+**	ldr	q21, \[\1, #?16\]
+**	ret
+*/
+LOAD_REG_OFFSET (q20, int8x16x2_t, 241)
+
+/*
+** load_q20_int8x16x2_t_256:
+**	add	(c[0-9]+), c0, #?256
+**	ldr	q20, \[\1\]
+**	ldr	q21, \[\1, #?16\]
+**	ret
+*/
+LOAD_REG_OFFSET (q20, int8x16x2_t, 256)
+
+/* Check for valid asm, but don't mandate a particular sequence.  */
+LOAD_REG_INDEX (q20, int8x16x2_t, int32_t, 1)
+LOAD_REG_INDEX (q20, int8x16x2_t, uint32_t, 1)
+LOAD_REG_INDEX (q20, int8x16x2_t, uint64_t, 1)
+
+LOAD_REG_INDEX (q20, int8x16x2_t, int32_t, 2)
+LOAD_REG_INDEX (q20, int8x16x2_t, uint32_t, 2)
+LOAD_REG_INDEX (q20, int8x16x2_t, uint64_t, 2)
+
+LOAD_REG_INDEX (q20, int8x16x2_t, int32_t, 4)
+LOAD_REG_INDEX (q20, int8x16x2_t, uint32_t, 4)
+LOAD_REG_INDEX (q20, int8x16x2_t, uint64_t, 4)
+
+LOAD_REG_INDEX (q20, int8x16x2_t, int32_t, 8)
+LOAD_REG_INDEX (q20, int8x16x2_t, uint32_t, 8)
+LOAD_REG_INDEX (q20, int8x16x2_t, uint64_t, 8)
+
+LOAD_REG_INDEX (q20, int8x16x2_t, int32_t, 16)
+LOAD_REG_INDEX (q20, int8x16x2_t, uint32_t, 16)
+LOAD_REG_INDEX (q20, int8x16x2_t, uint64_t, 16)
diff --git a/gcc/testsuite/gcc.target/aarch64/morello/alt-base-load-v384-1.c b/gcc/testsuite/gcc.target/aarch64/morello/alt-base-load-v384-1.c
new file mode 100644
index 00000000000..adf0365794d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/morello/alt-base-load-v384-1.c
@@ -0,0 +1,115 @@
+/* { dg-do assemble } */
+/* { dg-additional-options "-save-temps" } */
+/* { dg-final { check-function-bodies "**" ""  { {-O[123s]} } } } */
+/* { dg-skip-if "" { *-*-* } { "-mabi=purecap" "-mfake-capability" } { "" } }  */
+
+#define ALT_BASE
+#include "load-store-utils.h"
+
+#include <arm_neon.h>
+
+/*
+** load_q20_int8x16x3_t_m257:
+**	sub	(c[0-9]+), c0, #257
+**	ldr	q20, \[\1\]
+**	ldr	q21, \[\1, #?16\]
+**	ldr	q22, \[\1, #?32\]
+**	ret
+*/
+LOAD_REG_OFFSET (q20, int8x16x3_t, m257)
+
+/*
+** load_q20_int8x16x3_t_m256:
+**	ldr	q20, \[c0, #?-256\]
+**	ldr	q21, \[c0, #?-240\]
+**	ldr	q22, \[c0, #?-224\]
+**	ret
+*/
+LOAD_REG_OFFSET (q20, int8x16x3_t, m256)
+
+/*
+** load_q20_int8x16x3_t_m255:
+**	ldr	q20, \[c0, #?-255\]
+**	ldr	q21, \[c0, #?-239\]
+**	ldr	q22, \[c0, #?-223\]
+**	ret
+*/
+LOAD_REG_OFFSET (q20, int8x16x3_t, m255)
+
+/*
+** load_q20_int8x16x3_t_m1:
+**	ldr	q20, \[c0, #?-1\]
+**	ldr	q21, \[c0, #?15\]
+**	ldr	q22, \[c0, #?31\]
+**	ret
+*/
+LOAD_REG_OFFSET (q20, int8x16x3_t, m1)
+
+/*
+** load_q20_int8x16x3_t_1:
+**	ldr	q20, \[c0, #?1\]
+**	ldr	q21, \[c0, #?17\]
+**	ldr	q22, \[c0, #?33\]
+**	ret
+*/
+LOAD_REG_OFFSET (q20, int8x16x3_t, 1)
+
+/*
+** load_q20_int8x16x3_t_223:
+**	ldr	q20, \[c0, #?223\]
+**	ldr	q21, \[c0, #?239\]
+**	ldr	q22, \[c0, #?255\]
+**	ret
+*/
+LOAD_REG_OFFSET (q20, int8x16x3_t, 223)
+
+/*
+** load_q20_int8x16x3_t_224:
+**	add	(c[0-9]+), c0, #?224
+**	ldr	q20, \[\1\]
+**	ldr	q21, \[\1, #?16\]
+**	ldr	q22, \[\1, #?32\]
+**	ret
+*/
+LOAD_REG_OFFSET (q20, int8x16x3_t, 224)
+
+/*
+** load_q20_int8x16x3_t_225:
+**	add	(c[0-9]+), c0, #?225
+**	ldr	q20, \[\1\]
+**	ldr	q21, \[\1, #?16\]
+**	ldr	q22, \[\1, #?32\]
+**	ret
+*/
+LOAD_REG_OFFSET (q20, int8x16x3_t, 225)
+
+/*
+** load_q20_int8x16x3_t_256:
+**	add	(c[0-9]+), c0, #?256
+**	ldr	q20, \[\1\]
+**	ldr	q21, \[\1, #?16\]
+**	ldr	q22, \[\1, #?32\]
+**	ret
+*/
+LOAD_REG_OFFSET (q20, int8x16x3_t, 256)
+
+/* Check for valid asm, but don't mandate a particular sequence.  */
+LOAD_REG_INDEX (q20, int8x16x3_t, int32_t, 1)
+LOAD_REG_INDEX (q20, int8x16x3_t, uint32_t, 1)
+LOAD_REG_INDEX (q20, int8x16x3_t, uint64_t, 1)
+
+LOAD_REG_INDEX (q20, int8x16x3_t, int32_t, 2)
+LOAD_REG_INDEX (q20, int8x16x3_t, uint32_t, 2)
+LOAD_REG_INDEX (q20, int8x16x3_t, uint64_t, 2)
+
+LOAD_REG_INDEX (q20, int8x16x3_t, int32_t, 4)
+LOAD_REG_INDEX (q20, int8x16x3_t, uint32_t, 4)
+LOAD_REG_INDEX (q20, int8x16x3_t, uint64_t, 4)
+
+LOAD_REG_INDEX (q20, int8x16x3_t, int32_t, 8)
+LOAD_REG_INDEX (q20, int8x16x3_t, uint32_t, 8)
+LOAD_REG_INDEX (q20, int8x16x3_t, uint64_t, 8)
+
+LOAD_REG_INDEX (q20, int8x16x3_t, int32_t, 16)
+LOAD_REG_INDEX (q20, int8x16x3_t, uint32_t, 16)
+LOAD_REG_INDEX (q20, int8x16x3_t, uint64_t, 16)
diff --git a/gcc/testsuite/gcc.target/aarch64/morello/alt-base-load-v512-1.c b/gcc/testsuite/gcc.target/aarch64/morello/alt-base-load-v512-1.c
new file mode 100644
index 00000000000..db91ffa9bfd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/morello/alt-base-load-v512-1.c
@@ -0,0 +1,124 @@
+/* { dg-do assemble } */
+/* { dg-additional-options "-save-temps" } */
+/* { dg-final { check-function-bodies "**" ""  { {-O[123s]} } } } */
+/* { dg-skip-if "" { *-*-* } { "-mabi=purecap" "-mfake-capability" } { "" } }  */
+
+#define ALT_BASE
+#include "load-store-utils.h"
+
+#include <arm_neon.h>
+
+/*
+** load_q20_int8x16x4_t_m257:
+**	sub	(c[0-9]+), c0, #257
+**	ldr	q20, \[\1\]
+**	ldr	q21, \[\1, #?16\]
+**	ldr	q22, \[\1, #?32\]
+**	ldr	q23, \[\1, #?48\]
+**	ret
+*/
+LOAD_REG_OFFSET (q20, int8x16x4_t, m257)
+
+/*
+** load_q20_int8x16x4_t_m256:
+**	ldr	q20, \[c0, #?-256\]
+**	ldr	q21, \[c0, #?-240\]
+**	ldr	q22, \[c0, #?-224\]
+**	ldr	q23, \[c0, #?-208\]
+**	ret
+*/
+LOAD_REG_OFFSET (q20, int8x16x4_t, m256)
+
+/*
+** load_q20_int8x16x4_t_m255:
+**	ldr	q20, \[c0, #?-255\]
+**	ldr	q21, \[c0, #?-239\]
+**	ldr	q22, \[c0, #?-223\]
+**	ldr	q23, \[c0, #?-207\]
+**	ret
+*/
+LOAD_REG_OFFSET (q20, int8x16x4_t, m255)
+
+/*
+** load_q20_int8x16x4_t_m1:
+**	ldr	q20, \[c0, #?-1\]
+**	ldr	q21, \[c0, #?15\]
+**	ldr	q22, \[c0, #?31\]
+**	ldr	q23, \[c0, #?47\]
+**	ret
+*/
+LOAD_REG_OFFSET (q20, int8x16x4_t, m1)
+
+/*
+** load_q20_int8x16x4_t_1:
+**	ldr	q20, \[c0, #?1\]
+**	ldr	q21, \[c0, #?17\]
+**	ldr	q22, \[c0, #?33\]
+**	ldr	q23, \[c0, #?49\]
+**	ret
+*/
+LOAD_REG_OFFSET (q20, int8x16x4_t, 1)
+
+/*
+** load_q20_int8x16x4_t_207:
+**	ldr	q20, \[c0, #?207\]
+**	ldr	q21, \[c0, #?223\]
+**	ldr	q22, \[c0, #?239\]
+**	ldr	q23, \[c0, #?255\]
+**	ret
+*/
+LOAD_REG_OFFSET (q20, int8x16x4_t, 207)
+
+/*
+** load_q20_int8x16x4_t_208:
+**	add	(c[0-9]+), c0, #?208
+**	ldr	q20, \[\1\]
+**	ldr	q21, \[\1, #?16\]
+**	ldr	q22, \[\1, #?32\]
+**	ldr	q23, \[\1, #?48\]
+**	ret
+*/
+LOAD_REG_OFFSET (q20, int8x16x4_t, 208)
+
+/*
+** load_q20_int8x16x4_t_209:
+**	add	(c[0-9]+), c0, #?209
+**	ldr	q20, \[\1\]
+**	ldr	q21, \[\1, #?16\]
+**	ldr	q22, \[\1, #?32\]
+**	ldr	q23, \[\1, #?48\]
+**	ret
+*/
+LOAD_REG_OFFSET (q20, int8x16x4_t, 209)
+
+/*
+** load_q20_int8x16x4_t_256:
+**	add	(c[0-9]+), c0, #?256
+**	ldr	q20, \[\1\]
+**	ldr	q21, \[\1, #?16\]
+**	ldr	q22, \[\1, #?32\]
+**	ldr	q23, \[\1, #?48\]
+**	ret
+*/
+LOAD_REG_OFFSET (q20, int8x16x4_t, 256)
+
+/* Check for valid asm, but don't mandate a particular sequence.  */
+LOAD_REG_INDEX (q20, int8x16x4_t, int32_t, 1)
+LOAD_REG_INDEX (q20, int8x16x4_t, uint32_t, 1)
+LOAD_REG_INDEX (q20, int8x16x4_t, uint64_t, 1)
+
+LOAD_REG_INDEX (q20, int8x16x4_t, int32_t, 2)
+LOAD_REG_INDEX (q20, int8x16x4_t, uint32_t, 2)
+LOAD_REG_INDEX (q20, int8x16x4_t, uint64_t, 2)
+
+LOAD_REG_INDEX (q20, int8x16x4_t, int32_t, 4)
+LOAD_REG_INDEX (q20, int8x16x4_t, uint32_t, 4)
+LOAD_REG_INDEX (q20, int8x16x4_t, uint64_t, 4)
+
+LOAD_REG_INDEX (q20, int8x16x4_t, int32_t, 8)
+LOAD_REG_INDEX (q20, int8x16x4_t, uint32_t, 8)
+LOAD_REG_INDEX (q20, int8x16x4_t, uint64_t, 8)
+
+LOAD_REG_INDEX (q20, int8x16x4_t, int32_t, 16)
+LOAD_REG_INDEX (q20, int8x16x4_t, uint32_t, 16)
+LOAD_REG_INDEX (q20, int8x16x4_t, uint64_t, 16)
diff --git a/gcc/testsuite/gcc.target/aarch64/morello/alt-base-store-v256-1.c b/gcc/testsuite/gcc.target/aarch64/morello/alt-base-store-v256-1.c
new file mode 100644
index 00000000000..07a1d5a79d9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/morello/alt-base-store-v256-1.c
@@ -0,0 +1,106 @@
+/* { dg-do assemble } */
+/* { dg-additional-options "-save-temps" } */
+/* { dg-final { check-function-bodies "**" ""  { {-O[123s]} } } } */
+/* { dg-skip-if "" { *-*-* } { "-mabi=purecap" "-mfake-capability" } { "" } }  */
+
+#define ALT_BASE
+#include "load-store-utils.h"
+
+#include <arm_neon.h>
+
+/*
+** store_q20_int8x16x2_t_m257:
+**	sub	(c[0-9]+), c0, #257
+**	str	q20, \[\1\]
+**	str	q21, \[\1, #?16\]
+**	ret
+*/
+STORE_REG_OFFSET (q20, int8x16x2_t, m257)
+
+/*
+** store_q20_int8x16x2_t_m256:
+**	str	q20, \[c0, #?-256\]
+**	str	q21, \[c0, #?-240\]
+**	ret
+*/
+STORE_REG_OFFSET (q20, int8x16x2_t, m256)
+
+/*
+** store_q20_int8x16x2_t_m255:
+**	str	q20, \[c0, #?-255\]
+**	str	q21, \[c0, #?-239\]
+**	ret
+*/
+STORE_REG_OFFSET (q20, int8x16x2_t, m255)
+
+/*
+** store_q20_int8x16x2_t_m1:
+**	str	q20, \[c0, #?-1\]
+**	str	q21, \[c0, #?15\]
+**	ret
+*/
+STORE_REG_OFFSET (q20, int8x16x2_t, m1)
+
+/*
+** store_q20_int8x16x2_t_1:
+**	str	q20, \[c0, #?1\]
+**	str	q21, \[c0, #?17\]
+**	ret
+*/
+STORE_REG_OFFSET (q20, int8x16x2_t, 1)
+
+/*
+** store_q20_int8x16x2_t_239:
+**	str	q20, \[c0, #?239\]
+**	str	q21, \[c0, #?255\]
+**	ret
+*/
+STORE_REG_OFFSET (q20, int8x16x2_t, 239)
+
+/*
+** store_q20_int8x16x2_t_240:
+**	add	(c[0-9]+), c0, #?240
+**	str	q20, \[\1\]
+**	str	q21, \[\1, #?16\]
+**	ret
+*/
+STORE_REG_OFFSET (q20, int8x16x2_t, 240)
+
+/*
+** store_q20_int8x16x2_t_241:
+**	add	(c[0-9]+), c0, #?241
+**	str	q20, \[\1\]
+**	str	q21, \[\1, #?16\]
+**	ret
+*/
+STORE_REG_OFFSET (q20, int8x16x2_t, 241)
+
+/*
+** store_q20_int8x16x2_t_256:
+**	add	(c[0-9]+), c0, #?256
+**	str	q20, \[\1\]
+**	str	q21, \[\1, #?16\]
+**	ret
+*/
+STORE_REG_OFFSET (q20, int8x16x2_t, 256)
+
+/* Check for valid asm, but don't mandate a particular sequence.  */
+STORE_REG_INDEX (q20, int8x16x2_t, int32_t, 1)
+STORE_REG_INDEX (q20, int8x16x2_t, uint32_t, 1)
+STORE_REG_INDEX (q20, int8x16x2_t, uint64_t, 1)
+
+STORE_REG_INDEX (q20, int8x16x2_t, int32_t, 2)
+STORE_REG_INDEX (q20, int8x16x2_t, uint32_t, 2)
+STORE_REG_INDEX (q20, int8x16x2_t, uint64_t, 2)
+
+STORE_REG_INDEX (q20, int8x16x2_t, int32_t, 4)
+STORE_REG_INDEX (q20, int8x16x2_t, uint32_t, 4)
+STORE_REG_INDEX (q20, int8x16x2_t, uint64_t, 4)
+
+STORE_REG_INDEX (q20, int8x16x2_t, int32_t, 8)
+STORE_REG_INDEX (q20, int8x16x2_t, uint32_t, 8)
+STORE_REG_INDEX (q20, int8x16x2_t, uint64_t, 8)
+
+STORE_REG_INDEX (q20, int8x16x2_t, int32_t, 16)
+STORE_REG_INDEX (q20, int8x16x2_t, uint32_t, 16)
+STORE_REG_INDEX (q20, int8x16x2_t, uint64_t, 16)
diff --git a/gcc/testsuite/gcc.target/aarch64/morello/alt-base-store-v256-2.c b/gcc/testsuite/gcc.target/aarch64/morello/alt-base-store-v256-2.c
new file mode 100644
index 00000000000..8a9ab893e6a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/morello/alt-base-store-v256-2.c
@@ -0,0 +1,38 @@
+/* { dg-do assemble } */
+/* { dg-additional-options "-save-temps" } */
+
+#define ALT_BASE
+#include "load-store-utils.h"
+
+#include <arm_neon.h>
+
+/* Check for valid asm, but don't mandate a particular sequence.  */
+STORE_ZERO_OFFSET (int8x16x2_t, m257)
+STORE_ZERO_OFFSET (int8x16x2_t, m256)
+STORE_ZERO_OFFSET (int8x16x2_t, m255)
+STORE_ZERO_OFFSET (int8x16x2_t, m1)
+STORE_ZERO_OFFSET (int8x16x2_t, 1)
+STORE_ZERO_OFFSET (int8x16x2_t, 239)
+STORE_ZERO_OFFSET (int8x16x2_t, 240)
+STORE_ZERO_OFFSET (int8x16x2_t, 241)
+STORE_ZERO_OFFSET (int8x16x2_t, 256)
+
+STORE_ZERO_INDEX (int8x16x2_t, int32_t, 1)
+STORE_ZERO_INDEX (int8x16x2_t, uint32_t, 1)
+STORE_ZERO_INDEX (int8x16x2_t, uint64_t, 1)
+
+STORE_ZERO_INDEX (int8x16x2_t, int32_t, 2)
+STORE_ZERO_INDEX (int8x16x2_t, uint32_t, 2)
+STORE_ZERO_INDEX (int8x16x2_t, uint64_t, 2)
+
+STORE_ZERO_INDEX (int8x16x2_t, int32_t, 4)
+STORE_ZERO_INDEX (int8x16x2_t, uint32_t, 4)
+STORE_ZERO_INDEX (int8x16x2_t, uint64_t, 4)
+
+STORE_ZERO_INDEX (int8x16x2_t, int32_t, 8)
+STORE_ZERO_INDEX (int8x16x2_t, uint32_t, 8)
+STORE_ZERO_INDEX (int8x16x2_t, uint64_t, 8)
+
+STORE_ZERO_INDEX (int8x16x2_t, int32_t, 16)
+STORE_ZERO_INDEX (int8x16x2_t, uint32_t, 16)
+STORE_ZERO_INDEX (int8x16x2_t, uint64_t, 16)
diff --git a/gcc/testsuite/gcc.target/aarch64/morello/alt-base-store-v384-1.c b/gcc/testsuite/gcc.target/aarch64/morello/alt-base-store-v384-1.c
new file mode 100644
index 00000000000..06d2b481f13
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/morello/alt-base-store-v384-1.c
@@ -0,0 +1,115 @@
+/* { dg-do assemble } */
+/* { dg-additional-options "-save-temps" } */
+/* { dg-final { check-function-bodies "**" ""  { {-O[123s]} } } } */
+/* { dg-skip-if "" { *-*-* } { "-mabi=purecap" "-mfake-capability" } { "" } }  */
+
+#define ALT_BASE
+#include "load-store-utils.h"
+
+#include <arm_neon.h>
+
+/*
+** store_q20_int8x16x3_t_m257:
+**	sub	(c[0-9]+), c0, #257
+**	str	q20, \[\1\]
+**	str	q21, \[\1, #?16\]
+**	str	q22, \[\1, #?32\]
+**	ret
+*/
+STORE_REG_OFFSET (q20, int8x16x3_t, m257)
+
+/*
+** store_q20_int8x16x3_t_m256:
+**	str	q20, \[c0, #?-256\]
+**	str	q21, \[c0, #?-240\]
+**	str	q22, \[c0, #?-224\]
+**	ret
+*/
+STORE_REG_OFFSET (q20, int8x16x3_t, m256)
+
+/*
+** store_q20_int8x16x3_t_m255:
+**	str	q20, \[c0, #?-255\]
+**	str	q21, \[c0, #?-239\]
+**	str	q22, \[c0, #?-223\]
+**	ret
+*/
+STORE_REG_OFFSET (q20, int8x16x3_t, m255)
+
+/*
+** store_q20_int8x16x3_t_m1:
+**	str	q20, \[c0, #?-1\]
+**	str	q21, \[c0, #?15\]
+**	str	q22, \[c0, #?31\]
+**	ret
+*/
+STORE_REG_OFFSET (q20, int8x16x3_t, m1)
+
+/*
+** store_q20_int8x16x3_t_1:
+**	str	q20, \[c0, #?1\]
+**	str	q21, \[c0, #?17\]
+**	str	q22, \[c0, #?33\]
+**	ret
+*/
+STORE_REG_OFFSET (q20, int8x16x3_t, 1)
+
+/*
+** store_q20_int8x16x3_t_223:
+**	str	q20, \[c0, #?223\]
+**	str	q21, \[c0, #?239\]
+**	str	q22, \[c0, #?255\]
+**	ret
+*/
+STORE_REG_OFFSET (q20, int8x16x3_t, 223)
+
+/*
+** store_q20_int8x16x3_t_224:
+**	add	(c[0-9]+), c0, #?224
+**	str	q20, \[\1\]
+**	str	q21, \[\1, #?16\]
+**	str	q22, \[\1, #?32\]
+**	ret
+*/
+STORE_REG_OFFSET (q20, int8x16x3_t, 224)
+
+/*
+** store_q20_int8x16x3_t_225:
+**	add	(c[0-9]+), c0, #?225
+**	str	q20, \[\1\]
+**	str	q21, \[\1, #?16\]
+**	str	q22, \[\1, #?32\]
+**	ret
+*/
+STORE_REG_OFFSET (q20, int8x16x3_t, 225)
+
+/*
+** store_q20_int8x16x3_t_256:
+**	add	(c[0-9]+), c0, #?256
+**	str	q20, \[\1\]
+**	str	q21, \[\1, #?16\]
+**	str	q22, \[\1, #?32\]
+**	ret
+*/
+STORE_REG_OFFSET (q20, int8x16x3_t, 256)
+
+/* Check for valid asm, but don't mandate a particular sequence.  */
+STORE_REG_INDEX (q20, int8x16x3_t, int32_t, 1)
+STORE_REG_INDEX (q20, int8x16x3_t, uint32_t, 1)
+STORE_REG_INDEX (q20, int8x16x3_t, uint64_t, 1)
+
+STORE_REG_INDEX (q20, int8x16x3_t, int32_t, 2)
+STORE_REG_INDEX (q20, int8x16x3_t, uint32_t, 2)
+STORE_REG_INDEX (q20, int8x16x3_t, uint64_t, 2)
+
+STORE_REG_INDEX (q20, int8x16x3_t, int32_t, 4)
+STORE_REG_INDEX (q20, int8x16x3_t, uint32_t, 4)
+STORE_REG_INDEX (q20, int8x16x3_t, uint64_t, 4)
+
+STORE_REG_INDEX (q20, int8x16x3_t, int32_t, 8)
+STORE_REG_INDEX (q20, int8x16x3_t, uint32_t, 8)
+STORE_REG_INDEX (q20, int8x16x3_t, uint64_t, 8)
+
+STORE_REG_INDEX (q20, int8x16x3_t, int32_t, 16)
+STORE_REG_INDEX (q20, int8x16x3_t, uint32_t, 16)
+STORE_REG_INDEX (q20, int8x16x3_t, uint64_t, 16)
diff --git a/gcc/testsuite/gcc.target/aarch64/morello/alt-base-store-v384-2.c b/gcc/testsuite/gcc.target/aarch64/morello/alt-base-store-v384-2.c
new file mode 100644
index 00000000000..6eab2193574
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/morello/alt-base-store-v384-2.c
@@ -0,0 +1,38 @@
+/* { dg-do assemble } */
+/* { dg-additional-options "-save-temps" } */
+
+#define ALT_BASE
+#include "load-store-utils.h"
+
+#include <arm_neon.h>
+
+/* Check for valid asm, but don't mandate a particular sequence.  */
+STORE_ZERO_OFFSET (int8x16x3_t, m257)
+STORE_ZERO_OFFSET (int8x16x3_t, m256)
+STORE_ZERO_OFFSET (int8x16x3_t, m255)
+STORE_ZERO_OFFSET (int8x16x3_t, m1)
+STORE_ZERO_OFFSET (int8x16x3_t, 1)
+STORE_ZERO_OFFSET (int8x16x3_t, 223)
+STORE_ZERO_OFFSET (int8x16x3_t, 224)
+STORE_ZERO_OFFSET (int8x16x3_t, 225)
+STORE_ZERO_OFFSET (int8x16x3_t, 256)
+
+STORE_ZERO_INDEX (int8x16x3_t, int32_t, 1)
+STORE_ZERO_INDEX (int8x16x3_t, uint32_t, 1)
+STORE_ZERO_INDEX (int8x16x3_t, uint64_t, 1)
+
+STORE_ZERO_INDEX (int8x16x3_t, int32_t, 2)
+STORE_ZERO_INDEX (int8x16x3_t, uint32_t, 2)
+STORE_ZERO_INDEX (int8x16x3_t, uint64_t, 2)
+
+STORE_ZERO_INDEX (int8x16x3_t, int32_t, 4)
+STORE_ZERO_INDEX (int8x16x3_t, uint32_t, 4)
+STORE_ZERO_INDEX (int8x16x3_t, uint64_t, 4)
+
+STORE_ZERO_INDEX (int8x16x3_t, int32_t, 8)
+STORE_ZERO_INDEX (int8x16x3_t, uint32_t, 8)
+STORE_ZERO_INDEX (int8x16x3_t, uint64_t, 8)
+
+STORE_ZERO_INDEX (int8x16x3_t, int32_t, 16)
+STORE_ZERO_INDEX (int8x16x3_t, uint32_t, 16)
+STORE_ZERO_INDEX (int8x16x3_t, uint64_t, 16)
diff --git a/gcc/testsuite/gcc.target/aarch64/morello/alt-base-store-v512-1.c b/gcc/testsuite/gcc.target/aarch64/morello/alt-base-store-v512-1.c
new file mode 100644
index 00000000000..37d54f1b762
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/morello/alt-base-store-v512-1.c
@@ -0,0 +1,124 @@
+/* { dg-do assemble } */
+/* { dg-additional-options "-save-temps" } */
+/* { dg-final { check-function-bodies "**" ""  { {-O[123s]} } } } */
+/* { dg-skip-if "" { *-*-* } { "-mabi=purecap" "-mfake-capability" } { "" } }  */
+
+#define ALT_BASE
+#include "load-store-utils.h"
+
+#include <arm_neon.h>
+
+/*
+** store_q20_int8x16x4_t_m257:
+**	sub	(c[0-9]+), c0, #257
+**	str	q20, \[\1\]
+**	str	q21, \[\1, #?16\]
+**	str	q22, \[\1, #?32\]
+**	str	q23, \[\1, #?48\]
+**	ret
+*/
+STORE_REG_OFFSET (q20, int8x16x4_t, m257)
+
+/*
+** store_q20_int8x16x4_t_m256:
+**	str	q20, \[c0, #?-256\]
+**	str	q21, \[c0, #?-240\]
+**	str	q22, \[c0, #?-224\]
+**	str	q23, \[c0, #?-208\]
+**	ret
+*/
+STORE_REG_OFFSET (q20, int8x16x4_t, m256)
+
+/*
+** store_q20_int8x16x4_t_m255:
+**	str	q20, \[c0, #?-255\]
+**	str	q21, \[c0, #?-239\]
+**	str	q22, \[c0, #?-223\]
+**	str	q23, \[c0, #?-207\]
+**	ret
+*/
+STORE_REG_OFFSET (q20, int8x16x4_t, m255)
+
+/*
+** store_q20_int8x16x4_t_m1:
+**	str	q20, \[c0, #?-1\]
+**	str	q21, \[c0, #?15\]
+**	str	q22, \[c0, #?31\]
+**	str	q23, \[c0, #?47\]
+**	ret
+*/
+STORE_REG_OFFSET (q20, int8x16x4_t, m1)
+
+/*
+** store_q20_int8x16x4_t_1:
+**	str	q20, \[c0, #?1\]
+**	str	q21, \[c0, #?17\]
+**	str	q22, \[c0, #?33\]
+**	str	q23, \[c0, #?49\]
+**	ret
+*/
+STORE_REG_OFFSET (q20, int8x16x4_t, 1)
+
+/*
+** store_q20_int8x16x4_t_207:
+**	str	q20, \[c0, #?207\]
+**	str	q21, \[c0, #?223\]
+**	str	q22, \[c0, #?239\]
+**	str	q23, \[c0, #?255\]
+**	ret
+*/
+STORE_REG_OFFSET (q20, int8x16x4_t, 207)
+
+/*
+** store_q20_int8x16x4_t_208:
+**	add	(c[0-9]+), c0, #?208
+**	str	q20, \[\1\]
+**	str	q21, \[\1, #?16\]
+**	str	q22, \[\1, #?32\]
+**	str	q23, \[\1, #?48\]
+**	ret
+*/
+STORE_REG_OFFSET (q20, int8x16x4_t, 208)
+
+/*
+** store_q20_int8x16x4_t_209:
+**	add	(c[0-9]+), c0, #?209
+**	str	q20, \[\1\]
+**	str	q21, \[\1, #?16\]
+**	str	q22, \[\1, #?32\]
+**	str	q23, \[\1, #?48\]
+**	ret
+*/
+STORE_REG_OFFSET (q20, int8x16x4_t, 209)
+
+/*
+** store_q20_int8x16x4_t_256:
+**	add	(c[0-9]+), c0, #?256
+**	str	q20, \[\1\]
+**	str	q21, \[\1, #?16\]
+**	str	q22, \[\1, #?32\]
+**	str	q23, \[\1, #?48\]
+**	ret
+*/
+STORE_REG_OFFSET (q20, int8x16x4_t, 256)
+
+/* Check for valid asm, but don't mandate a particular sequence.  */
+STORE_REG_INDEX (q20, int8x16x4_t, int32_t, 1)
+STORE_REG_INDEX (q20, int8x16x4_t, uint32_t, 1)
+STORE_REG_INDEX (q20, int8x16x4_t, uint64_t, 1)
+
+STORE_REG_INDEX (q20, int8x16x4_t, int32_t, 2)
+STORE_REG_INDEX (q20, int8x16x4_t, uint32_t, 2)
+STORE_REG_INDEX (q20, int8x16x4_t, uint64_t, 2)
+
+STORE_REG_INDEX (q20, int8x16x4_t, int32_t, 4)
+STORE_REG_INDEX (q20, int8x16x4_t, uint32_t, 4)
+STORE_REG_INDEX (q20, int8x16x4_t, uint64_t, 4)
+
+STORE_REG_INDEX (q20, int8x16x4_t, int32_t, 8)
+STORE_REG_INDEX (q20, int8x16x4_t, uint32_t, 8)
+STORE_REG_INDEX (q20, int8x16x4_t, uint64_t, 8)
+
+STORE_REG_INDEX (q20, int8x16x4_t, int32_t, 16)
+STORE_REG_INDEX (q20, int8x16x4_t, uint32_t, 16)
+STORE_REG_INDEX (q20, int8x16x4_t, uint64_t, 16)
diff --git a/gcc/testsuite/gcc.target/aarch64/morello/alt-base-store-v512-2.c b/gcc/testsuite/gcc.target/aarch64/morello/alt-base-store-v512-2.c
new file mode 100644
index 00000000000..8d4f6864491
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/morello/alt-base-store-v512-2.c
@@ -0,0 +1,38 @@
+/* { dg-do assemble } */
+/* { dg-additional-options "-save-temps" } */
+
+#define ALT_BASE
+#include "load-store-utils.h"
+
+#include <arm_neon.h>
+
+/* Check for valid asm, but don't mandate a particular sequence.  */
+STORE_ZERO_OFFSET (int8x16x4_t, m257)
+STORE_ZERO_OFFSET (int8x16x4_t, m256)
+STORE_ZERO_OFFSET (int8x16x4_t, m255)
+STORE_ZERO_OFFSET (int8x16x4_t, m1)
+STORE_ZERO_OFFSET (int8x16x4_t, 1)
+STORE_ZERO_OFFSET (int8x16x4_t, 207)
+STORE_ZERO_OFFSET (int8x16x4_t, 208)
+STORE_ZERO_OFFSET (int8x16x4_t, 209)
+STORE_ZERO_OFFSET (int8x16x4_t, 256)
+
+STORE_ZERO_INDEX (int8x16x4_t, int32_t, 1)
+STORE_ZERO_INDEX (int8x16x4_t, uint32_t, 1)
+STORE_ZERO_INDEX (int8x16x4_t, uint64_t, 1)
+
+STORE_ZERO_INDEX (int8x16x4_t, int32_t, 2)
+STORE_ZERO_INDEX (int8x16x4_t, uint32_t, 2)
+STORE_ZERO_INDEX (int8x16x4_t, uint64_t, 2)
+
+STORE_ZERO_INDEX (int8x16x4_t, int32_t, 4)
+STORE_ZERO_INDEX (int8x16x4_t, uint32_t, 4)
+STORE_ZERO_INDEX (int8x16x4_t, uint64_t, 4)
+
+STORE_ZERO_INDEX (int8x16x4_t, int32_t, 8)
+STORE_ZERO_INDEX (int8x16x4_t, uint32_t, 8)
+STORE_ZERO_INDEX (int8x16x4_t, uint64_t, 8)
+
+STORE_ZERO_INDEX (int8x16x4_t, int32_t, 16)
+STORE_ZERO_INDEX (int8x16x4_t, uint32_t, 16)
+STORE_ZERO_INDEX (int8x16x4_t, uint64_t, 16)


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2022-05-05 12:07 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-05 12:07 [gcc(refs/vendors/ARM/heads/morello)] aarch64: Alternative-base support for structure loads and stores Matthew Malcomson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).