public inbox for gcc-patches@gcc.gnu.org
* aarch64: Add support for SME2
@ 2023-11-17 17:37 Richard Sandiford
  2023-11-17 17:38 ` [PATCH 1/5] aarch64: Add +sme2 Richard Sandiford
                   ` (4 more replies)
  0 siblings, 5 replies; 6+ messages in thread
From: Richard Sandiford @ 2023-11-17 17:37 UTC (permalink / raw)
  To: gcc-patches

This series of patches adds support for SME2.  It is gated behind
the earlier series for SME.

All of the detail is in the individual patch summaries.

Tested on aarch64-linux-gnu.

Richard


* [PATCH 1/5] aarch64: Add +sme2
  2023-11-17 17:37 aarch64: Add support for SME2 Richard Sandiford
@ 2023-11-17 17:38 ` Richard Sandiford
  2023-11-17 17:39 ` [PATCH 2/5] aarch64: Add svcount_t Richard Sandiford
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Richard Sandiford @ 2023-11-17 17:38 UTC (permalink / raw)
  To: gcc-patches
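
Not in the patch itself, but as a usage sketch: once the rest of the
series is in, enabling the extension should define __ARM_FEATURE_SME2,
which is what the new effective-target test keys off.  The file name
and -march base below are illustrative:

  /* check_sme2.c: compile with gcc -march=armv9-a+sme2 -S check_sme2.c  */
  #if !defined (__ARM_FEATURE_SME2)
  #error "expected +sme2 to define __ARM_FEATURE_SME2"
  #endif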

gcc/
	* doc/invoke.texi: Document +sme2.
	* doc/sourcebuild.texi: Document aarch64_sme2.
	* config/aarch64/aarch64-option-extensions.def (AARCH64_OPT_EXTENSION):
	Add sme2.
	* config/aarch64/aarch64.h (AARCH64_ISA_SME2, TARGET_SME2): New macros.

gcc/testsuite/
	* lib/target-supports.exp (check_effective_target_aarch64_sme2): New
	target test.
	(check_effective_target_aarch64_asm_sme2_ok): Likewise.
---
 gcc/config/aarch64/aarch64-option-extensions.def |  2 ++
 gcc/config/aarch64/aarch64.h                     |  4 ++++
 gcc/doc/invoke.texi                              |  3 ++-
 gcc/doc/sourcebuild.texi                         |  2 ++
 gcc/testsuite/lib/target-supports.exp            | 14 +++++++++++++-
 5 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def
index 1480e498bbb..c156d2ee76a 100644
--- a/gcc/config/aarch64/aarch64-option-extensions.def
+++ b/gcc/config/aarch64/aarch64-option-extensions.def
@@ -157,4 +157,6 @@ AARCH64_OPT_EXTENSION("sme-i16i64", SME_I16I64, (SME), (), (), "")
 
 AARCH64_OPT_EXTENSION("sme-f64f64", SME_F64F64, (SME), (), (), "")
 
+AARCH64_OPT_EXTENSION("sme2", SME2, (SME), (), (), "sme2")
+
 #undef AARCH64_OPT_EXTENSION
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 9f690809e79..14205ce34b3 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -227,6 +227,7 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = AARCH64_FL_SM_OFF;
 #define AARCH64_ISA_SME		   (aarch64_isa_flags & AARCH64_FL_SME)
 #define AARCH64_ISA_SME_I16I64	   (aarch64_isa_flags & AARCH64_FL_SME_I16I64)
 #define AARCH64_ISA_SME_F64F64	   (aarch64_isa_flags & AARCH64_FL_SME_F64F64)
+#define AARCH64_ISA_SME2	   (aarch64_isa_flags & AARCH64_FL_SME2)
 #define AARCH64_ISA_V8_3A	   (aarch64_isa_flags & AARCH64_FL_V8_3A)
 #define AARCH64_ISA_DOTPROD	   (aarch64_isa_flags & AARCH64_FL_DOTPROD)
 #define AARCH64_ISA_AES	           (aarch64_isa_flags & AARCH64_FL_AES)
@@ -332,6 +333,9 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = AARCH64_FL_SM_OFF;
 /* The FEAT_SME_F64F64 extension to SME, enabled through +sme-f64f64.  */
 #define TARGET_SME_F64F64 (AARCH64_ISA_SME_F64F64)
 
+/* SME2 instructions, enabled through +sme2.  */
+#define TARGET_SME2 (AARCH64_ISA_SME2)
+
 /* ARMv8.3-A features.  */
 #define TARGET_ARMV8_3	(AARCH64_ISA_V8_3A)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index bc56170aadb..475244bb4ff 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -21065,7 +21065,8 @@ Enable the Scalable Matrix Extension.
 Enable the FEAT_SME_I16I64 extension to SME.
 @item sme-f64f64
 Enable the FEAT_SME_F64F64 extension to SME.
-
+@item sme2
+Enable the Scalable Matrix Extension 2.  This also enables SME instructions.
 @end table
 
 Feature @option{crypto} implies @option{aes}, @option{sha2}, and @option{simd},
diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi
index 448f5e08578..8d8d21f9fee 100644
--- a/gcc/doc/sourcebuild.texi
+++ b/gcc/doc/sourcebuild.texi
@@ -2318,6 +2318,8 @@ Binutils installed on test system supports relocation types required by -fpic
 for AArch64 small memory model.
 @item aarch64_sme
 AArch64 target that generates instructions for SME.
+@item aarch64_sme2
+AArch64 target that generates instructions for SME2.
 @item aarch64_sve_hw
 AArch64 target that is able to generate and execute SVE code (regardless of
 whether it does so by default).
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index b9061e5a552..87ee26f9119 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -4425,6 +4425,18 @@ proc check_effective_target_aarch64_sme { } {
     }]
 }
 
+# Return 1 if this is an AArch64 target that generates instructions for SME2.
+proc check_effective_target_aarch64_sme2 { } {
+    if { ![istarget aarch64*-*-*] } {
+	return 0
+    }
+    return [check_no_compiler_messages aarch64_sme2 assembly {
+	#if !defined (__ARM_FEATURE_SME2)
+	#error FOO
+	#endif
+    }]
+}
+
 # Return 1 if this is a compiler supporting ARC atomic operations
 proc check_effective_target_arc_atomic { } {
     return [check_no_compiler_messages arc_atomic assembly {
@@ -11621,7 +11633,7 @@ proc check_effective_target_aarch64_tiny { } {
 
 foreach { aarch64_ext } { "fp" "simd" "crypto" "crc" "lse" "dotprod" "sve"
 			  "i8mm" "f32mm" "f64mm" "bf16" "sb" "sve2" "ls64"
-			  "sme" "sme-i16i64" } {
+			  "sme" "sme-i16i64" "sme2" } {
     eval [string map [list FUNC $aarch64_ext] {
 	proc check_effective_target_aarch64_asm_FUNC_ok { } {
 	  if { [istarget aarch64*-*-*] } {
-- 
2.25.1



* [PATCH 2/5] aarch64: Add svcount_t
  2023-11-17 17:37 aarch64: Add support for SME2 Richard Sandiford
  2023-11-17 17:38 ` [PATCH 1/5] aarch64: Add +sme2 Richard Sandiford
@ 2023-11-17 17:39 ` Richard Sandiford
  2023-11-17 17:39 ` [PATCH 3/5] aarch64: Add svboolx2_t Richard Sandiford
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Richard Sandiford @ 2023-11-17 17:39 UTC (permalink / raw)
  To: gcc-patches

Some SME2 instructions interpret predicates as counters, rather than
as bit-per-byte masks.  The SME2 ACLE defines an svcount_t type for
this interpretation.

I don't think we have a better way of representing counters than
the VNx16BI that we use for masks.  The patch therefore doesn't
add a new mode for this representation.  It's just something that
is interpreted in context, a bit like signed vs. unsigned integers.
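
As a source-level sketch of the new b<->c reinterprets (it needs an
arm_sve.h with the SME2 additions):

  #include <arm_sve.h>

  /* Both types live in predicate registers and share the VNx16BI
     representation, so the tied form is expected to fold away and the
     untied form to become a plain predicate move.  */
  svbool_t as_mask (svcount_t pn) { return svreinterpret_b (pn); }
  svcount_t as_counter (svbool_t pg) { return svreinterpret_c (pg); }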

gcc/
	* config/aarch64/aarch64-sve-builtins-base.cc
	(svreinterpret_impl::fold): Handle reinterprets between svbool_t
	and svcount_t.
	(svreinterpret_impl::expand): Likewise.
	* config/aarch64/aarch64-sve-builtins-base.def (svreinterpret): Add
	b<->c forms.
	* config/aarch64/aarch64-sve-builtins.cc (TYPES_reinterpret_b): New
	type suffix list.
	(wrap_type_in_struct, register_type_decl): New functions, split out
	from...
	(register_tuple_type): ...here.
	(register_builtin_types): Handle svcount_t.
	(handle_arm_sve_h): Don't create tuples of svcount_t.
	* config/aarch64/aarch64-sve-builtins.def (svcount_t): New type.
	(c): New type suffix.
	* config/aarch64/aarch64-sve-builtins.h (TYPE_count): New type class.

gcc/testsuite/
	* g++.target/aarch64/sve/acle/general-c++/mangle_1.C: Add test
	for svcount_t.
	* g++.target/aarch64/sve/acle/general-c++/mangle_2.C: Likewise.
	* g++.target/aarch64/sve/acle/general-c++/svcount_1.C: New test.
	* gcc.target/aarch64/sve/acle/asm/test_sve_acle.h (TEST_DUAL_P)
	(TEST_DUAL_P_REV): New macros.
	* gcc.target/aarch64/sve/acle/asm/reinterpret_b.c: New test.
	* gcc.target/aarch64/sve/acle/general-c/load_1.c: Test passing
	an svcount_t.
	* gcc.target/aarch64/sve/acle/general-c/svcount_1.c: New test.
	* gcc.target/aarch64/sve/acle/general-c/unary_convert_1.c: Test
	reinterprets involving svcount_t.
	* gcc.target/aarch64/sve/acle/general/attributes_7.c: Test svcount_t.
	* gcc.target/aarch64/sve/pcs/annotate_1.c: Likewise.
	* gcc.target/aarch64/sve/pcs/annotate_2.c: Likewise.
	* gcc.target/aarch64/sve/pcs/args_12.c: New test.
---
 .../aarch64/aarch64-sve-builtins-base.cc      |   8 +-
 .../aarch64/aarch64-sve-builtins-base.def     |   1 +
 gcc/config/aarch64/aarch64-sve-builtins.cc    | 157 ++++++++-----
 gcc/config/aarch64/aarch64-sve-builtins.def   |   2 +
 gcc/config/aarch64/aarch64-sve-builtins.h     |   4 +-
 .../aarch64/sve/acle/general-c++/mangle_1.C   |   2 +
 .../aarch64/sve/acle/general-c++/mangle_2.C   |   2 +
 .../aarch64/sve/acle/general-c++/svcount_1.C  |  10 +
 .../aarch64/sve/acle/asm/reinterpret_b.c      |  20 ++
 .../aarch64/sve/acle/asm/test_sve_acle.h      |  15 ++
 .../aarch64/sve/acle/general-c/load_1.c       |   4 +-
 .../aarch64/sve/acle/general-c/svcount_1.c    |  10 +
 .../sve/acle/general-c/unary_convert_1.c      |   8 +-
 .../aarch64/sve/acle/general/attributes_7.c   |   1 +
 .../gcc.target/aarch64/sve/pcs/annotate_1.c   |   4 +
 .../gcc.target/aarch64/sve/pcs/annotate_2.c   |   4 +
 .../gcc.target/aarch64/sve/pcs/args_12.c      | 214 ++++++++++++++++++
 17 files changed, 402 insertions(+), 64 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/svcount_1.C
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_b.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/svcount_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/pcs/args_12.c

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index 5b75b903e5f..7d9ec5a911f 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -2166,8 +2166,9 @@ public:
 
     /* Punt to rtl if the effect of the reinterpret on registers does not
        conform to GCC's endianness model.  */
-    if (!targetm.can_change_mode_class (f.vector_mode (0),
-					f.vector_mode (1), FP_REGS))
+    if (GET_MODE_CLASS (f.vector_mode (0)) != MODE_VECTOR_BOOL
+	&& !targetm.can_change_mode_class (f.vector_mode (0),
+					   f.vector_mode (1), FP_REGS))
       return NULL;
 
     /* Otherwise svreinterpret corresponds directly to a VIEW_CONVERT_EXPR
@@ -2181,6 +2182,9 @@ public:
   expand (function_expander &e) const override
   {
     machine_mode mode = e.tuple_mode (0);
+    /* Handle svbool_t <-> svcount_t.  */
+    if (mode == e.tuple_mode (1))
+      return e.args[0];
     return e.use_exact_insn (code_for_aarch64_sve_reinterpret (mode));
   }
 };
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.def b/gcc/config/aarch64/aarch64-sve-builtins-base.def
index ac53f35220d..a742c7bbc56 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.def
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.def
@@ -198,6 +198,7 @@ DEF_SVE_FUNCTION (svrecpe, unary, all_float, none)
 DEF_SVE_FUNCTION (svrecps, binary, all_float, none)
 DEF_SVE_FUNCTION (svrecpx, unary, all_float, mxz)
 DEF_SVE_FUNCTION_GS (svreinterpret, reinterpret, reinterpret, x1234, none)
+DEF_SVE_FUNCTION (svreinterpret, reinterpret, reinterpret_b, none)
 DEF_SVE_FUNCTION (svrev, unary, all_data, none)
 DEF_SVE_FUNCTION (svrev, unary_pred, all_pred, none)
 DEF_SVE_FUNCTION (svrevb, unary, hsd_integer, mxz)
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc
index 5f2062fa3e2..a6973206951 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
@@ -430,6 +430,12 @@ CONSTEXPR const group_suffix_info group_suffixes[] = {
   TYPES_reinterpret1 (D, u32), \
   TYPES_reinterpret1 (D, u64)
 
+/* _b_c
+   _c_b.  */
+#define TYPES_reinterpret_b(S, D) \
+  D (b, c), \
+  D (c, b)
+
 /* { _b8 _b16 _b32 _b64 } x { _s32 _s64 }
 			    { _u32 _u64 } */
 #define TYPES_while1(D, bn) \
@@ -579,6 +585,7 @@ DEF_SVE_TYPES_ARRAY (cvt_narrow_s);
 DEF_SVE_TYPES_ARRAY (cvt_narrow);
 DEF_SVE_TYPES_ARRAY (inc_dec_n);
 DEF_SVE_TYPES_ARRAY (reinterpret);
+DEF_SVE_TYPES_ARRAY (reinterpret_b);
 DEF_SVE_TYPES_ARRAY (while);
 DEF_SVE_TYPES_ARRAY (all_za);
 DEF_SVE_TYPES_ARRAY (d_za);
@@ -3669,6 +3676,49 @@ function_expander::expand ()
   return base->expand (*this);
 }
 
+/* Return a structure type that contains a single field of type FIELD_TYPE.
+   The field is called __val, but that's an internal detail rather than
+   an exposed part of the API.  */
+static tree
+wrap_type_in_struct (tree field_type)
+{
+  tree field = build_decl (input_location, FIELD_DECL,
+			   get_identifier ("__val"), field_type);
+  tree struct_type = lang_hooks.types.make_type (RECORD_TYPE);
+  DECL_FIELD_CONTEXT (field) = struct_type;
+  TYPE_FIELDS (struct_type) = field;
+  make_type_sizeless (struct_type);
+  layout_type (struct_type);
+  return struct_type;
+}
+
+/* Register a built-in TYPE_DECL called NAME for TYPE.  This is used/needed
+   when TYPE is a structure type.  */
+static void
+register_type_decl (tree type, const char *name)
+{
+  tree decl = build_decl (input_location, TYPE_DECL,
+			  get_identifier (name), type);
+  TYPE_NAME (type) = decl;
+  TYPE_STUB_DECL (type) = decl;
+  lang_hooks.decls.pushdecl (decl);
+  /* ??? Undo the effect of set_underlying_type for C.  The C frontend
+     doesn't recognize DECL as a built-in because (as intended) the decl has
+     a real location instead of BUILTINS_LOCATION.  The frontend therefore
+     treats the decl like a normal C "typedef struct foo foo;", expecting
+     the type for tag "struct foo" to have a dummy unnamed TYPE_DECL instead
+     of the named one we attached above.  It then sets DECL_ORIGINAL_TYPE
+     on the supposedly unnamed decl, creating a circularity that upsets
+     dwarf2out.
+
+     We don't want to follow the normal C model and create "struct foo"
+     tags for tuple types since (a) the types are supposed to be opaque
+     and (b) they couldn't be defined as a real struct anyway.  Treating
+     the TYPE_DECLs as "typedef struct foo foo;" without creating
+     "struct foo" would lead to confusing error messages.  */
+  DECL_ORIGINAL_TYPE (decl) = NULL_TREE;
+}
+
 /* Register the built-in SVE ABI types, such as __SVBool_t.  */
 static void
 register_builtin_types ()
@@ -3679,48 +3729,63 @@ register_builtin_types ()
 
   for (unsigned int i = 0; i < NUM_VECTOR_TYPES; ++i)
     {
-      tree eltype = scalar_types[i];
       tree vectype;
       unsigned int num_zr = 0, num_pr = 0;
-      if (eltype == boolean_type_node)
+      if (vector_type_index (i) == VECTOR_TYPE_svcount_t)
 	{
-	  vectype = build_truth_vector_type_for_mode (BYTES_PER_SVE_VECTOR,
-						      VNx16BImode);
-	  gcc_assert (TYPE_MODE (vectype) == VNx16BImode
-		      && TYPE_MODE (vectype) == TYPE_MODE_RAW (vectype)
-		      && TYPE_ALIGN (vectype) == 16
-		      && known_eq (wi::to_poly_offset (TYPE_SIZE (vectype)),
-				   BYTES_PER_SVE_VECTOR));
+	  vectype = abi_vector_types[VECTOR_TYPE_svbool_t];
+	  vectype = wrap_type_in_struct (vectype);
 	  num_pr = 1;
 	}
       else
 	{
-	  scalar_mode elmode = SCALAR_TYPE_MODE (eltype);
-	  unsigned int elbytes = GET_MODE_SIZE (elmode);
-	  poly_uint64 nunits = exact_div (BYTES_PER_SVE_VECTOR, elbytes);
-	  machine_mode mode
-	    = aarch64_sve_data_mode (elmode, nunits).require ();
-	  vectype = build_vector_type_for_mode (eltype, mode);
-	  gcc_assert (VECTOR_MODE_P (TYPE_MODE (vectype))
-		      && TYPE_MODE (vectype) == mode
-		      && TYPE_MODE_RAW (vectype) == mode
-		      && TYPE_ALIGN (vectype) == 128
-		      && known_eq (wi::to_poly_offset (TYPE_SIZE (vectype)),
-				   BITS_PER_SVE_VECTOR));
-	  num_zr = 1;
+	  tree eltype = scalar_types[i];
+	  if (eltype == boolean_type_node)
+	    {
+	      vectype = build_truth_vector_type_for_mode (BYTES_PER_SVE_VECTOR,
+							  VNx16BImode);
+	      num_pr = 1;
+	    }
+	  else
+	    {
+	      scalar_mode elmode = SCALAR_TYPE_MODE (eltype);
+	      unsigned int elbytes = GET_MODE_SIZE (elmode);
+	      poly_uint64 nunits = exact_div (BYTES_PER_SVE_VECTOR, elbytes);
+	      machine_mode mode
+		= aarch64_sve_data_mode (elmode, nunits).require ();
+	      vectype = build_vector_type_for_mode (eltype, mode);
+	      auto size = wi::to_poly_offset (TYPE_SIZE (vectype));
+	      gcc_assert (VECTOR_MODE_P (TYPE_MODE (vectype))
+			  && TYPE_MODE (vectype) == mode
+			  && TYPE_MODE_RAW (vectype) == mode
+			  && TYPE_ALIGN (vectype) == 128
+			  && known_eq (size, BITS_PER_SVE_VECTOR));
+	      num_zr = 1;
+	    }
+	  vectype = build_distinct_type_copy (vectype);
+	  gcc_assert (vectype == TYPE_MAIN_VARIANT (vectype));
+	  SET_TYPE_STRUCTURAL_EQUALITY (vectype);
+	  TYPE_ARTIFICIAL (vectype) = 1;
+	  TYPE_INDIVISIBLE_P (vectype) = 1;
+	  make_type_sizeless (vectype);
+	}
+      if (num_pr)
+	{
+	  auto size = wi::to_poly_offset (TYPE_SIZE (vectype));
+	  gcc_assert (TYPE_MODE (vectype) == VNx16BImode
+		      && TYPE_MODE (vectype) == TYPE_MODE_RAW (vectype)
+		      && TYPE_ALIGN (vectype) == 16
+		      && known_eq (size, BYTES_PER_SVE_VECTOR));
 	}
-      vectype = build_distinct_type_copy (vectype);
-      gcc_assert (vectype == TYPE_MAIN_VARIANT (vectype));
-      SET_TYPE_STRUCTURAL_EQUALITY (vectype);
-      TYPE_ARTIFICIAL (vectype) = 1;
-      TYPE_INDIVISIBLE_P (vectype) = 1;
       add_sve_type_attribute (vectype, num_zr, num_pr,
 			      vector_types[i].mangled_name,
 			      vector_types[i].acle_name);
-      make_type_sizeless (vectype);
       abi_vector_types[i] = vectype;
-      lang_hooks.types.register_builtin_type (vectype,
-					      vector_types[i].abi_name);
+      if (TREE_CODE (vectype) == RECORD_TYPE)
+	register_type_decl (vectype, vector_types[i].abi_name);
+      else
+	lang_hooks.types.register_builtin_type (vectype,
+						vector_types[i].abi_name);
     }
 }
 
@@ -3764,8 +3829,6 @@ register_vector_type (vector_type_index type)
 static void
 register_tuple_type (unsigned int num_vectors, vector_type_index type)
 {
-  tree tuple_type = lang_hooks.types.make_type (RECORD_TYPE);
-
   /* Work out the structure name.  */
   char buffer[sizeof ("svbfloat16x4_t")];
   const char *vector_type_name = vector_types[type].acle_name;
@@ -3792,37 +3855,13 @@ register_tuple_type (unsigned int num_vectors, vector_type_index type)
 	      && TYPE_MODE_RAW (array_type) == TYPE_MODE (array_type)
 	      && TYPE_ALIGN (array_type) == 128);
 
-  tree field = build_decl (input_location, FIELD_DECL,
-			   get_identifier ("__val"), array_type);
-  DECL_FIELD_CONTEXT (field) = tuple_type;
-  TYPE_FIELDS (tuple_type) = field;
+  tree tuple_type = wrap_type_in_struct (array_type);
   add_sve_type_attribute (tuple_type, num_vectors, 0, NULL, buffer);
-  make_type_sizeless (tuple_type);
-  layout_type (tuple_type);
   gcc_assert (VECTOR_MODE_P (TYPE_MODE (tuple_type))
 	      && TYPE_MODE_RAW (tuple_type) == TYPE_MODE (tuple_type)
 	      && TYPE_ALIGN (tuple_type) == 128);
 
-  tree decl = build_decl (input_location, TYPE_DECL,
-			  get_identifier (buffer), tuple_type);
-  TYPE_NAME (tuple_type) = decl;
-  TYPE_STUB_DECL (tuple_type) = decl;
-  lang_hooks.decls.pushdecl (decl);
-  /* ??? Undo the effect of set_underlying_type for C.  The C frontend
-     doesn't recognize DECL as a built-in because (as intended) the decl has
-     a real location instead of BUILTINS_LOCATION.  The frontend therefore
-     treats the decl like a normal C "typedef struct foo foo;", expecting
-     the type for tag "struct foo" to have a dummy unnamed TYPE_DECL instead
-     of the named one we attached above.  It then sets DECL_ORIGINAL_TYPE
-     on the supposedly unnamed decl, creating a circularity that upsets
-     dwarf2out.
-
-     We don't want to follow the normal C model and create "struct foo"
-     tags for tuple types since (a) the types are supposed to be opaque
-     and (b) they couldn't be defined as a real struct anyway.  Treating
-     the TYPE_DECLs as "typedef struct foo foo;" without creating
-     "struct foo" would lead to confusing error messages.  */
-  DECL_ORIGINAL_TYPE (decl) = NULL_TREE;
+  register_type_decl (tuple_type, buffer);
 
   acle_vector_types[num_vectors - 1][type] = tuple_type;
 }
@@ -3872,7 +3911,7 @@ handle_arm_sve_h ()
     {
       vector_type_index type = vector_type_index (type_i);
       register_vector_type (type);
-      if (type != VECTOR_TYPE_svbool_t)
+      if (scalar_types[type_i] != boolean_type_node)
 	for (unsigned int count = 2; count <= MAX_TUPLE_SIZE; ++count)
 	  register_tuple_type (count, type);
     }
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.def b/gcc/config/aarch64/aarch64-sve-builtins.def
index 5824dc797f9..297904f3e47 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.def
+++ b/gcc/config/aarch64/aarch64-sve-builtins.def
@@ -84,6 +84,7 @@ DEF_SVE_MODE (u64offset, none, svuint64_t, bytes)
 DEF_SVE_MODE (vnum, none, none, vectors)
 
 DEF_SVE_TYPE (svbool_t, 10, __SVBool_t, boolean_type_node)
+DEF_SVE_TYPE (svcount_t, 11, __SVCount_t, boolean_type_node)
 DEF_SVE_TYPE (svbfloat16_t, 14, __SVBfloat16_t, bfloat16_type_node)
 DEF_SVE_TYPE (svfloat16_t, 13, __SVFloat16_t, aarch64_fp16_type_node)
 DEF_SVE_TYPE (svfloat32_t, 13, __SVFloat32_t, float_type_node)
@@ -106,6 +107,7 @@ DEF_SVE_TYPE_SUFFIX (b16, svbool_t, bool, 16, VNx8BImode)
 DEF_SVE_TYPE_SUFFIX (b32, svbool_t, bool, 32, VNx4BImode)
 DEF_SVE_TYPE_SUFFIX (b64, svbool_t, bool, 64, VNx2BImode)
 DEF_SVE_TYPE_SUFFIX (bf16, svbfloat16_t, bfloat, 16, VNx8BFmode)
+DEF_SVE_TYPE_SUFFIX (c, svcount_t, count, 8, VNx16BImode)
 DEF_SVE_TYPE_SUFFIX (f16, svfloat16_t, float, 16, VNx8HFmode)
 DEF_SVE_TYPE_SUFFIX (f32, svfloat32_t, float, 32, VNx4SFmode)
 DEF_SVE_TYPE_SUFFIX (f64, svfloat64_t, float, 64, VNx2DFmode)
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.h b/gcc/config/aarch64/aarch64-sve-builtins.h
index 2eb432b7f3f..f56760ccdb6 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.h
+++ b/gcc/config/aarch64/aarch64-sve-builtins.h
@@ -152,11 +152,13 @@ enum predication_index
 };
 
 /* Classifies element types, based on type suffixes with the bit count
-   removed.  */
+   removed.  "count" isn't really an element type, but we pretend it is
+   for consistency.  */
 enum type_class_index
 {
   TYPE_bool,
   TYPE_bfloat,
+  TYPE_count,
   TYPE_float,
   TYPE_signed,
   TYPE_unsigned,
diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_1.C
index 36dab3c9b71..2ad0c7f9838 100644
--- a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_1.C
+++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_1.C
@@ -15,6 +15,7 @@ void f10(svfloat16_t) {}
 void f11(svfloat32_t) {}
 void f12(svfloat64_t) {}
 void f13(svbfloat16_t) {}
+void f14(svcount_t) {}
 
 /* { dg-final { scan-assembler "_Z2f1u10__SVBool_t:" } } */
 /* { dg-final { scan-assembler "_Z2f2u10__SVInt8_t:" } } */
@@ -29,3 +30,4 @@ void f13(svbfloat16_t) {}
 /* { dg-final { scan-assembler "_Z3f11u13__SVFloat32_t:" } } */
 /* { dg-final { scan-assembler "_Z3f12u13__SVFloat64_t:" } } */
 /* { dg-final { scan-assembler "_Z3f13u14__SVBfloat16_t:" } } */
+/* { dg-final { scan-assembler "_Z3f14u11__SVCount_t:" } } */
diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_2.C
index ad4aaee291f..c8bfcc5a9c2 100644
--- a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_2.C
+++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_2.C
@@ -13,6 +13,7 @@ void f10(__SVFloat16_t) {}
 void f11(__SVFloat32_t) {}
 void f12(__SVFloat64_t) {}
 void f13(__SVBfloat16_t) {}
+void f14(__SVCount_t) {}
 
 /* { dg-final { scan-assembler "_Z2f1u10__SVBool_t:" } } */
 /* { dg-final { scan-assembler "_Z2f2u10__SVInt8_t:" } } */
@@ -27,3 +28,4 @@ void f13(__SVBfloat16_t) {}
 /* { dg-final { scan-assembler "_Z3f11u13__SVFloat32_t:" } } */
 /* { dg-final { scan-assembler "_Z3f12u13__SVFloat64_t:" } } */
 /* { dg-final { scan-assembler "_Z3f13u14__SVBfloat16_t:" } } */
+/* { dg-final { scan-assembler "_Z3f14u11__SVCount_t:" } } */
diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/svcount_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/svcount_1.C
new file mode 100644
index 00000000000..9eac65aafff
--- /dev/null
+++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/svcount_1.C
@@ -0,0 +1,10 @@
+#include <arm_sve.h>
+
+svbool_t f1 (svcount_t x) { return x; } // { dg-error {cannot convert 'svcount_t' to 'svbool_t' in return} }
+svcount_t f2 (svbool_t x) { return x; } // { dg-error {cannot convert 'svbool_t' to 'svcount_t' in return} }
+void f3 (svbool_t *p, svcount_t x) { *p = x; } // { dg-error {cannot convert 'svcount_t' to 'svbool_t' in assignment} }
+void f4 (svcount_t *p, svbool_t x) { *p = x; } // { dg-error {cannot convert 'svbool_t' to 'svcount_t' in assignment} }
+svbool_t *f5 (svcount_t *p) { return p; } // { dg-error {cannot convert} }
+svcount_t *f6 (svbool_t *p) { return p; } // { dg-error {cannot convert} }
+svbool_t f7 (svcount_t x) { return (svbool_t) x; } // { dg-error {invalid cast from type 'svcount_t' to type 'svbool_t'} }
+svcount_t f8 (svbool_t x) { return (svcount_t) x; } // { dg-error {invalid cast from type 'svbool_t' to type 'svcount_t'} }
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_b.c
new file mode 100644
index 00000000000..57736ecb6c2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_b.c
@@ -0,0 +1,20 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** reinterpret_b_c_tied1:
+**	ret
+*/
+TEST_DUAL_P_REV (reinterpret_b_c_tied1, svbool_t, svcount_t,
+		 p0_res = svreinterpret_b_c (p0),
+		 p0_res = svreinterpret_b (p0))
+
+/*
+** reinterpret_b_c_untied:
+**	mov	p0\.b, p2\.b
+**	ret
+*/
+TEST_DUAL_P (reinterpret_b_c_untied, svbool_t, svcount_t,
+	     p0 = svreinterpret_b_c (p2),
+	     p0 = svreinterpret_b (p2))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h
index 84925b9bd48..5ce0be5947b 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h
@@ -85,6 +85,21 @@
     return z0_res;						\
   }
 
+#define TEST_DUAL_P(NAME, TYPE1, TYPE2, CODE1, CODE2)		\
+  PROTO (NAME, TYPE1, (TYPE1 p0, TYPE1 p1, TYPE2 p2, TYPE2 p3))	\
+  {								\
+    INVOKE (CODE1, CODE2);					\
+    return p0;							\
+  }
+
+#define TEST_DUAL_P_REV(NAME, TYPE1, TYPE2, CODE1, CODE2)	\
+  PROTO (NAME, TYPE1, (TYPE2 p0, TYPE2 p1, TYPE1 p2, TYPE1 p3))	\
+  {								\
+    TYPE1 p0_res;						\
+    INVOKE (CODE1, CODE2);					\
+    return p0_res;						\
+  }
+
 #define TEST_TRIPLE_Z(NAME, TYPE1, TYPE2, TYPE3, CODE1, CODE2)	\
   PROTO (NAME, TYPE1, (TYPE1 z0, TYPE1 z1, TYPE2 z2, TYPE2 z3,	\
 		       TYPE3 z4, TYPE3 z5,			\
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_1.c
index 784fdc317e6..564295a87f3 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_1.c
@@ -6,12 +6,14 @@
 struct s { signed char x; };
 
 svuint8_t
-f1 (svbool_t pg, signed char *s8_ptr, void *void_ptr, struct s *s_ptr,
+f1 (svbool_t pg, svcount_t pn,
+    signed char *s8_ptr, void *void_ptr, struct s *s_ptr,
     float *f32_ptr, _Complex float *cf32_ptr, int **ptr_ptr)
 {
   svld1 (pg); /* { dg-error {too few arguments to function 'svld1'} } */
   svld1 (pg, s8_ptr, 0); /* { dg-error {too many arguments to function 'svld1'} } */
   svld1 (0, s8_ptr); /* { dg-error {passing 'int' to argument 1 of 'svld1', which expects 'svbool_t'} } */
+  svld1 (pn, s8_ptr); /* { dg-error {passing 'svcount_t' to argument 1 of 'svld1', which expects 'svbool_t'} } */
   svld1 (pg, 0); /* { dg-error {passing 'int' to argument 2 of 'svld1', which expects a pointer type} } */
   svld1 (pg, (int32_t *) 0);
   svld1 (pg, void_ptr); /* { dg-error {passing 'void \*' to argument 2 of 'svld1', but 'void' is not a valid SVE element type} } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/svcount_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/svcount_1.c
new file mode 100644
index 00000000000..920d37e4ce7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/svcount_1.c
@@ -0,0 +1,10 @@
+#include <arm_sve.h>
+
+svbool_t f1 (svcount_t x) { return x; } /* { dg-error {incompatible types} } */
+svcount_t f2 (svbool_t x) { return x; } /* { dg-error {incompatible types} } */
+void f3 (svbool_t *p, svcount_t x) { *p = x; } /* { dg-error {incompatible types} } */
+void f4 (svcount_t *p, svbool_t x) { *p = x; } /* { dg-error {incompatible types} } */
+svbool_t *f5 (svcount_t *p) { return p; } /* { dg-error {incompatible return type} } */
+svcount_t *f6 (svbool_t *p) { return p; } /* { dg-error {incompatible return type} } */
+svbool_t f7 (svcount_t x) { return (svbool_t) x; } /* { dg-error {conversion to non-scalar} } */
+svcount_t f8 (svbool_t x) { return (svcount_t) x; } /* { dg-error {conversion to non-scalar} } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_convert_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_convert_1.c
index caa4e623d3f..ce06fcefa5f 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_convert_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_convert_1.c
@@ -4,7 +4,7 @@ void
 test (svbool_t pg, svint8_t s8, svuint8_t u8,
       svint16_t s16, svuint16_t u16, svint32_t s32, svuint32_t u32,
       svint64_t s64, svuint64_t u64, svfloat16_t f16, svfloat32_t f32,
-      svfloat64_t f64)
+      svfloat64_t f64, svcount_t pn)
 {
   svcvt_f64_x (pg); /* { dg-error {too few arguments to function 'svcvt_f64_x'} } */
   svcvt_f64_x (pg, s32, 0); /* { dg-error {too many arguments to function 'svcvt_f64_x'} } */
@@ -70,4 +70,10 @@ test (svbool_t pg, svint8_t s8, svuint8_t u8,
   svcvt_u16_x (pg, f16);
   svcvt_u16_x (pg, f32); /* { dg-error {'svcvt_u16_x' has no form that takes 'svfloat32_t' arguments} } */
   svcvt_u16_x (pg, f64); /* { dg-error {'svcvt_u16_x' has no form that takes 'svfloat64_t' arguments} } */
+
+  svreinterpret_b (pg); /* { dg-error {'svreinterpret_b' has no form that takes 'svbool_t' arguments} } */
+  svreinterpret_b (pn);
+
+  svreinterpret_c (pg);
+  svreinterpret_c (pn); /* { dg-error {'svreinterpret_c' has no form that takes 'svcount_t' arguments} } */
 }
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/attributes_7.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/attributes_7.c
index 95be60591fb..8e967f5a787 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/attributes_7.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/attributes_7.c
@@ -27,6 +27,7 @@ typedef svint32x2_t bad_type_6 __attribute__ ((arm_sve_vector_bits (N))); // { d
 typedef svint8_t bad_type_7 __attribute__ ((arm_sve_vector_bits (N))) __attribute__ ((arm_sve_vector_bits (N))); // { dg-error {'arm_sve_vector_bits' applied to type 'svint8_t __attribute__\(\(arm_sve_vector_bits\([0-9]+\)\)\)', which already has a size} }
 typedef fixed_bool_t bad_type_8 __attribute__ ((arm_sve_vector_bits (N))) __attribute__ ((arm_sve_vector_bits (N))); // { dg-error {'arm_sve_vector_bits' applied to type 'fixed_bool_t' {aka 'svbool_t __attribute__\(\(arm_sve_vector_bits\([0-9]+\)\)\)'}, which already has a size} }
 typedef gnu_int8_t bad_type_9 __attribute__ ((arm_sve_vector_bits (N))) __attribute__ ((arm_sve_vector_bits (N))); // { dg-error {'arm_sve_vector_bits' applied to non-SVE type 'gnu_int8_t'} }
+typedef svcount_t bad_type_10 __attribute__ ((arm_sve_vector_bits (N))); // { dg-error {'arm_sve_vector_bits' applied to non-vector type 'svcount_t'} }
 
 void
 f (int c)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_1.c
index 12ae7678948..c3ac692d7ff 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_1.c
@@ -4,6 +4,8 @@
 
 svbool_t ret_b (void) { return svptrue_b8 (); }
 
+svcount_t ret_c (svcount_t *ptr) { return *ptr; }
+
 svint8_t ret_s8 (void) { return svdup_s8 (0); }
 svint16_t ret_s16 (void) { return svdup_s16 (0); }
 svint32_t ret_s32 (void) { return svdup_s32 (0); }
@@ -58,6 +60,8 @@ svfloat64x4_t ret_f64x4 (void) { return svundef4_f64 (); }
 
 /* { dg-final { scan-assembler {\t\.variant_pcs\tret_b\n} } } */
 
+/* { dg-final { scan-assembler {\t\.variant_pcs\tret_c\n} } } */
+
 /* { dg-final { scan-assembler {\t\.variant_pcs\tret_s8\n} } } */
 /* { dg-final { scan-assembler {\t\.variant_pcs\tret_s16\n} } } */
 /* { dg-final { scan-assembler {\t\.variant_pcs\tret_s32\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_2.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_2.c
index 9f0741e3c26..c3508735fc4 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_2.c
@@ -4,6 +4,8 @@
 
 void fn_b (svbool_t x) {}
 
+void fn_c (svcount_t x) {}
+
 void fn_s8 (svint8_t x) {}
 void fn_s16 (svint16_t x) {}
 void fn_s32 (svint32_t x) {}
@@ -58,6 +60,8 @@ void fn_f64x4 (svfloat64x4_t x) {}
 
 /* { dg-final { scan-assembler {\t\.variant_pcs\tfn_b\n} } } */
 
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_c\n} } } */
+
 /* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s8\n} } } */
 /* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s16\n} } } */
 /* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s32\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_12.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_12.c
new file mode 100644
index 00000000000..a589484b394
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_12.c
@@ -0,0 +1,214 @@
+/* { dg-do compile } */
+/* { dg-options "-O -fno-stack-clash-protection -g" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_sve.h>
+
+/*
+** callee_1:
+**	mov	p0\.b, p3\.b
+**	ret
+*/
+svcount_t __attribute__ ((noipa))
+callee_1 (svcount_t p0, svcount_t p1, svcount_t p2, svcount_t p3)
+{
+  return p3;
+}
+
+/*
+** callee_2:
+**	str	p0, \[x0\]
+**	str	p1, \[x1\]
+**	str	p2, \[x2\]
+**	str	p3, \[x3\]
+**	ret
+*/
+void __attribute__ ((noipa))
+callee_2 (svcount_t p0, svcount_t p1, svcount_t p2, svcount_t p3,
+	  svcount_t *ptr0, svcount_t *ptr1, svcount_t *ptr2, svcount_t *ptr3)
+{
+  *ptr0 = p0;
+  *ptr1 = p1;
+  *ptr2 = p2;
+  *ptr3 = p3;
+}
+
+/*
+** callee_3:
+**	str	p3, \[x0\]
+**	ret
+*/
+void __attribute__ ((noipa))
+callee_3 (svbool_t p0, svbool_t p1, svbool_t p2, svcount_t p3, svcount_t *ptr)
+{
+  *ptr = p3;
+}
+
+/*
+** callee_4:
+**	str	p3, \[x0\]
+**	ret
+*/
+void __attribute__ ((noipa))
+callee_4 (svcount_t p0, svcount_t p1, svcount_t p2, svbool_t p3, svbool_t *ptr)
+{
+  *ptr = p3;
+}
+
+/*
+** callee_5:
+**	ldr	p0, \[x0\]
+**	ret
+*/
+svcount_t __attribute__ ((noipa))
+callee_5 (svcount_t p0, svcount_t p1, svcount_t p2, svcount_t p3,
+	  svcount_t p4)
+{
+  return p4;
+}
+
+/*
+** callee_6:
+**	ldr	p0, \[x0\]
+**	ret
+*/
+svcount_t __attribute__ ((noipa))
+callee_6 (svcount_t p0, svcount_t p1, svcount_t p2, svcount_t p3,
+	  svcount_t p4, int x1, int x2, int x3, int x4, int x5, int x6, int x7,
+	  int x8)
+{
+  return p4;
+}
+
+/*
+** callee_7:
+**	ldr	(x[0-9]+), \[sp\]
+**	ldr	p0, \[\1\]
+**	ret
+*/
+svcount_t __attribute__ ((noipa))
+callee_7 (svcount_t p0, svcount_t p1, svcount_t p2, svcount_t p3,
+	  int x0, int x1, int x2, int x3, int x4, int x5, int x6, int x7,
+	  svcount_t p4)
+{
+  return p4;
+}
+
+/*
+** caller_1:
+**	...
+**	ldr	p0, \[x0\]
+**	ldr	p1, \[x1\]
+**	ldr	p2, \[x2\]
+**	ldr	p3, \[x3\]
+**	bl	callee_1
+**	...
+**	str	p0, .*
+**	...
+*/
+void __attribute__ ((noipa))
+caller_1 (volatile svcount_t *ptr0, volatile svcount_t *ptr1,
+	  volatile svcount_t *ptr2, volatile svcount_t *ptr3,
+	  svcount_t *ptr4)
+{
+  svcount_t p0 = *ptr0;
+  svcount_t p1 = *ptr1;
+  svcount_t p2 = *ptr2;
+  svcount_t p3 = *ptr3;
+  *ptr4 = callee_1 (p0, p1, p2, p3);
+}
+
+/*
+** caller_3:
+**	...
+**	ldr	p0, \[x1\]
+**	ldr	p1, \[x2\]
+**	ldr	p2, \[x3\]
+**	ldr	p3, \[x4\]
+**	bl	callee_3
+**	...
+*/
+void __attribute__ ((noipa))
+caller_3 (svcount_t *ptr,
+	  volatile svbool_t *ptr0, volatile svbool_t *ptr1,
+	  volatile svbool_t *ptr2, volatile svcount_t *ptr3)
+{
+  svbool_t p0 = *ptr0;
+  svbool_t p1 = *ptr1;
+  svbool_t p2 = *ptr2;
+  svcount_t p3 = *ptr3;
+  callee_3 (p0, p1, p2, p3, ptr);
+}
+
+/*
+** caller_4:
+**	...
+**	ldr	p0, \[x1\]
+**	ldr	p1, \[x2\]
+**	ldr	p2, \[x3\]
+**	ldr	p3, \[x4\]
+**	bl	callee_4
+**	...
+*/
+void __attribute__ ((noipa))
+caller_4 (svbool_t *ptr,
+	  volatile svcount_t *ptr0, volatile svcount_t *ptr1,
+	  volatile svcount_t *ptr2, volatile svbool_t *ptr3)
+{
+  svcount_t p0 = *ptr0;
+  svcount_t p1 = *ptr1;
+  svcount_t p2 = *ptr2;
+  svbool_t p3 = *ptr3;
+  callee_4 (p0, p1, p2, p3, ptr);
+}
+
+/*
+** caller_5:
+**	...
+**	ldr	p0, \[x1\]
+**	ldr	p1, \[x2\]
+**	ldr	p2, \[x3\]
+**	ldr	p3, \[x4\]
+**	...
+**	mov	x0, sp
+**	...
+**	str	p[0-9]+, \[(?:x0|sp)\]
+**	...
+**	bl	callee_5
+**	...
+**	str	p0, .*
+**	...
+*/
+void __attribute__ ((noipa))
+caller_5 (svcount_t *ptr,
+	  volatile svcount_t *ptr0, volatile svcount_t *ptr1,
+	  volatile svcount_t *ptr2, volatile svcount_t *ptr3,
+	  volatile svcount_t *ptr4)
+{
+  svcount_t p0 = *ptr0;
+  svcount_t p1 = *ptr1;
+  svcount_t p2 = *ptr2;
+  svcount_t p3 = *ptr3;
+  svcount_t p4 = *ptr4;
+  *ptr = callee_5 (p0, p1, p2, p3, p4);
+}
+
+/*
+** caller_7:
+**	...
+**	ldr	(p[0-9]+), \[x2\]
+**	...
+**	str	\1, \[(x[0-9]+)\]
+**	...
+**	str	\2, \[sp\]
+**	...
+**	bl	callee_7
+**	...
+*/
+void __attribute__ ((noipa))
+caller_7 (svcount_t *ptr, volatile svcount_t *ptr0, volatile svcount_t *ptr1)
+{
+  svcount_t p0 = *ptr0;
+  svcount_t p1 = *ptr1;
+  *ptr = callee_7 (p0, p0, p0, p0, 0, 0, 0, 0, 0, 0, 0, 0, p1);
+}
-- 
2.25.1



* [PATCH 3/5] aarch64: Add svboolx2_t
  2023-11-17 17:37 aarch64: Add support for SME2 Richard Sandiford
  2023-11-17 17:38 ` [PATCH 1/5] aarch64: Add +sme2 Richard Sandiford
  2023-11-17 17:39 ` [PATCH 2/5] aarch64: Add svcount_t Richard Sandiford
@ 2023-11-17 17:39 ` Richard Sandiford
  2023-11-17 17:39 ` [PATCH 4/5] aarch64: Add ZT0 Richard Sandiford
  2023-11-17 17:42 ` [PATCH 5/5] aarch64: Add support for SME2 intrinsics Richard Sandiford
  4 siblings, 0 replies; 6+ messages in thread
From: Richard Sandiford @ 2023-11-17 17:39 UTC (permalink / raw)
  To: gcc-patches

SME2 has some instructions that operate on pairs of predicates.
The SME2 ACLE defines an svboolx2_t type for the associated
intrinsics.

The patch uses a double-width predicate mode, VNx32BI, to represent
the contents, similarly to how data vector tuples work.  At present
there doesn't seem to be any need to define pairs for VNx2BI,
VNx4BI and VNx8BI.

We already supported pairs of svbool_ts at the PCS level, as part
of a more general framework.  All that changes on the PCS side is
that we now have an associated mode.
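
As a usage sketch (svcreate2 and svget2 for svboolx2_t are ACLE
intrinsics that only land later in the series, so treat the names
here as assumptions):

  #include <arm_sve.h>

  /* Pack two predicates into one VNx32BI-backed pair and pull the
     second half back out.  */
  svbool_t
  second (svbool_t pg0, svbool_t pg1)
  {
    svboolx2_t pair = svcreate2 (pg0, pg1);  /* assumed intrinsic */
    return svget2 (pair, 1);                 /* assumed intrinsic */
  }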

gcc/
	* config/aarch64/aarch64-modes.def (VNx32BI): New mode.
	* config/aarch64/aarch64-protos.h (aarch64_split_double_move): Declare.
	* config/aarch64/aarch64-sve-builtins.cc
	(register_tuple_type): Handle tuples of predicates.
	(handle_arm_sve_h): Define svboolx2_t as a pair of two svbool_ts.
	* config/aarch64/aarch64-sve.md (movvnx32bi): New insn.
	* config/aarch64/aarch64.cc
	(pure_scalable_type_info::piece::get_rtx): Use VNx32BI for pairs
	of predicates.
	(pure_scalable_type_info::add_piece): Don't try to form pairs of
	predicates.
	(VEC_STRUCT): Generalize comment.
	(aarch64_classify_vector_mode): Handle VNx32BI.
	(aarch64_array_mode): Likewise.  Return BLKmode for arrays of
	predicates that have no associated mode, rather than allowing
	an integer mode to be chosen.
	(aarch64_hard_regno_nregs): Handle VNx32BI.
	(aarch64_hard_regno_mode_ok): Likewise.
	(aarch64_split_double_move): New function, split out from...
	(aarch64_split_128bit_move): ...here.
	(aarch64_ptrue_reg): Tighten assert to aarch64_sve_pred_mode_p.
	(aarch64_pfalse_reg): Likewise.
	(aarch64_sve_same_pred_for_ptest_p): Likewise.
	(aarch64_sme_mode_switch_regs::add_reg): Handle VNx32BI.
	(aarch64_expand_mov_immediate): Restrict handling of boolean vector
	constants to single-predicate modes.
	(aarch64_classify_address): Handle VNx32BI, ensuring that both halves
	can be addressed.
	(aarch64_class_max_nregs): Handle VNx32BI.
	(aarch64_member_type_forces_blk): Don't force BLKmode for svboolx2_t.
	(aarch64_simd_valid_immediate): Allow all-zeros and all-ones for
	VNx32BI.
	(aarch64_mov_operand_p): Restrict predicate constant canonicalization
	to single-predicate modes.
	(aarch64_evpc_ext): Generalize exclusion to all predicate modes.
	(aarch64_evpc_rev_local, aarch64_evpc_dup): Likewise.
	* config/aarch64/constraints.md (Up2): New constraint.

gcc/testsuite/
	* gcc.target/aarch64/sve/pcs/struct_3_128.c (test_nonpst3): Adjust
	stack offsets.
	(ret_nonpst3): Remove XFAIL.
	* gcc.target/aarch64/sve/acle/general-c/svboolx2_1.c: New test.
---
---
 gcc/config/aarch64/aarch64-modes.def          |   3 +
 gcc/config/aarch64/aarch64-protos.h           |   1 +
 gcc/config/aarch64/aarch64-sve-builtins.cc    |  18 ++-
 gcc/config/aarch64/aarch64-sve.md             |  22 +++
 gcc/config/aarch64/aarch64.cc                 | 136 ++++++++++++------
 gcc/config/aarch64/constraints.md             |   4 +
 .../aarch64/sve/acle/general-c/svboolx2_1.c   | 135 +++++++++++++++++
 .../gcc.target/aarch64/sve/pcs/struct_3_128.c |   6 +-
 8 files changed, 272 insertions(+), 53 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/svboolx2_1.c

diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def
index a3efc5b8484..ffca5517dec 100644
--- a/gcc/config/aarch64/aarch64-modes.def
+++ b/gcc/config/aarch64/aarch64-modes.def
@@ -48,16 +48,19 @@ ADJUST_FLOAT_FORMAT (HF, &ieee_half_format);
 
 /* Vector modes.  */
 
+VECTOR_BOOL_MODE (VNx32BI, 32, BI, 4);
 VECTOR_BOOL_MODE (VNx16BI, 16, BI, 2);
 VECTOR_BOOL_MODE (VNx8BI, 8, BI, 2);
 VECTOR_BOOL_MODE (VNx4BI, 4, BI, 2);
 VECTOR_BOOL_MODE (VNx2BI, 2, BI, 2);
 
+ADJUST_NUNITS (VNx32BI, aarch64_sve_vg * 16);
 ADJUST_NUNITS (VNx16BI, aarch64_sve_vg * 8);
 ADJUST_NUNITS (VNx8BI, aarch64_sve_vg * 4);
 ADJUST_NUNITS (VNx4BI, aarch64_sve_vg * 2);
 ADJUST_NUNITS (VNx2BI, aarch64_sve_vg);
 
+ADJUST_ALIGNMENT (VNx32BI, 2);
 ADJUST_ALIGNMENT (VNx16BI, 2);
 ADJUST_ALIGNMENT (VNx8BI, 2);
 ADJUST_ALIGNMENT (VNx4BI, 2);
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 3afb521c55c..25e2375c4fa 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -948,6 +948,7 @@ rtx aarch64_simd_expand_builtin (int, tree, rtx);
 void aarch64_simd_lane_bounds (rtx, HOST_WIDE_INT, HOST_WIDE_INT, const_tree);
 rtx aarch64_endian_lane_rtx (machine_mode, unsigned int);
 
+void aarch64_split_double_move (rtx, rtx, machine_mode);
 void aarch64_split_128bit_move (rtx, rtx);
 
 bool aarch64_split_128bit_move_p (rtx, rtx);
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc
index a6973206951..7cc5bdd8060 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
@@ -3829,6 +3829,9 @@ register_vector_type (vector_type_index type)
 static void
 register_tuple_type (unsigned int num_vectors, vector_type_index type)
 {
+  tree vector_type = acle_vector_types[0][type];
+  bool is_pred = GET_MODE_CLASS (TYPE_MODE (vector_type)) == MODE_VECTOR_BOOL;
+
   /* Work out the structure name.  */
   char buffer[sizeof ("svbfloat16x4_t")];
   const char *vector_type_name = vector_types[type].acle_name;
@@ -3849,17 +3852,19 @@ register_tuple_type (unsigned int num_vectors, vector_type_index type)
 
      Using arrays simplifies the handling of svget and svset for variable
      arguments.  */
-  tree vector_type = acle_vector_types[0][type];
   tree array_type = build_array_type_nelts (vector_type, num_vectors);
   gcc_assert (VECTOR_MODE_P (TYPE_MODE (array_type))
 	      && TYPE_MODE_RAW (array_type) == TYPE_MODE (array_type)
-	      && TYPE_ALIGN (array_type) == 128);
+	      && TYPE_ALIGN (array_type) == (is_pred ? 16 : 128));
 
   tree tuple_type = wrap_type_in_struct (array_type);
-  add_sve_type_attribute (tuple_type, num_vectors, 0, NULL, buffer);
+  if (is_pred)
+    add_sve_type_attribute (tuple_type, 0, num_vectors, NULL, buffer);
+  else
+    add_sve_type_attribute (tuple_type, num_vectors, 0, NULL, buffer);
   gcc_assert (VECTOR_MODE_P (TYPE_MODE (tuple_type))
 	      && TYPE_MODE_RAW (tuple_type) == TYPE_MODE (tuple_type)
-	      && TYPE_ALIGN (tuple_type) == 128);
+	      && TYPE_ALIGN (tuple_type) == TYPE_ALIGN (array_type));
 
   register_type_decl (tuple_type, buffer);
 
@@ -3911,9 +3916,10 @@ handle_arm_sve_h ()
     {
       vector_type_index type = vector_type_index (type_i);
       register_vector_type (type);
-      if (scalar_types[type_i] != boolean_type_node)
+      if (type != VECTOR_TYPE_svcount_t)
 	for (unsigned int count = 2; count <= MAX_TUPLE_SIZE; ++count)
-	  register_tuple_type (count, type);
+	  if (type != VECTOR_TYPE_svbool_t || count == 2)
+	    register_tuple_type (count, type);
     }
 
   /* Define the enums.  */
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 3f48e4cdf26..3729c67eb69 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -33,6 +33,7 @@
 ;; ---- Moves of single vectors
 ;; ---- Moves of multiple vectors
 ;; ---- Moves of predicates
+;; ---- Moves of multiple predicates
 ;; ---- Moves relating to the FFR
 ;;
 ;; == Loads
@@ -1069,6 +1070,27 @@ (define_insn_and_rewrite "*aarch64_sve_ptrue<mode>_ptest"
   }
 )
 
+;; -------------------------------------------------------------------------
+;; ---- Moves of multiple predicates
+;; -------------------------------------------------------------------------
+
+(define_insn_and_split "movvnx32bi"
+  [(set (match_operand:VNx32BI 0 "nonimmediate_operand")
+	(match_operand:VNx32BI 1 "aarch64_mov_operand"))]
+  "TARGET_SVE"
+  {@ [ cons: =0 , 1   ]
+     [ Upa      , Upa ] #
+     [ Upa      , m   ] #
+     [ m        , Upa ] #
+  }
+  "&& reload_completed"
+  [(const_int 0)]
+  {
+    aarch64_split_double_move (operands[0], operands[1], VNx16BImode);
+    DONE;
+  }
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- Moves relating to the FFR
 ;; -------------------------------------------------------------------------
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index c8f99d5c991..36576159b4f 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -3220,7 +3220,7 @@ pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
   if (num_zr > 0 && num_pr == 0)
     return gen_rtx_REG (mode, first_zr);
 
-  if (num_zr == 0 && num_pr == 1)
+  if (num_zr == 0 && num_pr <= 2)
     return gen_rtx_REG (mode, first_pr);
 
   gcc_unreachable ();
@@ -3443,6 +3443,7 @@ pure_scalable_type_info::add_piece (const piece &p)
       gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
       unsigned int nelems1, nelems2;
       if (prev.orig_mode == p.orig_mode
+	  && GET_MODE_CLASS (p.orig_mode) != MODE_VECTOR_BOOL
 	  && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
 	  && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
 				  GET_MODE_NUNITS (p.orig_mode), &nelems1)
@@ -3744,8 +3745,7 @@ aarch64_sve_pred_mode_p (machine_mode mode)
 const unsigned int VEC_ADVSIMD  = 1;
 const unsigned int VEC_SVE_DATA = 2;
 const unsigned int VEC_SVE_PRED = 4;
-/* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
-   a structure of 2, 3 or 4 vectors.  */
+/* Indicates a structure of 2, 3 or 4 vectors or predicates.  */
 const unsigned int VEC_STRUCT   = 8;
 /* Can be used in combination with VEC_SVE_DATA to indicate that the
    vector has fewer significant bytes than a full SVE vector.  */
@@ -3908,6 +3908,9 @@ aarch64_classify_vector_mode (machine_mode mode, bool any_target_p = false)
     case E_V2DFmode:
       return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD : 0;
 
+    case E_VNx32BImode:
+      return TARGET_SVE ? VEC_SVE_PRED | VEC_STRUCT : 0;
+
     default:
       return 0;
     }
@@ -4035,12 +4038,24 @@ aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
 static opt_machine_mode
 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
 {
-  if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
-      && IN_RANGE (nelems, 2, 4))
+  if (TARGET_SVE && GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
+    {
+      /* Use VNx32BI for pairs of predicates, but explicitly reject giving
+	 a mode to other array sizes.  Using integer modes requires a round
+	 trip through memory and generates terrible code.  */
+      if (nelems == 1)
+	return mode;
+      if (mode == VNx16BImode && nelems == 2)
+	return VNx32BImode;
+      return BLKmode;
+    }
+
+  auto flags = aarch64_classify_vector_mode (mode);
+  if (flags == VEC_SVE_DATA && IN_RANGE (nelems, 2, 4))
     return aarch64_sve_data_mode (GET_MODE_INNER (mode),
 				  GET_MODE_NUNITS (mode) * nelems);
-  if (aarch64_classify_vector_mode (mode) == VEC_ADVSIMD
-      && IN_RANGE (nelems, 2, 4))
+
+  if (flags == VEC_ADVSIMD && IN_RANGE (nelems, 2, 4))
     return aarch64_advsimd_vector_array_mode (mode, nelems);
 
   return opt_machine_mode ();
@@ -4260,13 +4275,17 @@ aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
 	  return GET_MODE_SIZE (mode).to_constant () / 8;
 	return CEIL (lowest_size, UNITS_PER_VREG);
       }
+
     case PR_REGS:
     case PR_LO_REGS:
     case PR_HI_REGS:
+      return mode == VNx32BImode ? 2 : 1;
+
     case FFR_REGS:
     case PR_AND_FFR_REGS:
     case FAKE_REGS:
       return 1;
+
     default:
       return CEIL (lowest_size, UNITS_PER_WORD);
     }
@@ -4290,9 +4309,12 @@ aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
     return mode == DImode;
 
   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
-  if (vec_flags & VEC_SVE_PRED)
+  if (vec_flags == VEC_SVE_PRED)
     return pr_or_ffr_regnum_p (regno);
 
+  if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
+    return PR_REGNUM_P (regno);
+
   if (pr_or_ffr_regnum_p (regno))
     return false;
 
@@ -5374,6 +5396,33 @@ aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
     emit_move_insn (dest, tmp);
 }
 
+/* Split a move from SRC to DST into two moves of mode SINGLE_MODE.  */
+
+void
+aarch64_split_double_move (rtx dst, rtx src, machine_mode single_mode)
+{
+  machine_mode mode = GET_MODE (dst);
+
+  rtx dst0 = simplify_gen_subreg (single_mode, dst, mode, 0);
+  rtx dst1 = simplify_gen_subreg (single_mode, dst, mode,
+				  GET_MODE_SIZE (single_mode));
+  rtx src0 = simplify_gen_subreg (single_mode, src, mode, 0);
+  rtx src1 = simplify_gen_subreg (single_mode, src, mode,
+				  GET_MODE_SIZE (single_mode));
+
+  /* At most one pairing may overlap.  */
+  if (reg_overlap_mentioned_p (dst0, src1))
+    {
+      aarch64_emit_move (dst1, src1);
+      aarch64_emit_move (dst0, src0);
+    }
+  else
+    {
+      aarch64_emit_move (dst0, src0);
+      aarch64_emit_move (dst1, src1);
+    }
+}
+
 /* Split a 128-bit move operation into two 64-bit move operations,
    taking care to handle partial overlap of register to register
    copies.  Special cases are needed when moving between GP regs and
@@ -5383,9 +5432,6 @@ aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
 void
 aarch64_split_128bit_move (rtx dst, rtx src)
 {
-  rtx dst_lo, dst_hi;
-  rtx src_lo, src_hi;
-
   machine_mode mode = GET_MODE (dst);
 
   gcc_assert (mode == TImode || mode == TFmode || mode == TDmode);
@@ -5400,8 +5446,8 @@ aarch64_split_128bit_move (rtx dst, rtx src)
       /* Handle FP <-> GP regs.  */
       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
 	{
-	  src_lo = gen_lowpart (word_mode, src);
-	  src_hi = gen_highpart (word_mode, src);
+	  rtx src_lo = gen_lowpart (word_mode, src);
+	  rtx src_hi = gen_highpart (word_mode, src);
 
 	  emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
 	  emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
@@ -5409,8 +5455,8 @@ aarch64_split_128bit_move (rtx dst, rtx src)
 	}
       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
 	{
-	  dst_lo = gen_lowpart (word_mode, dst);
-	  dst_hi = gen_highpart (word_mode, dst);
+	  rtx dst_lo = gen_lowpart (word_mode, dst);
+	  rtx dst_hi = gen_highpart (word_mode, dst);
 
 	  emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
 	  emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
@@ -5418,22 +5464,7 @@ aarch64_split_128bit_move (rtx dst, rtx src)
 	}
     }
 
-  dst_lo = gen_lowpart (word_mode, dst);
-  dst_hi = gen_highpart (word_mode, dst);
-  src_lo = gen_lowpart (word_mode, src);
-  src_hi = gen_highpart_mode (word_mode, mode, src);
-
-  /* At most one pairing may overlap.  */
-  if (reg_overlap_mentioned_p (dst_lo, src_hi))
-    {
-      aarch64_emit_move (dst_hi, src_hi);
-      aarch64_emit_move (dst_lo, src_lo);
-    }
-  else
-    {
-      aarch64_emit_move (dst_lo, src_lo);
-      aarch64_emit_move (dst_hi, src_hi);
-    }
+  aarch64_split_double_move (dst, src, word_mode);
 }
 
 /* Return true if we should split a move from 128-bit value SRC
@@ -5699,7 +5730,7 @@ aarch64_ptrue_all (unsigned int elt_size)
 rtx
 aarch64_ptrue_reg (machine_mode mode)
 {
-  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
+  gcc_assert (aarch64_sve_pred_mode_p (mode));
   rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
   return gen_lowpart (mode, reg);
 }
@@ -5709,7 +5740,7 @@ aarch64_ptrue_reg (machine_mode mode)
 rtx
 aarch64_pfalse_reg (machine_mode mode)
 {
-  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
+  gcc_assert (aarch64_sve_pred_mode_p (mode));
   rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
   return gen_lowpart (mode, reg);
 }
@@ -5725,7 +5756,7 @@ bool
 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
 {
   machine_mode mode = GET_MODE (pred1[0]);
-  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
+  gcc_assert (aarch64_sve_pred_mode_p (mode)
 	      && mode == GET_MODE (pred2[0])
 	      && aarch64_sve_ptrue_flag (pred1[1], SImode)
 	      && aarch64_sve_ptrue_flag (pred2[1], SImode));
@@ -7198,7 +7229,9 @@ aarch64_sme_mode_switch_regs::add_reg (machine_mode mode, unsigned int regno)
       machine_mode submode = mode;
       if (vec_flags & VEC_STRUCT)
 	{
-	  if (vec_flags & VEC_SVE_DATA)
+	  if (vec_flags & VEC_SVE_PRED)
+	    submode = VNx16BImode;
+	  else if (vec_flags & VEC_SVE_DATA)
 	    submode = SVE_BYTE_MODE;
 	  else if (vec_flags & VEC_PARTIAL)
 	    submode = V8QImode;
@@ -7207,7 +7240,7 @@ aarch64_sme_mode_switch_regs::add_reg (machine_mode mode, unsigned int regno)
 	}
       save_location loc;
       loc.reg = gen_rtx_REG (submode, regno);
-      if (vec_flags == VEC_SVE_PRED)
+      if (vec_flags & VEC_SVE_PRED)
 	{
 	  gcc_assert (PR_REGNUM_P (regno));
 	  loc.group = MEM_SVE_PRED;
@@ -8219,7 +8252,7 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
 
   if (!CONST_INT_P (imm))
     {
-      if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
+      if (aarch64_sve_pred_mode_p (mode))
 	{
 	  /* Only the low bit of each .H, .S and .D element is defined,
 	     so we can set the upper bits to whatever we like.  If the
@@ -12720,6 +12753,15 @@ aarch64_classify_address (struct aarch64_address_info *info,
 	  if (vec_flags == VEC_SVE_PRED)
 	    return offset_9bit_signed_scaled_p (mode, offset);
 
+	  if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
+	    {
+	      poly_int64 end_offset = (offset
+				       + GET_MODE_SIZE (mode)
+				       - BYTES_PER_SVE_PRED);
+	      return (offset_9bit_signed_scaled_p (VNx16BImode, end_offset)
+		      && offset_9bit_signed_scaled_p (VNx16BImode, offset));
+	    }
+
 	  if (load_store_pair_p)
 	    return ((known_eq (GET_MODE_SIZE (mode), 4)
 		     || known_eq (GET_MODE_SIZE (mode), 8)
@@ -15020,10 +15062,12 @@ aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
 	      ? CEIL (lowest_size, UNITS_PER_VREG)
 	      : CEIL (lowest_size, UNITS_PER_WORD));
 
-    case STACK_REG:
     case PR_REGS:
     case PR_LO_REGS:
     case PR_HI_REGS:
+      return mode == VNx32BImode ? 2 : 1;
+
+    case STACK_REG:
     case FFR_REGS:
     case PR_AND_FFR_REGS:
     case FAKE_REGS:
@@ -22630,11 +22674,11 @@ aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
      an ARRAY_TYPE.  In both cases we're interested in the TREE_TYPE.  */
   const_tree type = TREE_TYPE (field_or_array);
 
-  /* Assign BLKmode to anything that contains multiple SVE predicates.
+  /* Assign BLKmode to anything that contains more than 2 SVE predicates.
      For structures, the "multiple" case is indicated by MODE being
      VOIDmode.  */
   unsigned int num_zr, num_pr;
-  if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr != 0)
+  if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr > 2)
     {
       if (TREE_CODE (field_or_array) == ARRAY_TYPE)
 	return !simple_cst_equal (TYPE_SIZE (field_or_array),
@@ -23874,6 +23918,9 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
   if ((vec_flags & VEC_ADVSIMD) && !TARGET_SIMD)
     return false;
 
+  if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
+    return op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode);
+
   if (vec_flags & VEC_SVE_PRED)
     return aarch64_sve_pred_valid_immediate (op, info);
 
@@ -24047,7 +24094,8 @@ aarch64_mov_operand_p (rtx x, machine_mode mode)
 	 force everything to have a canonical form.  */
       if (!lra_in_progress
 	  && !reload_completed
-	  && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
+	  && aarch64_sve_pred_mode_p (GET_MODE (x))
+	  && known_eq (GET_MODE_SIZE (GET_MODE (x)), BYTES_PER_SVE_PRED)
 	  && GET_MODE (x) != VNx16BImode)
 	return false;
 
@@ -26623,7 +26671,7 @@ aarch64_evpc_ext (struct expand_vec_perm_d *d)
 
   /* The first element always refers to the first vector.
      Check if the extracted indices are increasing by one.  */
-  if (d->vec_flags == VEC_SVE_PRED
+  if ((d->vec_flags & VEC_SVE_PRED)
       || !d->perm[0].is_constant (&location)
       || !d->perm.series_p (0, 1, location, 1))
     return false;
@@ -26667,7 +26715,7 @@ aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
   unsigned int i, size, unspec;
   machine_mode pred_mode;
 
-  if (d->vec_flags == VEC_SVE_PRED
+  if ((d->vec_flags & VEC_SVE_PRED)
       || !d->one_vector_p
       || !d->perm[0].is_constant (&diff)
       || !diff)
@@ -26748,7 +26796,7 @@ aarch64_evpc_dup (struct expand_vec_perm_d *d)
   machine_mode vmode = d->vmode;
   rtx lane;
 
-  if (d->vec_flags == VEC_SVE_PRED
+  if ((d->vec_flags & VEC_SVE_PRED)
       || d->perm.encoding ().encoded_nelts () != 1
       || !d->perm[0].is_constant (&elt))
     return false;
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 38ed927ec14..78a62af1abf 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -42,6 +42,10 @@ (define_register_constraint "w" "FP_REGS"
 (define_register_constraint "Upa" "PR_REGS"
   "SVE predicate registers p0 - p15.")
 
+(define_register_constraint "Up2" "PR_REGS"
+  "An even SVE predicate register, p0 - p14."
+  "regno % 2 == 0")
+
 (define_register_constraint "Upl" "PR_LO_REGS"
   "SVE predicate registers p0 - p7.")
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/svboolx2_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/svboolx2_1.c
new file mode 100644
index 00000000000..877b1849986
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/svboolx2_1.c
@@ -0,0 +1,135 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_sve.h>
+
+/*
+** ret_p0:
+**	ret
+*/
+svboolx2_t
+ret_p0 (svboolx2_t p0)
+{
+  return p0;
+}
+
+/*
+** ret_p1:
+**	mov	p0\.b, p1\.b
+**	mov	p1\.b, p2\.b
+**	ret
+*/
+svboolx2_t
+ret_p1 (svbool_t p0, svboolx2_t p1)
+{
+  return p1;
+}
+
+/*
+** ret_p2:
+** (
+**	mov	p0\.b, p2\.b
+**	mov	p1\.b, p3\.b
+** |
+**	mov	p1\.b, p3\.b
+**	mov	p0\.b, p2\.b
+** )
+**	ret
+*/
+svboolx2_t
+ret_p2 (svboolx2_t p0, svboolx2_t p2)
+{
+  return p2;
+}
+
+/*
+** ret_mem:
+** (
+**	ldr	p0, \[x0\]
+**	ldr	p1, \[x0, #1, mul vl\]
+** |
+**	ldr	p1, \[x0, #1, mul vl\]
+**	ldr	p0, \[x0\]
+** )
+**	ret
+*/
+svboolx2_t
+ret_mem (svboolx2_t p0, svbool_t p2, svboolx2_t mem)
+{
+  return mem;
+}
+
+/*
+** load:
+** (
+**	ldr	p0, \[x0\]
+**	ldr	p1, \[x0, #1, mul vl\]
+** |
+**	ldr	p1, \[x0, #1, mul vl\]
+**	ldr	p0, \[x0\]
+** )
+**	ret
+*/
+svboolx2_t
+load (svboolx2_t *ptr)
+{
+  return *ptr;
+}
+
+/*
+** store:
+** (
+**	str	p1, \[x0\]
+**	str	p2, \[x0, #1, mul vl\]
+** |
+**	str	p2, \[x0, #1, mul vl\]
+**	str	p1, \[x0\]
+** )
+**	ret
+*/
+void
+store (svbool_t p0, svboolx2_t p1, svboolx2_t *ptr)
+{
+  *ptr = p1;
+}
+
+/*
+** upa_p1:
+**	ret
+*/
+void
+upa_p1 (svbool_t p0, svboolx2_t p1)
+{
+  asm volatile ("" :: "Upa" (p1));
+}
+
+/*
+** up2_p1:
+** (
+**	mov	p0\.b, p1\.b
+**	mov	p1\.b, p2\.b
+** |
+**	mov	p3\.b, p2\.b
+**	mov	p2\.b, p1\.b
+** )
+**	ret
+*/
+void
+up2_p1 (svbool_t p0, svboolx2_t p1)
+{
+  asm volatile ("" :: "Up2" (p1));
+}
+
+/*
+** p1_to_p2:
+**	mov	p3\.b, p2\.b
+**	mov	p2\.b, p1\.b
+**	ret
+*/
+void
+p1_to_p2 (svbool_t p0, svboolx2_t p1)
+{
+  register svboolx2_t p2 asm ("p2") = p1;
+  asm volatile ("" :: "Up2" (p2));
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/struct_3_128.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/struct_3_128.c
index f6d78469aa5..b8fe86058a9 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/struct_3_128.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/struct_3_128.c
@@ -908,8 +908,8 @@ SEL2 (union, nonpst3)
 /*
 ** test_nonpst3:
 **	sub	sp, sp, #16
-**	str	w0, \[sp, #?8\]
-**	ldr	p0, \[sp, #4, mul vl\]
+**	str	w0, \[sp, #?12\]
+**	ldr	p0, \[sp, #6, mul vl\]
 **	add	sp, sp, #?16
 **	ret
 */
@@ -921,7 +921,7 @@ test_nonpst3 (union nonpst3 x)
 }
 
 /*
-** ret_nonpst3: { xfail *-*-* }
+** ret_nonpst3:
 **	mov	w0, #?(?:0xffff|65535)
 **	ret
 */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 4/5] aarch64: Add ZT0
  2023-11-17 17:37 aarch64: Add support for SME2 Richard Sandiford
                   ` (2 preceding siblings ...)
  2023-11-17 17:39 ` [PATCH 3/5] aarch64: Add svboolx2_t Richard Sandiford
@ 2023-11-17 17:39 ` Richard Sandiford
  2023-11-17 17:42 ` [PATCH 5/5] aarch64: Add support for SME2 intrinsics Richard Sandiford
  4 siblings, 0 replies; 6+ messages in thread
From: Richard Sandiford @ 2023-11-17 17:39 UTC (permalink / raw)
  To: gcc-patches

SME2 adds a 512-bit lookup table called ZT0.  It is enabled
and disabled by PSTATE.ZA, just like ZA itself.  This patch
adds support for the register, including saving and restoring
its contents.

The code reuses the V8DI mode that was added for LS64, including
the associated memory classification rules.  (The ZT0 range
is more restricted than the LS64 range, but that's enforced
by predicates and constraints.)
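
For reference, here is a minimal sketch of how the new state is meant
to be used at the source level, based on the attributes exercised by
the tests below (the function names are invented for illustration):

  #pragma GCC target "+sme2"

  void produce () __arm_out("zt0");   /* writes ZT0; old contents are dead */
  void consume () __arm_in("zt0");    /* reads ZT0 and leaves it unchanged */
  void private_fn ();                 /* private-ZA function */

  __arm_new("zt0") void
  f (void)
  {
    produce ();
    private_fn ();  /* the compiler saves and restores ZT0 around this */
    consume ();
  }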

gcc/
	* config/aarch64/aarch64.md (ZT0_REGNUM): New constant.
	(LAST_FAKE_REGNUM): Bump to include it.
	* config/aarch64/aarch64.h (FIXED_REGISTERS): Add an entry for ZT0.
	(CALL_REALLY_USED_REGISTERS, REGISTER_NAMES): Likewise.
	(REG_CLASS_CONTENTS): Likewise.
	(machine_function): Add zt0_save_buffer.
	(CUMULATIVE_ARGS): Add shared_zt0_flags.
	* config/aarch64/aarch64.cc (aarch64_check_state_string): Handle zt0.
	(aarch64_fntype_pstate_za, aarch64_fndecl_pstate_za): Likewise.
	(aarch64_function_arg): Add the shared ZT0 flags as an extra
	limb of the parallel.
	(aarch64_init_cumulative_args): Initialize shared_zt0_flags.
	(aarch64_extra_live_on_entry): Handle ZT0_REGNUM.
	(aarch64_epilogue_uses): Likewise.
	(aarch64_get_zt0_save_buffer, aarch64_save_zt0): New functions.
	(aarch64_restore_zt0): Likewise.
	(aarch64_start_call_args): Reject calls to functions that share
	ZT0 from functions that have no ZT0 state.  Save ZT0 around shared-ZA
	calls that do not share ZT0.
	(aarch64_expand_call): Handle ZT0.  Reject calls to functions that
	share ZT0 but not ZA from functions with ZA state.
	(aarch64_end_call_args): Restore ZT0 after calls to shared-ZA functions
	that do not share ZT0.
	(aarch64_set_current_function): Require +sme2 for functions that
	have ZT0 state.
	(aarch64_function_attribute_inlinable_p): Don't allow functions to
	be inlined if they have local zt0 state.
	(AARCH64_IPA_CLOBBERS_ZT0): New constant.
	(aarch64_update_ipa_fn_target_info): Record asms that clobber ZT0.
	(aarch64_can_inline_p): Don't inline callees that clobber ZT0
	into functions that have ZT0 state.
	(aarch64_comp_type_attributes): Check for compatible ZT0 sharing.
	(aarch64_optimize_mode_switching): Use mode switching if the
	function has ZT0 state.
	(aarch64_mode_emit_local_sme_state): Save and restore ZT0 around
	calls to private-ZA functions.
	(aarch64_mode_needed_local_sme_state): Require ZA to be active
	for instructions that access ZT0.
	(aarch64_md_asm_adjust): Extend handling of ZA clobbers to ZT0.
	* config/aarch64/aarch64-c.cc (aarch64_define_unconditional_macros):
	Define __ARM_STATE_ZT0.
	* config/aarch64/aarch64-sme.md (UNSPECV_ASM_UPDATE_ZT0): New unspecv.
	(aarch64_asm_update_zt0): New insn.
	(UNSPEC_RESTORE_ZT0): New unspec.
	(aarch64_sme_ldr_zt0, aarch64_restore_zt0): New insns.
	(aarch64_sme_str_zt0): Likewise.

gcc/testsuite/
	* gcc.target/aarch64/sme/zt0_state_1.c: New test.
	* gcc.target/aarch64/sme/zt0_state_2.c: Likewise.
	* gcc.target/aarch64/sme/zt0_state_3.c: Likewise.
	* gcc.target/aarch64/sme/zt0_state_4.c: Likewise.
	* gcc.target/aarch64/sme/zt0_state_5.c: Likewise.
---
 gcc/config/aarch64/aarch64-c.cc               |   1 +
 gcc/config/aarch64/aarch64-sme.md             |  63 +++++
 gcc/config/aarch64/aarch64.cc                 | 205 ++++++++++++--
 gcc/config/aarch64/aarch64.h                  |  14 +-
 gcc/config/aarch64/aarch64.md                 |   7 +-
 .../gcc.target/aarch64/sme/zt0_state_1.c      |  65 +++++
 .../gcc.target/aarch64/sme/zt0_state_2.c      |  31 +++
 .../gcc.target/aarch64/sme/zt0_state_3.c      |   6 +
 .../gcc.target/aarch64/sme/zt0_state_4.c      |  53 ++++
 .../gcc.target/aarch64/sme/zt0_state_5.c      | 260 ++++++++++++++++++
 10 files changed, 670 insertions(+), 35 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/zt0_state_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/zt0_state_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/zt0_state_3.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/zt0_state_4.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/zt0_state_5.c

diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc
index 2a8ca46987a..017380b7563 100644
--- a/gcc/config/aarch64/aarch64-c.cc
+++ b/gcc/config/aarch64/aarch64-c.cc
@@ -74,6 +74,7 @@ aarch64_define_unconditional_macros (cpp_reader *pfile)
   builtin_define ("__GCC_ASM_FLAG_OUTPUTS__");
 
   builtin_define ("__ARM_STATE_ZA");
+  builtin_define ("__ARM_STATE_ZT0");
 
   /* Define keyword attributes like __arm_streaming as macros that expand
      to the associated [[...]] attribute.  Use __extension__ in the attribute
diff --git a/gcc/config/aarch64/aarch64-sme.md b/gcc/config/aarch64/aarch64-sme.md
index da0745f6570..505805e2ecf 100644
--- a/gcc/config/aarch64/aarch64-sme.md
+++ b/gcc/config/aarch64/aarch64-sme.md
@@ -27,7 +27,9 @@
 ;;
 ;; == Loads, stores and moves
 ;; ---- Single-vector loads
+;; ---- Table loads
 ;; ---- Single-vector stores
+;; ---- Table stores
 ;; ---- Single-vector moves
 ;; ---- Zeroing
 ;;
@@ -209,6 +211,7 @@ (define_c_enum "unspec" [
 
 (define_c_enum "unspecv" [
   UNSPECV_ASM_UPDATE_ZA
+  UNSPECV_ASM_UPDATE_ZT0
 ])
 
 ;; Use the ABI-defined routine to commit an uncommitted lazy save.
@@ -400,6 +403,19 @@ (define_insn "aarch64_asm_update_za"
   [(set_attr "type" "no_insn")]
 )
 
+;; A similar pattern for ZT0.
+(define_insn "aarch64_asm_update_zt0"
+  [(set (reg:V8DI ZT0_REGNUM)
+	(unspec_volatile:V8DI
+	  [(reg:V8DI ZT0_REGNUM)
+	   (reg:DI SME_STATE_REGNUM)
+	   (match_operand 0 "const_int_operand")]
+	  UNSPECV_ASM_UPDATE_ZT0))]
+  ""
+  ""
+  [(set_attr "type" "no_insn")]
+)
+
 ;; This pseudo-instruction is emitted as part of a call to a private-ZA
 ;; function from a function with ZA state.  It marks a natural place to set
 ;; up a lazy save, if that turns out to be necessary.  The save itself
@@ -544,6 +560,38 @@ (define_insn "@aarch64_sme_ldrn<mode>"
   "ldr\tza[%w0, %1], [%2, #%1, mul vl]"
 )
 
+;; -------------------------------------------------------------------------
+;; ---- Table loads
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - LDR
+;; -------------------------------------------------------------------------
+
+(define_c_enum "unspec" [
+  UNSPEC_RESTORE_ZT0
+])
+
+(define_insn "aarch64_sme_ldr_zt0"
+  [(set (reg:V8DI ZT0_REGNUM)
+	(match_operand:V8DI 0 "aarch64_sync_memory_operand" "Q"))
+   (use (reg:DI SME_STATE_REGNUM))]
+  "TARGET_SME2"
+  "ldr\tzt0, %0"
+)
+
+;; This version is used after calls to private-ZA functions.  Since ZT0_REGNUM
+;; represents the current function's state, it isn't clobbered by private-ZA
+;; functions, so we need to make it depend on the ZA reinitialization code.
+(define_insn "aarch64_restore_zt0"
+  [(set (reg:V8DI ZT0_REGNUM)
+	(unspec:V8DI
+	  [(reg:DI SME_STATE_REGNUM)
+	   (match_operand:V8DI 0 "aarch64_sync_memory_operand" "Q")]
+	  UNSPEC_RESTORE_ZT0))]
+  "TARGET_SME2"
+  "ldr\tzt0, %0"
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- Single-vector stores
 ;; -------------------------------------------------------------------------
@@ -614,6 +662,21 @@ (define_insn "@aarch64_sme_strn<mode>"
   "str\tza[%w0, %1], [%2, #%1, mul vl]"
 )
 
+;; -------------------------------------------------------------------------
+;; ---- Table stores
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - STR
+;; -------------------------------------------------------------------------
+
+(define_insn "aarch64_sme_str_zt0"
+  [(set (match_operand:V8DI 0 "aarch64_sync_memory_operand" "=Q")
+	(reg:V8DI ZT0_REGNUM))
+   (use (reg:DI SME_STATE_REGNUM))]
+  "TARGET_SME2"
+  "str\tzt0, %0"
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- Single-vector moves
 ;; -------------------------------------------------------------------------
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 36576159b4f..6a6ae1c723c 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -2911,7 +2911,8 @@ aarch64_check_state_string (tree name, tree value)
     }
 
   const char *state_name = TREE_STRING_POINTER (value);
-  if (strcmp (state_name, "za") != 0)
+  if (strcmp (state_name, "za") != 0
+      && strcmp (state_name, "zt0") != 0)
     {
       error ("unrecognized state string %qs", state_name);
       return false;
@@ -4457,7 +4458,8 @@ aarch64_fntype_shared_flags (const_tree fntype, const char *state_name)
 static aarch64_feature_flags
 aarch64_fntype_pstate_za (const_tree fntype)
 {
-  if (aarch64_fntype_shared_flags (fntype, "za"))
+  if (aarch64_fntype_shared_flags (fntype, "za")
+      || aarch64_fntype_shared_flags (fntype, "zt0"))
     return AARCH64_FL_ZA_ON;
 
   return 0;
@@ -4512,7 +4514,8 @@ aarch64_fndecl_has_state (tree fndecl, const char *state_name)
 static aarch64_feature_flags
 aarch64_fndecl_pstate_za (const_tree fndecl)
 {
-  if (aarch64_fndecl_has_new_state (fndecl, "za"))
+  if (aarch64_fndecl_has_new_state (fndecl, "za")
+      || aarch64_fndecl_has_new_state (fndecl, "zt0"))
     return AARCH64_FL_ZA_ON;
 
   return aarch64_fntype_pstate_za (TREE_TYPE (fndecl));
@@ -9330,9 +9333,11 @@ aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
 						  pcum->pcs_variant);
       rtx sme_mode_switch_args = aarch64_finish_sme_mode_switch_args (pcum);
       rtx shared_za_flags = gen_int_mode (pcum->shared_za_flags, SImode);
-      return gen_rtx_PARALLEL (VOIDmode, gen_rtvec (3, abi_cookie,
+      rtx shared_zt0_flags = gen_int_mode (pcum->shared_zt0_flags, SImode);
+      return gen_rtx_PARALLEL (VOIDmode, gen_rtvec (4, abi_cookie,
 						    sme_mode_switch_args,
-						    shared_za_flags));
+						    shared_za_flags,
+						    shared_zt0_flags));
     }
 
   aarch64_layout_arg (pcum_v, arg);
@@ -9370,6 +9375,8 @@ aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
   pcum->silent_p = silent_p;
   pcum->shared_za_flags
     = (fntype ? aarch64_fntype_shared_flags (fntype, "za") : 0U);
+  pcum->shared_zt0_flags
+    = (fntype ? aarch64_fntype_shared_flags (fntype, "zt0") : 0U);
   pcum->num_sme_mode_switch_args = 0;
 
   if (!silent_p
@@ -11516,6 +11523,13 @@ aarch64_extra_live_on_entry (bitmap regs)
       auto za_flags = aarch64_cfun_shared_flags ("za");
       if (za_flags != (AARCH64_STATE_SHARED | AARCH64_STATE_OUT))
 	bitmap_set_bit (regs, ZA_REGNUM);
+
+      /* Since ZT0 is call-clobbered, it is only live on input if
+	 it is explicitly shared, and is not a pure output.  */
+      auto zt0_flags = aarch64_cfun_shared_flags ("zt0");
+      if (zt0_flags != 0
+	  && zt0_flags != (AARCH64_STATE_SHARED | AARCH64_STATE_OUT))
+	bitmap_set_bit (regs, ZT0_REGNUM);
     }
 }
 
@@ -11544,6 +11558,8 @@ aarch64_epilogue_uses (int regno)
     return 1;
   if (regno == ZA_REGNUM && aarch64_cfun_shared_flags ("za") != 0)
     return 1;
+  if (regno == ZT0_REGNUM && aarch64_cfun_shared_flags ("zt0") != 0)
+    return 1;
   return 0;
 }
 
@@ -13237,6 +13253,40 @@ aarch64_restore_za (rtx tpidr2_block)
   emit_insn (gen_aarch64_tpidr2_restore ());
 }
 
+/* Return the ZT0 save buffer, creating one if necessary.  */
+
+static rtx
+aarch64_get_zt0_save_buffer ()
+{
+  if (!cfun->machine->zt0_save_buffer)
+    cfun->machine->zt0_save_buffer = assign_stack_local (V8DImode, 64, 128);
+  return cfun->machine->zt0_save_buffer;
+}
+
+/* Save ZT0 to the current function's save buffer.  */
+
+static void
+aarch64_save_zt0 ()
+{
+  rtx mem = aarch64_get_zt0_save_buffer ();
+  mem = replace_equiv_address (mem, force_reg (Pmode, XEXP (mem, 0)));
+  emit_insn (gen_aarch64_sme_str_zt0 (mem));
+}
+
+/* Restore ZT0 from the current function's save buffer.  FROM_LAZY_SAVE_P
+   is true if the load is happening after a call to a private-ZA function,
+   false if it can be treated as a normal load.  */
+
+static void
+aarch64_restore_zt0 (bool from_lazy_save_p)
+{
+  rtx mem = aarch64_get_zt0_save_buffer ();
+  mem = replace_equiv_address (mem, force_reg (Pmode, XEXP (mem, 0)));
+  emit_insn (from_lazy_save_p
+	     ? gen_aarch64_restore_zt0 (mem)
+	     : gen_aarch64_sme_ldr_zt0 (mem));
+}
+
 /* Implement TARGET_START_CALL_ARGS.  */
 
 static void
@@ -13257,6 +13307,10 @@ aarch64_start_call_args (cumulative_args_t ca_v)
       && !aarch64_cfun_has_state ("za"))
     error ("call to a function that shares %qs state from a function"
 	   " that has no %qs state", "za", "za");
+  else if ((ca->shared_zt0_flags & (AARCH64_STATE_IN | AARCH64_STATE_OUT))
+	   && !aarch64_cfun_has_state ("zt0"))
+    error ("call to a function that shares %qs state from a function"
+	   " that has no %qs state", "zt0", "zt0");
   else if (!TARGET_ZA && (ca->isa_mode & AARCH64_FL_ZA_ON))
     error ("call to a function that shares SME state from a function"
 	   " that has no SME state");
@@ -13266,6 +13320,13 @@ aarch64_start_call_args (cumulative_args_t ca_v)
      The code itself is inserted by the mode-switching pass.  */
   if (TARGET_ZA && !(ca->isa_mode & AARCH64_FL_ZA_ON))
     emit_insn (gen_aarch64_start_private_za_call ());
+
+  /* If this is a call to a shared-ZA function that doesn't share ZT0,
+     save and restore ZT0 around the call.  */
+  if (aarch64_cfun_has_state ("zt0")
+      && (ca->isa_mode & AARCH64_FL_ZA_ON)
+      && ca->shared_zt0_flags == 0)
+    aarch64_save_zt0 ();
 }
 
 /* This function is used by the call expanders of the machine description.
@@ -13278,8 +13339,8 @@ aarch64_start_call_args (cumulative_args_t ca_v)
        The second element is a PARALLEL that lists all the argument
        registers that need to be saved and restored around a change
        in PSTATE.SM, or const0_rtx if no such switch is needed.
-       The third element is a const_int that contains the sharing flags
-       for ZA.
+       The third and fourth elements are const_ints that contain the
+       sharing flags for ZA and ZT0 respectively.
    SIBCALL indicates whether this function call is normal call or sibling call.
    It will generate different pattern accordingly.  */
 
@@ -13293,16 +13354,28 @@ aarch64_expand_call (rtx result, rtx mem, rtx cookie, bool sibcall)
   rtx callee_abi = cookie;
   rtx sme_mode_switch_args = const0_rtx;
   unsigned int shared_za_flags = 0;
+  unsigned int shared_zt0_flags = 0;
   if (GET_CODE (cookie) == PARALLEL)
     {
       callee_abi = XVECEXP (cookie, 0, 0);
       sme_mode_switch_args = XVECEXP (cookie, 0, 1);
       shared_za_flags = INTVAL (XVECEXP (cookie, 0, 2));
+      shared_zt0_flags = INTVAL (XVECEXP (cookie, 0, 3));
     }
 
   gcc_assert (CONST_INT_P (callee_abi));
   auto callee_isa_mode = aarch64_callee_isa_mode (callee_abi);
 
+  if (aarch64_cfun_has_state ("za")
+      && (callee_isa_mode & AARCH64_FL_ZA_ON)
+      && !shared_za_flags)
+    {
+      sorry ("call to a function that shares state other than %qs"
+	     " from a function that has %qs state", "za", "za");
+      inform (input_location, "use %<__arm_preserves(\"za\")%> if the"
+	      " callee preserves ZA");
+    }
+
   gcc_assert (MEM_P (mem));
   callee = XEXP (mem, 0);
   mode = GET_MODE (callee);
@@ -13335,6 +13408,8 @@ aarch64_expand_call (rtx result, rtx mem, rtx cookie, bool sibcall)
      we want to know whether the call committed a lazy save.  */
   if (TARGET_ZA && !shared_za_flags)
     return_values.safe_push (gen_rtx_REG (VNx16BImode, ZA_SAVED_REGNUM));
+  if (shared_zt0_flags & AARCH64_STATE_OUT)
+    return_values.safe_push (gen_rtx_REG (V8DImode, ZT0_REGNUM));
 
   /* Create the new return value, if necessary.  */
   if (orig_num_return_values != return_values.length ())
@@ -13420,10 +13495,12 @@ aarch64_expand_call (rtx result, rtx mem, rtx cookie, bool sibcall)
     }
 
   /* Add any ZA-related information.
+
      ZA_REGNUM represents the current function's ZA state, rather than
      the contents of the ZA register itself.  We ensure that the function's
      ZA state is preserved by private-ZA call sequences, so the call itself
-     does not use or clobber ZA_REGNUM.  */
+     does not use or clobber ZA_REGNUM.  The same thing applies to
+     ZT0_REGNUM.  */
   if (TARGET_ZA)
     {
       /* The callee requires ZA to be active if the callee is shared-ZA,
@@ -13443,10 +13520,14 @@ aarch64_expand_call (rtx result, rtx mem, rtx cookie, bool sibcall)
 		 gen_rtx_REG (VNx16BImode, LOWERING_REGNUM));
 
       /* If the callee is a shared-ZA function, record whether it uses the
-	 current value of ZA.  */
+	 current value of ZA and ZT0.  */
       if (shared_za_flags & AARCH64_STATE_IN)
 	use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
 		 gen_rtx_REG (VNx16BImode, ZA_REGNUM));
+
+      if (shared_zt0_flags & AARCH64_STATE_IN)
+	use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
+		 gen_rtx_REG (V8DImode, ZT0_REGNUM));
     }
 }
 
@@ -13462,6 +13543,13 @@ aarch64_end_call_args (cumulative_args_t ca_v)
      The code itself is inserted by the mode-switching pass.  */
   if (TARGET_ZA && !(ca->isa_mode & AARCH64_FL_ZA_ON))
     emit_insn (gen_aarch64_end_private_za_call ());
+
+  /* If this is a call to a shared-ZA function that doesn't share ZT0,
+     save and restore ZT0 around the call.  */
+  if (aarch64_cfun_has_state ("zt0")
+      && (ca->isa_mode & AARCH64_FL_ZA_ON)
+      && ca->shared_zt0_flags == 0)
+    aarch64_restore_zt0 (false);
 }
 
 /* Emit call insn with PAT and do aarch64-specific handling.  */
@@ -20982,6 +21070,20 @@ aarch64_set_current_function (tree fndecl)
 		       : AARCH64_FL_DEFAULT_ISA_MODE);
   auto isa_flags = TREE_TARGET_OPTION (new_tree)->x_aarch64_isa_flags;
 
+  static bool reported_zt0_p;
+  if (!reported_zt0_p
+      && !(isa_flags & AARCH64_FL_SME2)
+      && fndecl
+      && aarch64_fndecl_has_state (fndecl, "zt0"))
+    {
+      error ("functions with %qs state require the ISA extension %qs",
+	     "zt0", "sme2");
+      inform (input_location, "you can enable %qs using the command-line"
+	      " option %<-march%>, or by using the %<target%>"
+	      " attribute or pragma", "sme2");
+      reported_zt0_p = true;
+    }
+
   /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
      the default have been handled by aarch64_save_restore_target_globals from
      aarch64_pragma_target_parse.  */
@@ -21593,9 +21695,10 @@ aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
 static bool
 aarch64_function_attribute_inlinable_p (const_tree fndecl)
 {
-  /* A function that has local ZA state cannot be inlined into its caller,
-     since we only support managing ZA switches at function scope.  */
-  return !aarch64_fndecl_has_new_state (fndecl, "za");
+  /* A function that has local SME state cannot be inlined into its caller,
+     since we only support managing PSTATE.ZA switches at function scope.  */
+  return (!aarch64_fndecl_has_new_state (fndecl, "za")
+	  && !aarch64_fndecl_has_new_state (fndecl, "zt0"));
 }
 
 /* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
@@ -21626,9 +21729,10 @@ aarch64_tribools_ok_for_inlining_p (int caller, int callee,
    Not meaningful for streaming-compatible functions.  */
 constexpr auto AARCH64_IPA_SM_FIXED = 1U << 0;
 
-/* Set if the function clobbers ZA.  Not meaningful for functions that
+/* Set if the function clobbers ZA and ZT0.  Not meaningful for functions that
    have ZA state.  */
 constexpr auto AARCH64_IPA_CLOBBERS_ZA = 1U << 1;
+constexpr auto AARCH64_IPA_CLOBBERS_ZT0 = 1U << 2;
 
 /* Implement TARGET_NEED_IPA_FN_TARGET_INFO.  */
 
@@ -21656,6 +21760,8 @@ aarch64_update_ipa_fn_target_info (unsigned int &info, const gimple *stmt)
 	  const char *clobber = TREE_STRING_POINTER (TREE_VALUE (op));
 	  if (strcmp (clobber, "za") == 0)
 	    info |= AARCH64_IPA_CLOBBERS_ZA;
+	  if (strcmp (clobber, "zt0") == 0)
+	    info |= AARCH64_IPA_CLOBBERS_ZT0;
 	}
     }
   if (auto *call = dyn_cast<const gcall *> (stmt))
@@ -21731,21 +21837,25 @@ aarch64_can_inline_p (tree caller, tree callee)
       && callee_has_property (AARCH64_IPA_SM_FIXED))
     return false;
 
-  /* aarch64_function_attribute_inlinable_p prevents new-ZA functions
-     from being inlined into others.  We also need to prevent inlining
-     of shared-ZA functions into functions without ZA state, since this
-     is an error condition.
+  /* aarch64_function_attribute_inlinable_p prevents new-ZA and new-ZT0
+     functions from being inlined into others.  We also need to prevent
+     inlining of shared-ZA functions into functions without ZA state,
+     since this is an error condition.
 
      The only other problematic case for ZA is inlining a function that
-     directly clobbers ZA into a function that has ZA state.  */
+     directly clobbers ZA or ZT0 into a function that has ZA or ZT0 state.  */
   auto caller_za = (caller_opts->x_aarch64_isa_flags & AARCH64_FL_ZA_ON);
   auto callee_za = (callee_opts->x_aarch64_isa_flags & AARCH64_FL_ZA_ON);
   if (!caller_za && callee_za)
     return false;
-  if (caller_za
-      && !callee_za
+  if (!callee_za
+      && aarch64_fndecl_has_state (caller, "za")
       && callee_has_property (AARCH64_IPA_CLOBBERS_ZA))
     return false;
+  if (!callee_za
+      && aarch64_fndecl_has_state (caller, "zt0")
+      && callee_has_property (AARCH64_IPA_CLOBBERS_ZT0))
+    return false;
 
   /* Allow non-strict aligned functions inlining into strict
      aligned ones.  */
@@ -29826,6 +29936,9 @@ aarch64_comp_type_attributes (const_tree type1, const_tree type2)
   if (aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type1), "za")
       != aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type2), "za"))
     return 0;
+  if (aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type1), "zt0")
+      != aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type2), "zt0"))
+    return 0;
   return 1;
 }
 
@@ -30293,7 +30406,9 @@ aarch64_optimize_mode_switching (aarch64_mode_entity entity)
 {
   bool have_sme_state = (aarch64_cfun_incoming_pstate_za () != 0
 			 || (aarch64_cfun_has_new_state ("za")
-			     && df_regs_ever_live_p (ZA_REGNUM)));
+			     && df_regs_ever_live_p (ZA_REGNUM))
+			 || (aarch64_cfun_has_new_state ("zt0")
+			     && df_regs_ever_live_p (ZT0_REGNUM)));
 
   if (have_sme_state && nonlocal_goto_handler_labels)
     {
@@ -30380,6 +30495,11 @@ aarch64_mode_emit_local_sme_state (aarch64_local_sme_state mode,
 	     In that case, ZA still contains the current function's ZA state,
 	     and we just need to cancel the lazy save.  */
 	  emit_insn (gen_aarch64_clear_tpidr2 ());
+
+	  /* Restore the ZT0 state, if we have some.  */
+	  if (aarch64_cfun_has_state ("zt0"))
+	    aarch64_restore_zt0 (true);
+
 	  return;
 	}
 
@@ -30388,6 +30508,10 @@ aarch64_mode_emit_local_sme_state (aarch64_local_sme_state mode,
 	  /* Retrieve the current function's ZA state from the lazy save
 	     buffer.  */
 	  aarch64_restore_za (aarch64_get_tpidr2_ptr ());
+
+	  /* Restore the ZT0 state, if we have some.  */
+	  if (aarch64_cfun_has_state ("zt0"))
+	    aarch64_restore_zt0 (true);
 	  return;
 	}
 
@@ -30404,6 +30528,11 @@ aarch64_mode_emit_local_sme_state (aarch64_local_sme_state mode,
 
 	     Both cases leave ZA zeroed.  */
 	  emit_insn (gen_aarch64_smstart_za ());
+
+	  /* Restore the ZT0 state, if we have some.  */
+	  if (prev_mode == aarch64_local_sme_state::OFF
+	      && aarch64_cfun_has_state ("zt0"))
+	    aarch64_restore_zt0 (true);
 	  return;
 	}
 
@@ -30422,6 +30551,10 @@ aarch64_mode_emit_local_sme_state (aarch64_local_sme_state mode,
 	  || prev_mode == aarch64_local_sme_state::ACTIVE_DEAD
 	  || prev_mode == aarch64_local_sme_state::INACTIVE_CALLER)
 	{
+	  /* Save the ZT0 state, if we have some.  */
+	  if (aarch64_cfun_has_state ("zt0"))
+	    aarch64_save_zt0 ();
+
 	  /* A transition from ACTIVE_LIVE to INACTIVE_LOCAL is the usual
 	     case of setting up a lazy save buffer before a call.
 	     A transition from INACTIVE_CALLER is similar, except that
@@ -30449,6 +30582,13 @@ aarch64_mode_emit_local_sme_state (aarch64_local_sme_state mode,
   if (mode == aarch64_local_sme_state::INACTIVE_CALLER
       || mode == aarch64_local_sme_state::OFF)
     {
+      /* Save the ZT0 state, if we have some.  */
+      if ((prev_mode == aarch64_local_sme_state::ACTIVE_LIVE
+	   || prev_mode == aarch64_local_sme_state::ACTIVE_DEAD)
+	  && mode == aarch64_local_sme_state::OFF
+	  && aarch64_cfun_has_state ("zt0"))
+	aarch64_save_zt0 ();
+
       /* The transition to INACTIVE_CALLER is used before returning from
 	 new("za") functions.  Any state in ZA belongs to the current
 	 function rather than a caller, but that state is no longer
@@ -30597,8 +30737,10 @@ aarch64_mode_needed_local_sme_state (rtx_insn *insn, HARD_REG_SET live)
 	    : aarch64_local_sme_state::OFF);
 
   /* Force ZA to contain the current function's ZA state if INSN wants
-     to access it.  */
-  if (aarch64_insn_references_sme_state_p (insn, ZA_REGNUM))
+     to access it.  Do the same for accesses to ZT0, since ZA and ZT0
+     are both controlled by PSTATE.ZA.  */
+  if (aarch64_insn_references_sme_state_p (insn, ZA_REGNUM)
+      || aarch64_insn_references_sme_state_p (insn, ZT0_REGNUM))
     return (TEST_HARD_REG_BIT (live, ZA_REGNUM)
 	    ? aarch64_local_sme_state::ACTIVE_LIVE
 	    : aarch64_local_sme_state::ACTIVE_DEAD);
@@ -30883,27 +31025,34 @@ aarch64_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &inputs,
      write directly.   Use a separate insn to model the effect.
 
      We must ensure that ZA is active on entry, which is enforced by using
-     SME_STATE_REGNUM.  The asm must ensure that ZA is active on return.  */
+     SME_STATE_REGNUM.  The asm must ensure that ZA is active on return.
+
+     The same thing applies to ZT0.  */
   if (TARGET_ZA)
     for (unsigned int i = clobbers.length (); i-- > 0; )
       {
 	rtx x = clobbers[i];
-	if (REG_P (x) && REGNO (x) == ZA_REGNUM)
+	if (REG_P (x)
+	    && (REGNO (x) == ZA_REGNUM || REGNO (x) == ZT0_REGNUM))
 	  {
 	    auto id = cfun->machine->next_asm_update_za_id++;
 
 	    start_sequence ();
 	    if (seq)
 	      emit_insn (seq);
-	    emit_insn (gen_aarch64_asm_update_za (gen_int_mode (id, SImode)));
+	    rtx id_rtx = gen_int_mode (id, SImode);
+	    emit_insn (REGNO (x) == ZA_REGNUM
+		       ? gen_aarch64_asm_update_za (id_rtx)
+		       : gen_aarch64_asm_update_zt0 (id_rtx));
 	    seq = get_insns ();
 	    end_sequence ();
 
-	    uses.safe_push (gen_rtx_REG (VNx16QImode, ZA_REGNUM));
+	    auto mode = REGNO (x) == ZA_REGNUM ? VNx16QImode : V8DImode;
+	    uses.safe_push (gen_rtx_REG (mode, REGNO (x)));
 	    uses.safe_push (gen_rtx_REG (DImode, SME_STATE_REGNUM));
 
 	    clobbers.ordered_remove (i);
-	    CLEAR_HARD_REG_BIT (clobbered_regs, ZA_REGNUM);
+	    CLEAR_HARD_REG_BIT (clobbered_regs, REGNO (x));
 	  }
       }
   return seq;
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 14205ce34b3..e42be08bbd3 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -485,7 +485,7 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = AARCH64_FL_SM_OFF;
     0, 0, 0, 0,   0, 0, 0, 0,   /* P0 - P7 */           \
     0, 0, 0, 0,   0, 0, 0, 0,   /* P8 - P15 */          \
     1, 1,			/* FFR and FFRT */	\
-    1, 1, 1, 1, 1, 1, 1		/* Fake registers */	\
+    1, 1, 1, 1, 1, 1, 1, 1	/* Fake registers */	\
   }
 
 /* X30 is marked as caller-saved which is in line with regular function call
@@ -509,7 +509,7 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = AARCH64_FL_SM_OFF;
     1, 1, 1, 1,   1, 1, 1, 1,	/* P0 - P7 */		\
     1, 1, 1, 1,   1, 1, 1, 1,	/* P8 - P15 */		\
     1, 1,			/* FFR and FFRT */	\
-    0, 0, 0, 0, 0, 0, 0		/* Fake registers */	\
+    0, 0, 0, 0, 0, 0, 0, 0	/* Fake registers */	\
   }
 
 #define REGISTER_NAMES						\
@@ -527,7 +527,7 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = AARCH64_FL_SM_OFF;
     "p8",  "p9",  "p10", "p11", "p12", "p13", "p14", "p15",	\
     "ffr", "ffrt",						\
     "lowering", "tpidr2_block", "sme_state", "tpidr2_setup",	\
-    "za_free", "za_saved", "za"					\
+    "za_free", "za_saved", "za", "zt0"				\
   }
 
 /* Generate the register aliases for core register N */
@@ -770,7 +770,7 @@ enum reg_class
   { 0x00000000, 0x00000000, 0x000ffff0 },	/* PR_REGS */		\
   { 0x00000000, 0x00000000, 0x00300000 },	/* FFR_REGS */		\
   { 0x00000000, 0x00000000, 0x003ffff0 },	/* PR_AND_FFR_REGS */	\
-  { 0x00000000, 0x00000000, 0x1fc00000 },	/* FAKE_REGS */		\
+  { 0x00000000, 0x00000000, 0x3fc00000 },	/* FAKE_REGS */		\
   { 0xffffffff, 0xffffffff, 0x000fffff }	/* ALL_REGS */		\
 }
 
@@ -980,6 +980,9 @@ typedef struct GTY (()) machine_function
      or null if none.  */
   rtx za_save_buffer;
 
+  /* A stack slot that stores the contents of the function's ZT0 state.  */
+  rtx zt0_save_buffer;
+
   bool label_is_assembled;
 
   /* True if we've expanded at least one call to a function that changes
@@ -1061,8 +1064,9 @@ typedef struct
 				   raise an error for invalid calls.  */
 
   /* AARCH64_STATE_* flags that describe whether the function shares ZA
-     with its callers.  */
+     and ZT0 with its callers.  */
   unsigned int shared_za_flags;
+  unsigned int shared_zt0_flags;
 
   /* A list of registers that need to be saved and restored around a
      change to PSTATE.SM.  An auto_vec would be more convenient, but those
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 79d4614924d..a50c3ea50c9 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -156,9 +156,12 @@ (define_constants
     ;; The contents persist even when the architected ZA is off.  Private-ZA
     ;; functions have no effect on its contents.
     (ZA_REGNUM 92)
-    ;; ----------------------------------------------------------------
+
+    ;; Similarly represents the contents of the current function's ZT0 state.
+    (ZT0_REGNUM 93)
+
     (FIRST_FAKE_REGNUM	LOWERING_REGNUM)
-    (LAST_FAKE_REGNUM	ZA_REGNUM)
+    (LAST_FAKE_REGNUM	ZT0_REGNUM)
     ;; ----------------------------------------------------------------
 
     ;; The pair of scratch registers used for stack probing with -fstack-check.
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/zt0_state_1.c b/gcc/testsuite/gcc.target/aarch64/sme/zt0_state_1.c
new file mode 100644
index 00000000000..05da587d4b4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/zt0_state_1.c
@@ -0,0 +1,65 @@
+// { dg-options "" }
+
+#pragma GCC target "+sme2"
+
+void share_za_zt0_a() __arm_inout("za", "zt0");
+void share_za_zt0_b() __arm_inout("za", "zt0");
+
+void share_za_preserve_zt0() __arm_inout("za") __arm_preserves("zt0");
+void share_zt0_preserve_za() __arm_inout("zt0") __arm_preserves("za");
+
+__arm_new("za", "zt0") void new_za_zt0_a() {
+  share_za_zt0_a();
+  share_za_zt0_b();
+}
+
+__arm_new("zt0", "za") void new_za_zt0_b() {
+  share_za_zt0_a();
+  share_za_zt0_b();
+}
+
+__arm_new("zt0") void new_za_zt0_c();
+__arm_new("za") void new_za_zt0_c() {
+  share_za_zt0_a();
+  share_za_zt0_b();
+}
+
+__arm_new("za") void new_za_zt0_d();
+__arm_new("zt0") void new_za_zt0_d() {
+  share_za_zt0_a();
+  share_za_zt0_b();
+}
+
+__arm_new("zt0", "za") void new_za_zt0_e();
+void new_za_zt0_e() {
+  share_za_zt0_a();
+  share_za_zt0_b();
+}
+
+__arm_new("zt0") void new_zt0_a() {
+  share_za_zt0_a(); // { dg-error {call to a function that shares 'za' state from a function that has no 'za' state} }
+}
+
+__arm_new("zt0") void new_zt0_b();
+void new_zt0_b() {
+  share_za_preserve_zt0(); // { dg-error {call to a function that shares 'za' state from a function that has no 'za' state} }
+}
+
+__arm_new("zt0") void new_zt0_c();
+void new_zt0_c() {
+  share_zt0_preserve_za();
+}
+
+__arm_new("za") void new_za_a() {
+  share_za_zt0_a(); // { dg-error {call to a function that shares 'zt0' state from a function that has no 'zt0' state} }
+}
+
+__arm_new("za") void new_za_b();
+void new_za_b() {
+  share_za_preserve_zt0();
+}
+
+__arm_new("za") void new_za_c();
+void new_za_c() {
+  share_zt0_preserve_za(); // { dg-error {call to a function that shares 'zt0' state from a function that has no 'zt0' state} }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/zt0_state_2.c b/gcc/testsuite/gcc.target/aarch64/sme/zt0_state_2.c
new file mode 100644
index 00000000000..17cd84437d7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/zt0_state_2.c
@@ -0,0 +1,31 @@
+// { dg-options "" }
+
+void invalid_a() __arm_inout("za");
+void invalid_a() __arm_inout("za", "zt0"); // { dg-error {conflicting types} }
+
+void invalid_b() __arm_inout("za", "zt0");
+void invalid_b() __arm_inout("zt0"); // { dg-error {conflicting types} }
+
+void invalid_c() __arm_in("zt0") __arm_inout("za");
+void invalid_c() __arm_inout("zt0", "za"); // { dg-error {conflicting types} }
+
+void invalid_d() __arm_inout("zt0");
+void invalid_d() __arm_out("zt0"); // { dg-error {conflicting types} }
+
+void invalid_e() __arm_in("zt0");
+void invalid_e() __arm_out("zt0"); // { dg-error {conflicting types} }
+
+void invalid_f() __arm_in("zt0");
+void invalid_f() __arm_preserves("zt0"); // { dg-error {conflicting types} }
+
+void valid_a() __arm_inout("zt0") __arm_inout("za");
+void valid_a() __arm_inout("zt0", "za");
+
+void valid_b() __arm_inout("za") __arm_inout("zt0");
+void valid_b() __arm_inout("zt0") __arm_inout("za");
+
+void valid_c() __arm_inout("za", "zt0");
+void valid_c() __arm_inout("zt0", "za");
+
+void valid_d() __arm_inout("zt0", "za");
+void valid_d() __arm_inout("za", "zt0");
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/zt0_state_3.c b/gcc/testsuite/gcc.target/aarch64/sme/zt0_state_3.c
new file mode 100644
index 00000000000..2489ea21de9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/zt0_state_3.c
@@ -0,0 +1,6 @@
+// { dg-options "" }
+
+#pragma GCC target "+sme2"
+
+void foo() __arm_inout("zt0");
+void bar() __arm_inout("za", "zt0") { foo(); } // { dg-message {call to a function that shares state other than 'za' from a function that has 'za' state} }
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/zt0_state_4.c b/gcc/testsuite/gcc.target/aarch64/sme/zt0_state_4.c
new file mode 100644
index 00000000000..29999003d8a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/zt0_state_4.c
@@ -0,0 +1,53 @@
+// { dg-options "-O -fomit-frame-pointer -fno-optimize-sibling-calls" }
+// { dg-final { check-function-bodies "**" "" } }
+
+#pragma GCC target "+sme2"
+
+void inout_za() __arm_inout("za");
+void inout_za_zt0() __arm_inout("za", "zt0");
+
+void inout_za_out_zt0() __arm_inout("za") __arm_out("zt0");
+void inout_za_in_zt0() __arm_inout("za") __arm_in("zt0");
+
+/*
+** test1:
+**	str	x30, \[sp, #?-16\]!
+**	bl	inout_za_zt0
+**	ldr	x30, \[sp\], #?16
+**	ret
+*/
+void test1() __arm_inout("za", "zt0")
+{
+  inout_za_zt0();
+}
+
+/*
+** test2:
+**	...
+**	str	zt0, \[(?:x[0-9]+|sp)\]
+**	...
+**	bl	inout_za
+**	...
+**	ldr	zt0, \[(?:x[0-9]+|sp)\]
+**	...
+**	ret
+*/
+void test2() __arm_inout("za", "zt0")
+{
+  inout_za();
+}
+
+/*
+** test3:
+**	...
+**	bl	inout_za
+**	bl	inout_za_out_zt0
+**	[^\n]+
+**	ret
+*/
+void test3() __arm_inout("za", "zt0")
+{
+  inout_za_in_zt0();
+  inout_za();
+  inout_za_out_zt0();
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/zt0_state_5.c b/gcc/testsuite/gcc.target/aarch64/sme/zt0_state_5.c
new file mode 100644
index 00000000000..e18b395476c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/zt0_state_5.c
@@ -0,0 +1,260 @@
+// { dg-options "-O -fno-optimize-sibling-calls" }
+// { dg-final { check-function-bodies "**" "" } }
+
+#pragma GCC target "+sme2"
+
+void private_zt0();
+void out_zt0() __arm_out("zt0");
+void in_zt0() __arm_in("zt0");
+void inout_zt0() __arm_inout("zt0");
+void preserves_zt0() __arm_preserves("zt0");
+
+/*
+** test1:
+**	ret
+*/
+__arm_new("zt0") void test1()
+{
+}
+
+/*
+** test2:
+**	ldr	w0, \[x0\]
+**	ret
+*/
+__arm_new("zt0") int test2(int *ptr)
+{
+  return *ptr;
+}
+
+/*
+** test3:
+**	stp	[^\n]+
+**	mov	x29, sp
+**	bl	private_zt0
+** (
+**	mov	w0, 0
+**	ldp	[^\n]+
+** |
+**	ldp	[^\n]+
+**	mov	w0, 0
+** )
+**	ret
+*/
+__arm_new("zt0") int test3()
+{
+  private_zt0();
+  return 0;
+}
+
+/*
+** test4:
+**	...
+**	mrs	x0, tpidr2_el0
+**	cbz	x0, [^\n]+
+**	bl	__arm_tpidr2_save
+**	msr	tpidr2_el0, xzr
+**	smstart	za
+**	bl	in_zt0
+**	smstop	za
+**	ldp	[^\n]+
+**	ret
+*/
+__arm_new("zt0") void test4()
+{
+  in_zt0(); // Uses zeroed contents.
+}
+
+/*
+** test5:
+**	...
+**	mrs	x0, tpidr2_el0
+**	cbz	x0, [^\n]+
+**	bl	__arm_tpidr2_save
+**	msr	tpidr2_el0, xzr
+**	smstop	za
+**	bl	private_zt0
+**	smstart	za
+**	bl	out_zt0
+**	bl	in_zt0
+**	...
+**	smstop	za
+**	bl	private_zt0
+**	ldp	[^\n]+
+**	ret
+*/
+__arm_new("zt0") void test5()
+{
+  private_zt0();
+  out_zt0();
+  in_zt0();
+  private_zt0();
+}
+
+// Despite the long test, there shouldn't be too much scope for variation
+// here.  The point is to test both correctness and code quality.
+/*
+** test6:
+**	stp	[^\n]+
+**	mov	x29, sp
+**	mrs	x0, tpidr2_el0
+**	cbz	x0, [^\n]+
+**	bl	__arm_tpidr2_save
+**	msr	tpidr2_el0, xzr
+**	smstart	za
+**	bl	out_zt0
+**	...
+**	str	zt0, [^\n]+
+**	smstop	za
+**	bl	private_zt0
+**	smstart	za
+**	...
+**	ldr	zt0, [^\n]+
+**	bl	in_zt0
+**	smstop	za
+**	ldp	[^\n]+
+**	ret
+*/
+__arm_new("zt0") void test6()
+{
+  out_zt0();
+  private_zt0();
+  in_zt0();
+}
+
+// Rely on previous tests for the part leading up to the smstart.
+/*
+** test7:
+**	...
+**	smstart	za
+**	bl	out_zt0
+**	bl	in_zt0
+**	...
+**	smstop	za
+**	bl	private_zt0
+**	smstart	za
+**	bl	out_zt0
+**	bl	in_zt0
+**	smstop	za
+**	ldp	[^\n]+
+**	ret
+*/
+__arm_new("zt0") void test7()
+{
+  out_zt0();
+  in_zt0();
+  private_zt0();
+  out_zt0();
+  in_zt0();
+}
+
+/*
+** test8:
+**	...
+**	smstart	za
+**	bl	out_zt0
+**	bl	in_zt0
+**	...
+**	smstop	za
+**	bl	private_zt0
+**	smstart	za
+**	bl	out_zt0
+**	bl	in_zt0
+**	...
+**	smstop	za
+**	bl	private_zt0
+**	ldp	[^\n]+
+**	ret
+*/
+__arm_new("zt0") void test8()
+{
+  out_zt0();
+  in_zt0();
+  private_zt0();
+  out_zt0();
+  in_zt0();
+  private_zt0();
+}
+
+/*
+** test9:
+**	...
+**	str	zt0, [^\n]+
+**	smstop	za
+**	bl	private_zt0
+**	bl	private_zt0
+**	bl	private_zt0
+**	bl	private_zt0
+**	smstart	za
+**	...
+**	ldr	zt0, [^\n]+
+**	bl	in_zt0
+**	smstop	za
+**	...
+*/
+__arm_new("zt0") void test9()
+{
+  out_zt0();
+  private_zt0();
+  private_zt0();
+  private_zt0();
+  private_zt0();
+  in_zt0();
+}
+
+/*
+** test10:
+**	ldr	(w[0-9]+), \[x0\]
+**	cbz	\1, [^\n]+
+**	ldr	[^\n]+
+**	add	[^\n]+
+**	str	[^\n]+
+**	ret
+**	...
+*/
+__arm_new("zt0") void test10(volatile int *ptr)
+{
+  if (__builtin_expect (*ptr != 0, 1))
+    *ptr = *ptr + 1;
+  else
+    inout_zt0();
+}
+
+/*
+** test11:
+**	...
+**	ldr	w[0-9]+, [^\n]+
+**	add	(w[0-9]+), [^\n]+
+**	str	\1, [^\n]+
+**	...
+**	ret
+**	mrs	x[0-9]+, tpidr2_el0
+**	...
+**	smstart	za
+**	bl	inout_zt0
+**	ldr	(w[0-9]+), [^\n]+
+**	cbnz	\2, [^\n]+
+**	smstop	za
+**	...
+*/
+__arm_new("zt0") void test11(volatile int *ptr)
+{
+  if (__builtin_expect (*ptr == 0, 0))
+    do
+      inout_zt0();
+    while (*ptr);
+  else
+    *ptr += 1;
+}
+
+__arm_new("zt0") void test12(volatile int *ptr)
+{
+  do
+    {
+      inout_zt0();
+      private_zt0();
+    }
+  while (*ptr);
+  out_zt0();
+  in_zt0();
+}
-- 
2.25.1


^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 5/5] aarch64: Add support for SME2 intrinsics
  2023-11-17 17:37 aarch64: Add support for SME2 Richard Sandiford
                   ` (3 preceding siblings ...)
  2023-11-17 17:39 ` [PATCH 4/5] aarch64: Add ZT0 Richard Sandiford
@ 2023-11-17 17:42 ` Richard Sandiford
  4 siblings, 0 replies; 6+ messages in thread
From: Richard Sandiford @ 2023-11-17 17:42 UTC (permalink / raw)
  To: gcc-patches

This patch adds support for the SME2 <arm_sme.h> intrinsics.  The
convention I've used is to put stuff in aarch64-sve-builtins-sme.*
if it relates to ZA, ZT0, the streaming vector length, or other
such SME state.  Things that operate purely on predicates and
vectors go in aarch64-sve-builtins-sve2.* instead.  Some of these
will later be picked up for SVE2p1.
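
For a flavour of the new interface, here is a hedged sketch of an SME2
multi-vector sequence.  It assumes the ACLE spellings svptrue_c32,
svld1_f32_x2 and svst1_f32_x2; the names are illustrative rather than
a statement of what this patch tests:

  #include <arm_sme.h>

  /* Copy two vectors' worth of floats under an svcount_t predicate.  */
  void
  copy2 (float *dst, const float *src) __arm_streaming
  {
    svcount_t pn = svptrue_c32 ();
    svfloat32x2_t data = svld1_f32_x2 (pn, src);
    svst1_f32_x2 (pn, dst, data);
  }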

In truth, the shapes and C overload resolution code is in general
beginning to get a bit out of hand.  Cleaning it up is GCC 15
material, though.

We previously used Uph internally as a constraint for 16-bit
immediates to atomic instructions.  However, we need a user-facing
constraint for the upper predicate registers (already available as
PR_HI_REGS), and Uph makes a natural pair with the existing Upl.
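
By analogy with the Up2 test earlier in the series, the user-visible
effect is that an asm operand can now be forced into one of the upper
predicate registers (a sketch only; the particular register is the
compiler's choice within p8-p15):

  #include <arm_sve.h>

  void
  use_high_pred (svbool_t p)
  {
    /* "Uph" restricts the operand to the upper predicate registers
       p8-p15; the particular register is the compiler's choice.  */
    asm volatile ("" :: "Uph" (p));
  }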

gcc/
	* config/aarch64/aarch64.h (TARGET_STREAMING_SME2): New macro.
	(P_ALIASES): Likewise.
	(REGISTER_NAMES): Add pn aliases of the predicate registers.
	(W8_W11_REGNUM_P): New macro.
	(W8_W11_REGS): New register class.
	(REG_CLASS_NAMES, REG_CLASS_CONTENTS): Update accordingly.
	* config/aarch64/aarch64.cc (aarch64_print_operand): Add support
	for %K, which prints a predicate as a counter.  Handle tuples of
	predicates.
	(aarch64_regno_regclass): Handle W8_W11_REGS.
	(aarch64_class_max_nregs): Likewise.
	* config/aarch64/constraints.md (Uci, Uw2, Uw4): New constraints.
	(x, y): Move further up file.
	(Uph): Redefine as the high predicate registers, renaming the old
	constraint to...
	(Uih): ...this.
	* config/aarch64/predicates.md (const_0_to_7_operand): New predicate.
	(const_0_to_4_step_4_operand, const_0_to_6_step_2_operand): Likewise.
	(const_0_to_12_step_4_operand, const_0_to_14_step_2_operand): Likewise.
	(aarch64_simd_shift_imm_qi): Use const_0_to_7_operand.
	* config/aarch64/iterators.md (VNx16SI_ONLY, VNx8SI_ONLY)
	(VNx8DI_ONLY, SVE_FULL_BHSIx2, SVE_FULL_HF, SVE_FULL_SIx2_SDIx4)
	(SVE_FULL_BHS, SVE_FULLx24, SVE_DIx24, SVE_BHSx24, SVE_Ix24)
	(SVE_Fx24, SVE_SFx24, SME_ZA_BIx24, SME_ZA_BHIx124, SME_ZA_BHIx24)
	(SME_ZA_HFx124, SME_ZA_HFx24, SME_ZA_HIx124, SME_ZA_HIx24)
	(SME_ZA_SDIx24, SME_ZA_SDFx24): New mode iterators.
	(UNSPEC_REVD, UNSPEC_CNTP_C, UNSPEC_PEXT, UNSPEC_PEXTx2): New unspecs.
	(UNSPEC_PSEL, UNSPEC_PTRUE_C, UNSPEC_SQRSHR, UNSPEC_SQRSHRN)
	(UNSPEC_SQRSHRU, UNSPEC_SQRSHRUN, UNSPEC_UQRSHR, UNSPEC_UQRSHRN)
	(UNSPEC_UZP, UNSPEC_UZPQ, UNSPEC_ZIP, UNSPEC_ZIPQ, UNSPEC_BFMLSLB)
	(UNSPEC_BFMLSLT, UNSPEC_FCVTN, UNSPEC_FDOT, UNSPEC_SQCVT): Likewise.
	(UNSPEC_SQCVTN, UNSPEC_SQCVTU, UNSPEC_SQCVTUN, UNSPEC_UQCVT): Likewise.
	(UNSPEC_SME_ADD, UNSPEC_SME_ADD_WRITE, UNSPEC_SME_BMOPA): Likewise.
	(UNSPEC_SME_BMOPS, UNSPEC_SME_FADD, UNSPEC_SME_FDOT, UNSPEC_SME_FVDOT)
	(UNSPEC_SME_FMLA, UNSPEC_SME_FMLS, UNSPEC_SME_FSUB, UNSPEC_SME_READ)
	(UNSPEC_SME_SDOT, UNSPEC_SME_SVDOT, UNSPEC_SME_SMLA, UNSPEC_SME_SMLS)
	(UNSPEC_SME_SUB, UNSPEC_SME_SUB_WRITE, UNSPEC_SME_SUDOT): Likewise.
	(UNSPEC_SME_SUVDOT, UNSPEC_SME_UDOT, UNSPEC_SME_UVDOT): Likewise.
	(UNSPEC_SME_UMLA, UNSPEC_SME_UMLS, UNSPEC_SME_USDOT): Likewise.
	(UNSPEC_SME_USVDOT, UNSPEC_SME_WRITE): Likewise.
	(Vetype, VNARROW, V2XWIDE, Ventype, V_INT_EQUIV, v_int_equiv)
	(VSINGLE, vsingle, b): Add tuple modes.
	(v2xwide, za32_offset_range, za64_offset_range, za32_long)
	(za32_last_offset, vg_modifier, z_suffix, aligned_operand)
	(aligned_fpr): New mode attributes.
	(SVE_INT_BINARY_MULTI, SVE_INT_BINARY_SINGLE, SVE_INT_BINARY_MULTI)
	(SVE_FP_BINARY_MULTI): New int iterators.
	(SVE_BFLOAT_TERNARY_LONG): Add UNSPEC_BFMLSLB and UNSPEC_BFMLSLT.
	(SVE_BFLOAT_TERNARY_LONG_LANE): Likewise.
	(SVE_WHILE_ORDER, SVE2_INT_SHIFT_IMM_NARROWxN, SVE_QCVTxN)
	(SVE2_SFx24_UNARY, SVE2_x24_PERMUTE, SVE2_x24_PERMUTEQ)
	(UNSPEC_REVD_ONLY, SME2_INT_MOP, SME2_BMOP, SME_BINARY_SLICE_SDI)
	(SME_BINARY_SLICE_SDF, SME_BINARY_WRITE_SLICE_SDI, SME_INT_DOTPROD)
	(SME_INT_DOTPROD_LANE, SME_FP_DOTPROD, SME_FP_DOTPROD_LANE)
	(SME_INT_TERNARY_SLICE, SME_FP_TERNARY_SLICE, BHSD_BITS)
	(LUTI_BITS): New int iterators.
	(optab, sve_int_op): Handle the new unspecs.
	(sme_int_op, has_16bit_form): New int attributes.
	(bits_etype): Handle 64.
	* config/aarch64/aarch64.md (UNSPEC_LD1_SVE_COUNT): New unspec.
	(UNSPEC_ST1_SVE_COUNT, UNSPEC_LDNT1_SVE_COUNT): Likewise.
	(UNSPEC_STNT1_SVE_COUNT): Likewise.
	* config/aarch64/atomics.md (cas_short_expected_imm): Use Uih
	rather than Uph for HImode immediates.
	* config/aarch64/aarch64-sve.md (@aarch64_ld1<SVE_FULLx24:mode>)
	(@aarch64_ldnt1<SVE_FULLx24:mode>, @aarch64_st1<SVE_FULLx24:mode>)
	(@aarch64_stnt1<SVE_FULLx24:mode>): New patterns.
	(@aarch64_<sur>dot_prod_lane<vsi2qi>): Extend to...
	(@aarch64_<sur>dot_prod_lane<SVE_FULL_SDI:mode><SVE_FULL_BHI:mode>)
	(@aarch64_<sur>dot_prod_lane<VNx4SI_ONLY:mode><VNx16QI_ONLY:mode>):
	...these new patterns.
	(SVE_WHILE_B, SVE_WHILE_B_X2, SVE_WHILE_C): New constants.  Add
	SVE_WHILE_B to existing while patterns.
	* config/aarch64/aarch64-sve2.md (@aarch64_sve_ptrue_c<BHSD_BITS>)
	(@aarch64_sve_pext<BHSD_BITS>, @aarch64_sve_pext<BHSD_BITS>x2)
	(@aarch64_sve_psel<BHSD_BITS>, *aarch64_sve_psel<BHSD_BITS>_plus)
	(@aarch64_sve_cntp_c<BHSD_BITS>, <frint_pattern><mode>2)
	(<optab><mode>3, *<optab><mode>3, @aarch64_sve_single_<optab><mode>)
	(@aarch64_sve_<sve_int_op><mode>): New patterns.
	(@aarch64_sve_single_<sve_int_op><mode>, @aarch64_sve_<su>clamp<mode>)
	(*aarch64_sve_<su>clamp<mode>_x, @aarch64_sve_<su>clamp_single<mode>)
	(@aarch64_sve_fclamp<mode>, *aarch64_sve_fclamp<mode>_x)
	(@aarch64_sve_fclamp_single<mode>, <optab><mode><v2xwide>2)
	(@aarch64_sve_<sur>dotvnx4sivnx8hi): New patterns.
	(@aarch64_sve_<maxmin_uns_op><mode>): Likewise.
	(*aarch64_sve_<maxmin_uns_op><mode>): Likewise.
	(@aarch64_sve_single_<maxmin_uns_op><mode>): Likewise.
	(aarch64_sve_fdotvnx4sfvnx8hf): Likewise.
	(aarch64_fdot_prod_lanevnx4sfvnx8hf): Likewise.
	(@aarch64_sve_<optab><VNx16QI_ONLY:mode><VNx16SI_ONLY:mode>): Likewise.
	(@aarch64_sve_<optab><VNx8HI_ONLY:mode><VNx8SI_ONLY:mode>): Likewise.
	(@aarch64_sve_<optab><VNx8HI_ONLY:mode><VNx8DI_ONLY:mode>): Likewise.
	(truncvnx8sf<mode>2, @aarch64_sve_cvtn<mode>): Likewise.
	(<optab><v_int_equiv><mode>2, <optab><mode><v_int_equiv>2): Likewise.
	(@aarch64_sve_sel<mode>): Likewise.
	(@aarch64_sve_while<while_optab_cmp>_b<BHSD_BITS>_x2): Likewise.
	(@aarch64_sve_while<while_optab_cmp>_c<BHSD_BITS>): Likewise.
	(@aarch64_pred_<optab><mode>, @cond_<optab><mode>): Likewise.
	(@aarch64_sve_<optab><mode>): Likewise.
	* config/aarch64/aarch64-sme.md (@aarch64_sme_<optab><mode><mode>)
	(*aarch64_sme_<optab><mode><mode>_plus, @aarch64_sme_read<mode>)
	(*aarch64_sme_read<mode>_plus, @aarch64_sme_write<mode>): New patterns.
	(*aarch64_sme_write<mode>_plus aarch64_sme_zero_zt0): Likewise.
	(@aarch64_sme_<optab><mode>, *aarch64_sme_<optab><mode>_plus)
	(@aarch64_sme_single_<optab><mode>): Likewise.
	(*aarch64_sme_single_<optab><mode>_plus): Likewise.
	(@aarch64_sme_<optab><SME_ZA_SDI:mode><SME_ZA_BHIx24:mode>)
	(*aarch64_sme_<optab><SME_ZA_SDI:mode><SME_ZA_BHIx24:mode>_plus)
	(@aarch64_sme_single_<optab><SME_ZA_SDI:mode><SME_ZA_BHIx24:mode>)
	(*aarch64_sme_single_<optab><SME_ZA_SDI:mode><SME_ZA_BHIx24:mode>_plus)
	(@aarch64_sme_single_sudot<VNx4SI_ONLY:mode><SME_ZA_BIx24:mode>)
	(*aarch64_sme_single_sudot<VNx4SI_ONLY:mode><SME_ZA_BIx24:mode>_plus)
	(@aarch64_sme_lane_<optab><SME_ZA_SDI:mode><SME_ZA_BHIx24:mode>)
	(*aarch64_sme_lane_<optab><SME_ZA_SDI:mode><SME_ZA_BHIx24:mode>_plus)
	(@aarch64_sme_<optab><VNx4SI_ONLY:mode><SVE_FULL_BHI:mode>)
	(*aarch64_sme_<optab><VNx4SI_ONLY:mode><SVE_FULL_BHI:mode>_plus)
	(@aarch64_sme_<optab><VNx4SI_ONLY:mode><SME_ZA_BHIx24:mode>)
	(*aarch64_sme_<optab><VNx4SI_ONLY:mode><SME_ZA_BHIx24:mode>_plus)
	(@aarch64_sme_single_<optab><VNx4SI_ONLY:mode><SME_ZA_BHIx24:mode>)
	(*aarch64_sme_single_<optab><VNx4SI_ONLY:mode><SME_ZA_BHIx24:mode>_plus)
	(@aarch64_sme_lane_<optab><VNx4SI_ONLY:mode><SME_ZA_BHIx124:mode>)
	(*aarch64_sme_lane_<optab><VNx4SI_ONLY:mode><SME_ZA_BHIx124:mode>)
	(@aarch64_sme_<optab><VNx2DI_ONLY:mode><VNx8HI_ONLY:mode>)
	(*aarch64_sme_<optab><VNx2DI_ONLY:mode><VNx8HI_ONLY:mode>_plus)
	(@aarch64_sme_<optab><VNx2DI_ONLY:mode><SME_ZA_HIx24:mode>)
	(*aarch64_sme_<optab><VNx2DI_ONLY:mode><SME_ZA_HIx24:mode>_plus)
	(@aarch64_sme_single_<optab><VNx2DI_ONLY:mode><SME_ZA_HIx24:mode>)
	(*aarch64_sme_single_<optab><VNx2DI_ONLY:mode><SME_ZA_HIx24:mode>_plus)
	(@aarch64_sme_lane_<optab><VNx2DI_ONLY:mode><SME_ZA_HIx124:mode>)
	(*aarch64_sme_lane_<optab><VNx2DI_ONLY:mode><SME_ZA_HIx124:mode>)
	(@aarch64_sme_<optab><VNx4SI_ONLY:mode><VNx8HI_ONLY:mode>)
	(@aarch64_sme_<optab><VNx4SI_ONLY:mode><VNx4SI_ONLY:mode>)
	(@aarch64_sme_<optab><VNx4SI_ONLY:mode><SME_ZA_HFx24:mode>)
	(*aarch64_sme_<optab><VNx4SI_ONLY:mode><SME_ZA_HFx24:mode>_plus)
	(@aarch64_sme_single_<optab><VNx4SI_ONLY:mode><SME_ZA_HFx24:mode>)
	(*aarch64_sme_single_<optab><VNx4SI_ONLY:mode><SME_ZA_HFx24:mode>_plus)
	(@aarch64_sme_lane_<optab><VNx4SI_ONLY:mode><SME_ZA_HFx24:mode>)
	(*aarch64_sme_lane_<optab><VNx4SI_ONLY:mode><SME_ZA_HFx24:mode>_plus)
	(@aarch64_sme_<optab><SME_ZA_SDF_I:mode><SME_ZA_SDFx24:mode>)
	(*aarch64_sme_<optab><SME_ZA_SDF_I:mode><SME_ZA_SDFx24:mode>_plus)
	(@aarch64_sme_single_<optab><SME_ZA_SDF_I:mode><SME_ZA_SDFx24:mode>)
	(*aarch64_sme_single_<optab><SME_ZA_SDF_I:mode><SME_ZA_SDFx24:mode>_plus)
	(@aarch64_sme_lane_<optab><SME_ZA_SDF_I:mode><SME_ZA_SDFx24:mode>)
	(*aarch64_sme_lane_<optab><SME_ZA_SDF_I:mode><SME_ZA_SDFx24:mode>)
	(@aarch64_sme_<optab><VNx4SI_ONLY:mode><SVE_FULL_HF:mode>)
	(*aarch64_sme_<optab><VNx4SI_ONLY:mode><SVE_FULL_HF:mode>_plus)
	(@aarch64_sme_lane_<optab><VNx4SI_ONLY:mode><SME_ZA_HFx124:mode>)
	(*aarch64_sme_lane_<optab><VNx4SI_ONLY:mode><SME_ZA_HFx124:mode>)
	(@aarch64_sme_lut<LUTI_BITS><mode>): Likewise.
	(UNSPEC_SME_LUTI): New unspec.
	* config/aarch64/aarch64-sve-builtins.def (single): New mode suffix.
	(c8, c16, c32, c64): New type suffixes.
	(vg1x2, vg1x4, vg2, vg2x1, vg2x2, vg2x4, vg4, vg4x1, vg4x2)
	(vg4x4): New group suffixes.
	* config/aarch64/aarch64-sve-builtins.h (CP_READ_ZT0)
	(CP_WRITE_ZT0): New constants.
	(get_svbool_t): Delete.
	(function_resolver::infer_predicate_type): New member function.
	(function_resolver::infer_vector_or_tuple_type): Add an is_opt_single
	parameter.
	(function_resolver::require_matching_vector_type): Add a was_tuple
	parameter.
	(function_resolver::require_nonscalar_type): New member function.
	(function_resolver::finish_opt_single_resolution): Likewise.
	(function_expander::map_to_rtx_codes): Add an extra parameter
	for unconditional FP unspecs.
	(function_instance::gp_type_index): New member function.
	(function_instance::gp_type): Likewise.
	(function_instance::gp_mode): Handle multi-vector operations.
	* config/aarch64/aarch64-sve-builtins.cc (TYPES_all_count)
	(TYPES_all_pred_count, TYPES_c, TYPES_bhs_data, TYPES_bhs_widen)
	(TYPES_hs_data, TYPES_cvt_h_s_float, TYPES_cvt_s_s, TYPES_qcvt_x2)
	(TYPES_qcvt_x4, TYPES_qrshr_x2, TYPES_qrshru_x2, TYPES_qrshr_x4)
	(TYPES_qrshru_x4, TYPES_while_x, TYPES_while_x_c, TYPES_s_narrow_fsu)
	(TYPES_za_s_b_signed, TYPES_za_s_b_unsigned, TYPES_za_s_b_integer)
	(TYPES_za_s_h_integer, TYPES_za_s_h_data, TYPES_za_s_unsigned)
	(TYPES_za_s_float, TYPES_za_s_data, TYPES_za_d_h_integer): New type
	macros.
	(groups_x2, groups_x12, groups_x4, groups_x24, groups_x124)
	(groups_vg1x2, groups_vg1x4, groups_vg1x24, groups_vg2, groups_vg4)
	(groups_vg24): New group arrays.
	(function_instance::reads_global_state_p): Handle CP_READ_ZT0.
	(function_instance::modifies_global_state_p): Handle CP_WRITE_ZT0.
	(add_shared_state_attribute): Handle zt0 state.
	(function_builder::add_overloaded_functions): Skip MODE_single
	for non-tuple groups.
	(function_resolver::resolve_to): Add a fallback error message for
	the general two-type case.
	(function_resolver::infer_predicate_type): New function.
	(function_resolver::infer_vector_or_tuple_type): Add an is_opt_single
	parameter.
	(function_resolver::require_matching_vector_type): Add a was_tuple
	parameter.
	(function_resolver::check_gp_argument): Use gp_type_index rather
	than hard-coding VECTOR_TYPE_svbool_t.
	(function_resolver::finish_opt_single_resolution): New function.
	(function_resolver::resolve_unary): Extend to handle tuples.
	(function_expander::direct_optab_handler): Likewise.
	(function_expander::use_pred_x_insn): Only add a strictness flag
	if the insn has an operand for it.
	(function_expander::map_to_rtx_codes): Take an unconditional
	FP unspec as an extra parameter.  Handle tuples and MODE_single.
	(function_expander::map_to_unspecs): Handle tuples and MODE_single.
	* config/aarch64/aarch64-sve-builtins-functions.h (read_zt0)
	(write_zt0): New typedefs.
	(full_width_access::memory_vector): Use the function's
	vectors_per_tuple.
	(rtx_code_function_base): Add an optional unconditional FP unspec.
	(rtx_code_function::expand): Update accordingly.
	(rtx_code_function_rotated::expand): Likewise.
	(unspec_based_function_exact_insn::expand): Use tuple_mode instead
	of vector_mode.
	(unspec_based_uncond_function): New typedef.
	(cond_or_uncond_unspec_function): New class.
	(sme_1mode_function::expand): Handle single forms.
	(sme_2mode_function_t): Likewise, adding a template parameter for them.
	(sme_2mode_function): Update accordingly.
	(sme_2mode_lane_function): New typedef.
	(multireg_permute): New class.
	(integer_conversion): Likewise.
	(while_comparison::expand): Handle svcount_t and svboolx2_t results.
	* config/aarch64/aarch64-sve-builtins-shapes.h
	(binary_int_opt_single_n, binary_opt_single_n, binary_single)
	(binary_za_slice_lane, binary_za_slice_int_opt_single)
	(binary_za_slice_opt_single, binary_za_slice_uint_opt_single)
	(binaryx, clamp, compare_scalar_count, count_pred_c)
	(dot_za_slice_int_lane, dot_za_slice_lane, dot_za_slice_uint_lane)
	(extract_pred, inherent_zt, ldr_zt, read_za, read_za_slice)
	(select_pred, shift_right_imm_narrowxn, storexn, str_zt)
	(unary_convertxn, unary_za_slice, unaryxn, write_za)
	(write_za_slice): Declare.
	* config/aarch64/aarch64-sve-builtins-shapes.cc
	(za_group_is_pure_overload): New function.
	(apply_predication): Use the function's gp_type for the predicate,
	instead of hard-coding the use of svbool_t.
	(parse_element_type): Add support for "c" (svcount_t).
	(parse_type): Add support for "c0" and "c1" (conversion destination
	and source types).
	(binary_za_slice_lane_base): New class.
	(binary_za_slice_opt_single_base): Likewise.
	(load_contiguous_base::resolve): Pass the group suffix to r.resolve.
	(luti_lane_zt_base): New class.
	(binary_int_opt_single_n, binary_opt_single_n, binary_single)
	(binary_za_slice_lane, binary_za_slice_int_opt_single)
	(binary_za_slice_opt_single, binary_za_slice_uint_opt_single)
	(binaryx, clamp): New shapes.
	(compare_scalar_def::build): Allow the return type to be a tuple.
	(compare_scalar_def::expand): Pass the group suffix to r.resolve.
	(compare_scalar_count, count_pred_c, dot_za_slice_int_lane)
	(dot_za_slice_lane, dot_za_slice_uint_lane, extract_pred, inherent_zt)
	(ldr_zt, read_za, read_za_slice, select_pred, shift_right_imm_narrowxn)
	(storexn, str_zt): New shapes.
	(ternary_qq_lane_def, ternary_qq_opt_n_def): Replace with...
	(ternary_qq_or_011_lane_def, ternary_qq_opt_n_or_011_def): ...these
	new classes.  Allow a second suffix that specifies the type of the
	second vector argument, and that is used to derive the third.
	(unary_def::build): Extend to handle tuple types.
	(unary_convert_def::build): Use the new c0 and c1 format specifiers.
	(unary_convertxn, unary_za_slice, unaryxn, write_za): New shapes.
	(write_za_slice): Likewise.
	* config/aarch64/aarch64-sve-builtins-base.cc (svbic_impl::expand)
	(svext_bhw_impl::expand): Update call to map_to_rtx_codes.
	(svcntp_impl::expand): Handle svcount_t variants.
	(svcvt_impl::expand): Handle unpredicated conversions separately,
	dealing with tuples.
	(svdot_impl::expand): Handle 2-way dot products.
	(svdotprod_lane_impl::expand): Likewise.
	(svld1_impl::fold): Punt on tuple loads.
	(svld1_impl::expand): Handle tuple loads.
	(svldnt1_impl::expand): Likewise.
	(svpfalse_impl::fold): Punt on svcount_t forms.
	(svptrue_impl::fold): Likewise.
	(svptrue_impl::expand): Handle svcount_t forms.
	(svrint_impl): New class.
	(svsel_impl::fold): Punt on tuple forms.
	(svsel_impl::expand): Handle tuple forms.
	(svst1_impl::fold): Punt on tuple stores.
	(svst1_impl::expand): Handle tuple stores.
	(svstnt1_impl::expand): Likewise.
	(svwhilelx_impl::fold): Punt on tuple forms.
	(svdot_lane): Use UNSPEC_FDOT.
	(svmax, svmaxnm, svmin, svminnm): Add unconditional FP unspecs.
	(rinta, rinti, rintm, rintn, rintp, rintx, rintz): Use svrint_impl.
	* config/aarch64/aarch64-sve-builtins-base.def (svcvt): Use
	unary_convertxn.
	(svdot): Use ternary_qq_opt_n_or_011.
	(svdot_lane): Use ternary_qq_or_011_lane.
	(svmax, svmaxnm, svmin, svminnm): Use binary_opt_single_n.
	(svpfalse): Add a form that returns svcount_t results.
	(svrinta, svrintm, svrintn, svrintp): Use unaryxn.
	(svsel): Use binaryxn.
	(svst1, svstnt1): Use storexn.
	* config/aarch64/aarch64-sve-builtins-sme.h
	(svadd_za, svadd_write_za, svbmopa_za, svbmops_za, svdot_za)
	(svdot_lane_za, svldr_zt, svluti2_lane_zt, svluti4_lane_zt)
	(svmla_za, svmla_lane_za, svmls_za, svmls_lane_za, svread_za)
	(svstr_zt, svsub_za, svsub_write_za, svsudot_za, svsudot_lane_za)
	(svsuvdot_lane_za, svusdot_za, svusdot_lane_za, svusvdot_lane_za)
	(svvdot_lane_za, svwrite_za, svzero_zt): Declare.
	* config/aarch64/aarch64-sve-builtins-sme.cc (load_store_za_base):
	Rename to...
	(load_store_za_zt0_base): ...this and extend to tuples.
	(load_za_base, store_za_base): Update accordingly.
	(expand_ldr_str_zt0): New function.
	(svldr_zt_impl, svluti_lane_zt_impl, svread_za_impl, svstr_zt_impl)
	(svsudot_za_impl, svwrite_za_impl, svzero_zt_impl): New classes.
	(svadd_za, svadd_write_za, svbmopa_za, svbmops_za, svdot_za)
	(svdot_lane_za, svldr_zt, svluti2_lane_zt, svluti4_lane_zt)
	(svmla_za, svmla_lane_za, svmls_za, svmls_lane_za, svread_za)
	(svstr_zt, svsub_za, svsub_write_za, svsudot_za, svsudot_lane_za)
	(svsuvdot_lane_za, svusdot_za, svusdot_lane_za, svusvdot_lane_za)
	(svvdot_lane_za, svwrite_za, svzero_zt): New functions.
	* config/aarch64/aarch64-sve-builtins-sme.def: Add SME2 intrinsics.
	* config/aarch64/aarch64-sve-builtins-sve2.h
	(svbfmlslb, svbfmlslb_lane, svbfmlslt, svbfmlslt_lane, svclamp)
	(svcvtn, svpext, svpsel, svqcvt, svqcvtn, svqrshr, svqrshrn)
	(svqrshru, svqrshrun, svrevd, svunpk, svuzp, svuzpq, svzip)
	(svzipq): Declare.
	* config/aarch64/aarch64-sve-builtins-sve2.cc (svclamp_impl)
	(svcvtn_impl, svpext_impl, svpsel_impl): New classes.
	(svqrshl_impl::fold): Update for change to svrshl shape.
	(svrshl_impl::fold): Punt on tuple forms.
	(svsqadd_impl::expand): Update call to map_to_rtx_codes.
	(svunpk_impl): New class.
	(svbfmlslb, svbfmlslb_lane, svbfmlslt, svbfmlslt_lane, svclamp)
	(svcvtn, svpext, svpsel, svqcvt, svqcvtn, svqrshr, svqrshrn)
	(svqrshru, svqrshrun, svrevd, svunpk, svuzp, svuzpq, svzip)
	(svzipq): New functions.
	* config/aarch64/aarch64-sve-builtins-sve2.def: Add SME2 intrinsics.
	* config/aarch64/aarch64-c.cc (aarch64_update_cpp_builtins): Define
	or undefine __ARM_FEATURE_SME2.
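
For reference, a minimal usage sketch of the new feature macro and
intrinsics listed above (not part of the patch; the spellings below
follow my reading of the SME2 ACLE and should be treated as
assumptions rather than as the committed API):

#include <arm_sme.h>

#ifdef __ARM_FEATURE_SME2
/* Usage sketch only: take the lanewise minimum of two vector pairs
   and accumulate the result into ZA, two slices per call (VGx2).  */
void
min_and_accumulate (uint32_t slice, svint32x2_t x, svint32x2_t y)
  __arm_streaming __arm_inout("za")
{
  svint32x2_t t = svmin_s32_x2 (x, y);  /* SME2 multi-vector SMIN.  */
  svadd_za32_s32_vg1x2 (slice, t);      /* multi-vector ADD to ZA.S.  */
}
#endif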

gcc/testsuite/
	* gcc.target/aarch64/sve/acle/asm/test_sve_acle.h: Provide a way
	for test functions to share ZT0.
	(ATTR): Update accordingly.
	(TEST_LOAD_COUNT, TEST_STORE_COUNT, TEST_PN, TEST_COUNT_PN)
	(TEST_EXTRACT_PN, TEST_SELECT_P, TEST_COMPARE_S_X2, TEST_COMPARE_S_C)
	(TEST_XN, TEST_XN_SINGLE, TEST_XN_SINGLE_Z15, TEST_XN_SINGLE_AWKWARD)
	(TEST_X2_NARROW, TEST_X4_NARROW): New macros.
	* gcc.target/aarch64/sve/acle/general-c/binary_za_m_1.c: Remove
	test for svmopa that becomes valid with SME2.
	* gcc.target/aarch64/sve/acle/general-c/store_1.c: Adjust error
	messages to account for svcount_t predication.
	* gcc.target/aarch64/sve/acle/general-c/store_2.c: Likewise.
	* gcc.target/aarch64/sve/acle/general-c/ternary_qq_lane_1.c: Adjust
	error messages to account for new SME2 variants.
	* gcc.target/aarch64/sve/acle/general-c/ternary_qq_opt_n_2.c: Likewise.
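
As background for the store_1.c/store_2.c adjustments above, a short
sketch of the svcount_t predication that the new error messages
account for, using the multi-vector contiguous loads and stores
(again assumed ACLE spellings, shown for illustration only):

#include <arm_sme.h>

#ifdef __ARM_FEATURE_SME2
/* Usage sketch only: copy two vectors' worth of int32_t data using a
   predicate-as-counter (svcount_t) instead of an svbool_t.  */
void
copy_two_vectors (int32_t *dst, const int32_t *src) __arm_streaming
{
  svcount_t pn = svptrue_c32 ();              /* all-true svcount_t.  */
  svint32x2_t data = svld1_s32_x2 (pn, src);  /* contiguous x2 load.  */
  svst1_s32_x2 (pn, dst, data);               /* contiguous x2 store.  */
}
#endif
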
---
 gcc/config/aarch64/aarch64-c.cc               |    1 +
 gcc/config/aarch64/aarch64-sme.md             | 1092 ++++++++++++++++-
 .../aarch64/aarch64-sve-builtins-base.cc      |  160 ++-
 .../aarch64/aarch64-sve-builtins-base.def     |   33 +-
 .../aarch64/aarch64-sve-builtins-functions.h  |  170 ++-
 .../aarch64/aarch64-sve-builtins-shapes.cc    |  897 +++++++++++++-
 .../aarch64/aarch64-sve-builtins-shapes.h     |   37 +-
 .../aarch64/aarch64-sve-builtins-sme.cc       |  175 ++-
 .../aarch64/aarch64-sve-builtins-sme.def      |  122 ++
 gcc/config/aarch64/aarch64-sve-builtins-sme.h |   28 +-
 .../aarch64/aarch64-sve-builtins-sve2.cc      |  107 +-
 .../aarch64/aarch64-sve-builtins-sve2.def     |   74 +-
 .../aarch64/aarch64-sve-builtins-sve2.h       |   21 +
 gcc/config/aarch64/aarch64-sve-builtins.cc    |  457 ++++++-
 gcc/config/aarch64/aarch64-sve-builtins.def   |   15 +
 gcc/config/aarch64/aarch64-sve-builtins.h     |   44 +-
 gcc/config/aarch64/aarch64-sve.md             |   98 +-
 gcc/config/aarch64/aarch64-sve2.md            |  703 +++++++++++
 gcc/config/aarch64/aarch64.cc                 |   27 +-
 gcc/config/aarch64/aarch64.h                  |   19 +-
 gcc/config/aarch64/aarch64.md                 |    4 +
 gcc/config/aarch64/atomics.md                 |    2 +-
 gcc/config/aarch64/constraints.md             |   26 +-
 gcc/config/aarch64/iterators.md               |  369 +++++-
 gcc/config/aarch64/predicates.md              |   27 +-
 .../aarch64/sme2/aarch64-sme2-acle-asm.exp    |   82 ++
 .../aarch64/sme/acle-asm/clamp_s16.c          |   42 +
 .../aarch64/sme/acle-asm/clamp_s32.c          |   42 +
 .../aarch64/sme/acle-asm/clamp_s64.c          |   42 +
 .../aarch64/sme/acle-asm/clamp_s8.c           |   42 +
 .../aarch64/sme/acle-asm/clamp_u16.c          |   42 +
 .../aarch64/sme/acle-asm/clamp_u32.c          |   42 +
 .../aarch64/sme/acle-asm/clamp_u64.c          |   42 +
 .../aarch64/sme/acle-asm/clamp_u8.c           |   42 +
 .../aarch64/sme/acle-asm/revd_bf16.c          |   76 ++
 .../aarch64/sme/acle-asm/revd_f16.c           |   76 ++
 .../aarch64/sme/acle-asm/revd_f32.c           |   76 ++
 .../aarch64/sme/acle-asm/revd_f64.c           |   76 ++
 .../aarch64/sme/acle-asm/revd_s16.c           |   76 ++
 .../aarch64/sme/acle-asm/revd_s32.c           |   76 ++
 .../aarch64/sme/acle-asm/revd_s64.c           |   76 ++
 .../gcc.target/aarch64/sme/acle-asm/revd_s8.c |   76 ++
 .../aarch64/sme/acle-asm/revd_u16.c           |   76 ++
 .../aarch64/sme/acle-asm/revd_u32.c           |   76 ++
 .../aarch64/sme/acle-asm/revd_u64.c           |   76 ++
 .../gcc.target/aarch64/sme/acle-asm/revd_u8.c |   76 ++
 .../gcc.target/aarch64/sme/clamp_1.c          |   38 +
 .../gcc.target/aarch64/sme/clamp_2.c          |   32 +
 .../gcc.target/aarch64/sme/clamp_3.c          |   26 +
 .../gcc.target/aarch64/sme/clamp_4.c          |   20 +
 .../aarch64/sme2/aarch64-sme2-acle-asm.exp    |   81 ++
 .../aarch64/sme2/acle-asm/add_s16_x2.c        |  115 ++
 .../aarch64/sme2/acle-asm/add_s16_x4.c        |  125 ++
 .../aarch64/sme2/acle-asm/add_s32_x2.c        |  115 ++
 .../aarch64/sme2/acle-asm/add_s32_x4.c        |  125 ++
 .../aarch64/sme2/acle-asm/add_s64_x2.c        |  115 ++
 .../aarch64/sme2/acle-asm/add_s64_x4.c        |  125 ++
 .../aarch64/sme2/acle-asm/add_s8_x2.c         |  115 ++
 .../aarch64/sme2/acle-asm/add_s8_x4.c         |  125 ++
 .../aarch64/sme2/acle-asm/add_u16_x2.c        |  115 ++
 .../aarch64/sme2/acle-asm/add_u16_x4.c        |  125 ++
 .../aarch64/sme2/acle-asm/add_u32_x2.c        |  115 ++
 .../aarch64/sme2/acle-asm/add_u32_x4.c        |  125 ++
 .../aarch64/sme2/acle-asm/add_u64_x2.c        |  115 ++
 .../aarch64/sme2/acle-asm/add_u64_x4.c        |  125 ++
 .../aarch64/sme2/acle-asm/add_u8_x2.c         |  115 ++
 .../aarch64/sme2/acle-asm/add_u8_x4.c         |  125 ++
 .../sme2/acle-asm/add_write_za32_s32_vg1x2.c  |  180 +++
 .../sme2/acle-asm/add_write_za32_s32_vg1x4.c  |  172 +++
 .../sme2/acle-asm/add_write_za32_u32_vg1x2.c  |  180 +++
 .../sme2/acle-asm/add_write_za32_u32_vg1x4.c  |  172 +++
 .../sme2/acle-asm/add_write_za64_s64_vg1x2.c  |  182 +++
 .../sme2/acle-asm/add_write_za64_s64_vg1x4.c  |  174 +++
 .../sme2/acle-asm/add_write_za64_u64_vg1x2.c  |  182 +++
 .../sme2/acle-asm/add_write_za64_u64_vg1x4.c  |  174 +++
 .../sme2/acle-asm/add_za32_f32_vg1x2.c        |  122 ++
 .../sme2/acle-asm/add_za32_f32_vg1x4.c        |  137 +++
 .../sme2/acle-asm/add_za32_s32_vg1x2.c        |  122 ++
 .../sme2/acle-asm/add_za32_s32_vg1x4.c        |  137 +++
 .../sme2/acle-asm/add_za32_u32_vg1x2.c        |  122 ++
 .../sme2/acle-asm/add_za32_u32_vg1x4.c        |  137 +++
 .../sme2/acle-asm/add_za64_f64_vg1x2.c        |  126 ++
 .../sme2/acle-asm/add_za64_f64_vg1x4.c        |  141 +++
 .../sme2/acle-asm/add_za64_s64_vg1x2.c        |  124 ++
 .../sme2/acle-asm/add_za64_s64_vg1x4.c        |  139 +++
 .../sme2/acle-asm/add_za64_u64_vg1x2.c        |  124 ++
 .../sme2/acle-asm/add_za64_u64_vg1x4.c        |  139 +++
 .../aarch64/sme2/acle-asm/bfmlslb_f32.c       |   65 +
 .../aarch64/sme2/acle-asm/bfmlslb_lane_f32.c  |   84 ++
 .../aarch64/sme2/acle-asm/bfmlslt_f32.c       |   65 +
 .../aarch64/sme2/acle-asm/bfmlslt_lane_f32.c  |   84 ++
 .../aarch64/sme2/acle-asm/bmopa_za32.c        |   30 +
 .../aarch64/sme2/acle-asm/bmops_za32.c        |   30 +
 .../aarch64/sme2/acle-asm/clamp_f16.c         |   42 +
 .../aarch64/sme2/acle-asm/clamp_f16_x2.c      |   94 ++
 .../aarch64/sme2/acle-asm/clamp_f16_x4.c      |  104 ++
 .../aarch64/sme2/acle-asm/clamp_f32.c         |   42 +
 .../aarch64/sme2/acle-asm/clamp_f32_x2.c      |   94 ++
 .../aarch64/sme2/acle-asm/clamp_f32_x4.c      |  104 ++
 .../aarch64/sme2/acle-asm/clamp_f64.c         |   42 +
 .../aarch64/sme2/acle-asm/clamp_f64_x2.c      |   94 ++
 .../aarch64/sme2/acle-asm/clamp_f64_x4.c      |  104 ++
 .../aarch64/sme2/acle-asm/clamp_s16_x2.c      |   94 ++
 .../aarch64/sme2/acle-asm/clamp_s16_x4.c      |  104 ++
 .../aarch64/sme2/acle-asm/clamp_s32_x2.c      |   94 ++
 .../aarch64/sme2/acle-asm/clamp_s32_x4.c      |  104 ++
 .../aarch64/sme2/acle-asm/clamp_s64_x2.c      |   94 ++
 .../aarch64/sme2/acle-asm/clamp_s64_x4.c      |  104 ++
 .../aarch64/sme2/acle-asm/clamp_s8_x2.c       |   94 ++
 .../aarch64/sme2/acle-asm/clamp_s8_x4.c       |  104 ++
 .../aarch64/sme2/acle-asm/clamp_u16_x2.c      |   94 ++
 .../aarch64/sme2/acle-asm/clamp_u16_x4.c      |  104 ++
 .../aarch64/sme2/acle-asm/clamp_u32_x2.c      |   94 ++
 .../aarch64/sme2/acle-asm/clamp_u32_x4.c      |  104 ++
 .../aarch64/sme2/acle-asm/clamp_u64_x2.c      |   94 ++
 .../aarch64/sme2/acle-asm/clamp_u64_x4.c      |  104 ++
 .../aarch64/sme2/acle-asm/clamp_u8_x2.c       |   94 ++
 .../aarch64/sme2/acle-asm/clamp_u8_x4.c       |  104 ++
 .../aarch64/sme2/acle-asm/cntp_c16.c          |   39 +
 .../aarch64/sme2/acle-asm/cntp_c32.c          |   39 +
 .../aarch64/sme2/acle-asm/cntp_c64.c          |   39 +
 .../aarch64/sme2/acle-asm/cntp_c8.c           |   39 +
 .../aarch64/sme2/acle-asm/cvt_bf16_f32_x2.c   |   50 +
 .../aarch64/sme2/acle-asm/cvt_f16_f32_x2.c    |   50 +
 .../aarch64/sme2/acle-asm/cvt_f32_s32_x2.c    |   43 +
 .../aarch64/sme2/acle-asm/cvt_f32_s32_x4.c    |   77 ++
 .../aarch64/sme2/acle-asm/cvt_f32_u32_x2.c    |   43 +
 .../aarch64/sme2/acle-asm/cvt_f32_u32_x4.c    |   77 ++
 .../aarch64/sme2/acle-asm/cvt_s32_f32_x2.c    |   43 +
 .../aarch64/sme2/acle-asm/cvt_s32_f32_x4.c    |   77 ++
 .../aarch64/sme2/acle-asm/cvt_u32_f32_x2.c    |   43 +
 .../aarch64/sme2/acle-asm/cvt_u32_f32_x4.c    |   77 ++
 .../aarch64/sme2/acle-asm/cvtn_bf16_f32_x2.c  |   50 +
 .../aarch64/sme2/acle-asm/cvtn_f16_f32_x2.c   |   50 +
 .../aarch64/sme2/acle-asm/dot_f32.c           |   44 +
 .../aarch64/sme2/acle-asm/dot_lane_f32.c      |   93 ++
 .../aarch64/sme2/acle-asm/dot_lane_s32.c      |   93 ++
 .../aarch64/sme2/acle-asm/dot_lane_u32.c      |   93 ++
 .../sme2/acle-asm/dot_lane_za32_bf16_vg1x2.c  |  102 ++
 .../sme2/acle-asm/dot_lane_za32_bf16_vg1x4.c  |  108 ++
 .../sme2/acle-asm/dot_lane_za32_f16_vg1x2.c   |  102 ++
 .../sme2/acle-asm/dot_lane_za32_f16_vg1x4.c   |  108 ++
 .../sme2/acle-asm/dot_lane_za32_s16_vg1x2.c   |  102 ++
 .../sme2/acle-asm/dot_lane_za32_s16_vg1x4.c   |  108 ++
 .../sme2/acle-asm/dot_lane_za32_s8_vg1x2.c    |  102 ++
 .../sme2/acle-asm/dot_lane_za32_s8_vg1x4.c    |  108 ++
 .../sme2/acle-asm/dot_lane_za32_u16_vg1x2.c   |  102 ++
 .../sme2/acle-asm/dot_lane_za32_u16_vg1x4.c   |  108 ++
 .../sme2/acle-asm/dot_lane_za32_u8_vg1x2.c    |  102 ++
 .../sme2/acle-asm/dot_lane_za32_u8_vg1x4.c    |  108 ++
 .../sme2/acle-asm/dot_lane_za64_s16_vg1x2.c   |  104 ++
 .../sme2/acle-asm/dot_lane_za64_s16_vg1x4.c   |  110 ++
 .../sme2/acle-asm/dot_lane_za64_u16_vg1x2.c   |  104 ++
 .../sme2/acle-asm/dot_lane_za64_u16_vg1x4.c   |  110 ++
 .../aarch64/sme2/acle-asm/dot_s32.c           |   44 +
 .../aarch64/sme2/acle-asm/dot_u32.c           |   44 +
 .../sme2/acle-asm/dot_za32_bf16_vg1x2.c       |  243 ++++
 .../sme2/acle-asm/dot_za32_bf16_vg1x4.c       |  254 ++++
 .../sme2/acle-asm/dot_za32_f16_vg1x2.c        |  243 ++++
 .../sme2/acle-asm/dot_za32_f16_vg1x4.c        |  254 ++++
 .../sme2/acle-asm/dot_za32_s16_vg1x2.c        |  243 ++++
 .../sme2/acle-asm/dot_za32_s16_vg1x4.c        |  254 ++++
 .../aarch64/sme2/acle-asm/dot_za32_s8_vg1x2.c |  243 ++++
 .../aarch64/sme2/acle-asm/dot_za32_s8_vg1x4.c |  254 ++++
 .../sme2/acle-asm/dot_za32_u16_vg1x2.c        |  243 ++++
 .../sme2/acle-asm/dot_za32_u16_vg1x4.c        |  254 ++++
 .../aarch64/sme2/acle-asm/dot_za32_u8_vg1x2.c |  243 ++++
 .../aarch64/sme2/acle-asm/dot_za32_u8_vg1x4.c |  254 ++++
 .../sme2/acle-asm/dot_za64_s16_vg1x2.c        |  245 ++++
 .../sme2/acle-asm/dot_za64_s16_vg1x4.c        |  256 ++++
 .../sme2/acle-asm/dot_za64_u16_vg1x2.c        |  245 ++++
 .../sme2/acle-asm/dot_za64_u16_vg1x4.c        |  256 ++++
 .../aarch64/sme2/acle-asm/ld1_bf16_x2.c       |  262 ++++
 .../aarch64/sme2/acle-asm/ld1_bf16_x4.c       |  354 ++++++
 .../aarch64/sme2/acle-asm/ld1_f16_x2.c        |  262 ++++
 .../aarch64/sme2/acle-asm/ld1_f16_x4.c        |  354 ++++++
 .../aarch64/sme2/acle-asm/ld1_f32_x2.c        |  262 ++++
 .../aarch64/sme2/acle-asm/ld1_f32_x4.c        |  354 ++++++
 .../aarch64/sme2/acle-asm/ld1_f64_x2.c        |  262 ++++
 .../aarch64/sme2/acle-asm/ld1_f64_x4.c        |  354 ++++++
 .../aarch64/sme2/acle-asm/ld1_s16_x2.c        |  262 ++++
 .../aarch64/sme2/acle-asm/ld1_s16_x4.c        |  354 ++++++
 .../aarch64/sme2/acle-asm/ld1_s32_x2.c        |  262 ++++
 .../aarch64/sme2/acle-asm/ld1_s32_x4.c        |  354 ++++++
 .../aarch64/sme2/acle-asm/ld1_s64_x2.c        |  262 ++++
 .../aarch64/sme2/acle-asm/ld1_s64_x4.c        |  354 ++++++
 .../aarch64/sme2/acle-asm/ld1_s8_x2.c         |  262 ++++
 .../aarch64/sme2/acle-asm/ld1_s8_x4.c         |  354 ++++++
 .../aarch64/sme2/acle-asm/ld1_u16_x2.c        |  262 ++++
 .../aarch64/sme2/acle-asm/ld1_u16_x4.c        |  354 ++++++
 .../aarch64/sme2/acle-asm/ld1_u32_x2.c        |  262 ++++
 .../aarch64/sme2/acle-asm/ld1_u32_x4.c        |  354 ++++++
 .../aarch64/sme2/acle-asm/ld1_u64_x2.c        |  262 ++++
 .../aarch64/sme2/acle-asm/ld1_u64_x4.c        |  354 ++++++
 .../aarch64/sme2/acle-asm/ld1_u8_x2.c         |  262 ++++
 .../aarch64/sme2/acle-asm/ld1_u8_x4.c         |  354 ++++++
 .../aarch64/sme2/acle-asm/ldnt1_bf16_x2.c     |  262 ++++
 .../aarch64/sme2/acle-asm/ldnt1_bf16_x4.c     |  354 ++++++
 .../aarch64/sme2/acle-asm/ldnt1_f16_x2.c      |  262 ++++
 .../aarch64/sme2/acle-asm/ldnt1_f16_x4.c      |  354 ++++++
 .../aarch64/sme2/acle-asm/ldnt1_f32_x2.c      |  262 ++++
 .../aarch64/sme2/acle-asm/ldnt1_f32_x4.c      |  354 ++++++
 .../aarch64/sme2/acle-asm/ldnt1_f64_x2.c      |  262 ++++
 .../aarch64/sme2/acle-asm/ldnt1_f64_x4.c      |  354 ++++++
 .../aarch64/sme2/acle-asm/ldnt1_s16_x2.c      |  262 ++++
 .../aarch64/sme2/acle-asm/ldnt1_s16_x4.c      |  354 ++++++
 .../aarch64/sme2/acle-asm/ldnt1_s32_x2.c      |  262 ++++
 .../aarch64/sme2/acle-asm/ldnt1_s32_x4.c      |  354 ++++++
 .../aarch64/sme2/acle-asm/ldnt1_s64_x2.c      |  262 ++++
 .../aarch64/sme2/acle-asm/ldnt1_s64_x4.c      |  354 ++++++
 .../aarch64/sme2/acle-asm/ldnt1_s8_x2.c       |  262 ++++
 .../aarch64/sme2/acle-asm/ldnt1_s8_x4.c       |  354 ++++++
 .../aarch64/sme2/acle-asm/ldnt1_u16_x2.c      |  262 ++++
 .../aarch64/sme2/acle-asm/ldnt1_u16_x4.c      |  354 ++++++
 .../aarch64/sme2/acle-asm/ldnt1_u32_x2.c      |  262 ++++
 .../aarch64/sme2/acle-asm/ldnt1_u32_x4.c      |  354 ++++++
 .../aarch64/sme2/acle-asm/ldnt1_u64_x2.c      |  262 ++++
 .../aarch64/sme2/acle-asm/ldnt1_u64_x4.c      |  354 ++++++
 .../aarch64/sme2/acle-asm/ldnt1_u8_x2.c       |  262 ++++
 .../aarch64/sme2/acle-asm/ldnt1_u8_x4.c       |  354 ++++++
 .../gcc.target/aarch64/sme2/acle-asm/ldr_zt.c |   36 +
 .../aarch64/sme2/acle-asm/luti2_bf16.c        |   48 +
 .../aarch64/sme2/acle-asm/luti2_bf16_x2.c     |   50 +
 .../aarch64/sme2/acle-asm/luti2_bf16_x4.c     |   56 +
 .../aarch64/sme2/acle-asm/luti2_f16.c         |   48 +
 .../aarch64/sme2/acle-asm/luti2_f16_x2.c      |   50 +
 .../aarch64/sme2/acle-asm/luti2_f16_x4.c      |   56 +
 .../aarch64/sme2/acle-asm/luti2_f32.c         |   48 +
 .../aarch64/sme2/acle-asm/luti2_f32_x2.c      |   50 +
 .../aarch64/sme2/acle-asm/luti2_f32_x4.c      |   56 +
 .../aarch64/sme2/acle-asm/luti2_s16.c         |   48 +
 .../aarch64/sme2/acle-asm/luti2_s16_x2.c      |   50 +
 .../aarch64/sme2/acle-asm/luti2_s16_x4.c      |   56 +
 .../aarch64/sme2/acle-asm/luti2_s32.c         |   48 +
 .../aarch64/sme2/acle-asm/luti2_s32_x2.c      |   50 +
 .../aarch64/sme2/acle-asm/luti2_s32_x4.c      |   56 +
 .../aarch64/sme2/acle-asm/luti2_s8.c          |   48 +
 .../aarch64/sme2/acle-asm/luti2_s8_x2.c       |   50 +
 .../aarch64/sme2/acle-asm/luti2_s8_x4.c       |   56 +
 .../aarch64/sme2/acle-asm/luti2_u16.c         |   48 +
 .../aarch64/sme2/acle-asm/luti2_u16_x2.c      |   50 +
 .../aarch64/sme2/acle-asm/luti2_u16_x4.c      |   56 +
 .../aarch64/sme2/acle-asm/luti2_u32.c         |   48 +
 .../aarch64/sme2/acle-asm/luti2_u32_x2.c      |   50 +
 .../aarch64/sme2/acle-asm/luti2_u32_x4.c      |   56 +
 .../aarch64/sme2/acle-asm/luti2_u8.c          |   48 +
 .../aarch64/sme2/acle-asm/luti2_u8_x2.c       |   50 +
 .../aarch64/sme2/acle-asm/luti2_u8_x4.c       |   56 +
 .../aarch64/sme2/acle-asm/luti4_bf16.c        |   48 +
 .../aarch64/sme2/acle-asm/luti4_bf16_x2.c     |   50 +
 .../aarch64/sme2/acle-asm/luti4_bf16_x4.c     |   56 +
 .../aarch64/sme2/acle-asm/luti4_f16.c         |   48 +
 .../aarch64/sme2/acle-asm/luti4_f16_x2.c      |   50 +
 .../aarch64/sme2/acle-asm/luti4_f16_x4.c      |   56 +
 .../aarch64/sme2/acle-asm/luti4_f32.c         |   48 +
 .../aarch64/sme2/acle-asm/luti4_f32_x2.c      |   50 +
 .../aarch64/sme2/acle-asm/luti4_f32_x4.c      |   56 +
 .../aarch64/sme2/acle-asm/luti4_s16.c         |   48 +
 .../aarch64/sme2/acle-asm/luti4_s16_x2.c      |   50 +
 .../aarch64/sme2/acle-asm/luti4_s16_x4.c      |   56 +
 .../aarch64/sme2/acle-asm/luti4_s32.c         |   48 +
 .../aarch64/sme2/acle-asm/luti4_s32_x2.c      |   50 +
 .../aarch64/sme2/acle-asm/luti4_s32_x4.c      |   56 +
 .../aarch64/sme2/acle-asm/luti4_s8.c          |   48 +
 .../aarch64/sme2/acle-asm/luti4_s8_x2.c       |   50 +
 .../aarch64/sme2/acle-asm/luti4_u16.c         |   48 +
 .../aarch64/sme2/acle-asm/luti4_u16_x2.c      |   50 +
 .../aarch64/sme2/acle-asm/luti4_u16_x4.c      |   56 +
 .../aarch64/sme2/acle-asm/luti4_u32.c         |   48 +
 .../aarch64/sme2/acle-asm/luti4_u32_x2.c      |   50 +
 .../aarch64/sme2/acle-asm/luti4_u32_x4.c      |   56 +
 .../aarch64/sme2/acle-asm/luti4_u8.c          |   48 +
 .../aarch64/sme2/acle-asm/luti4_u8_x2.c       |   50 +
 .../aarch64/sme2/acle-asm/max_f16_x2.c        |  207 ++++
 .../aarch64/sme2/acle-asm/max_f16_x4.c        |  249 ++++
 .../aarch64/sme2/acle-asm/max_f32_x2.c        |  207 ++++
 .../aarch64/sme2/acle-asm/max_f32_x4.c        |  249 ++++
 .../aarch64/sme2/acle-asm/max_f64_x2.c        |  207 ++++
 .../aarch64/sme2/acle-asm/max_f64_x4.c        |  249 ++++
 .../aarch64/sme2/acle-asm/max_s16_x2.c        |  207 ++++
 .../aarch64/sme2/acle-asm/max_s16_x4.c        |  249 ++++
 .../aarch64/sme2/acle-asm/max_s32_x2.c        |  207 ++++
 .../aarch64/sme2/acle-asm/max_s32_x4.c        |  249 ++++
 .../aarch64/sme2/acle-asm/max_s64_x2.c        |  207 ++++
 .../aarch64/sme2/acle-asm/max_s64_x4.c        |  249 ++++
 .../aarch64/sme2/acle-asm/max_s8_x2.c         |  207 ++++
 .../aarch64/sme2/acle-asm/max_s8_x4.c         |  249 ++++
 .../aarch64/sme2/acle-asm/max_u16_x2.c        |  207 ++++
 .../aarch64/sme2/acle-asm/max_u16_x4.c        |  249 ++++
 .../aarch64/sme2/acle-asm/max_u32_x2.c        |  207 ++++
 .../aarch64/sme2/acle-asm/max_u32_x4.c        |  249 ++++
 .../aarch64/sme2/acle-asm/max_u64_x2.c        |  207 ++++
 .../aarch64/sme2/acle-asm/max_u64_x4.c        |  249 ++++
 .../aarch64/sme2/acle-asm/max_u8_x2.c         |  207 ++++
 .../aarch64/sme2/acle-asm/max_u8_x4.c         |  249 ++++
 .../aarch64/sme2/acle-asm/maxnm_f16_x2.c      |  207 ++++
 .../aarch64/sme2/acle-asm/maxnm_f16_x4.c      |  249 ++++
 .../aarch64/sme2/acle-asm/maxnm_f32_x2.c      |  207 ++++
 .../aarch64/sme2/acle-asm/maxnm_f32_x4.c      |  249 ++++
 .../aarch64/sme2/acle-asm/maxnm_f64_x2.c      |  207 ++++
 .../aarch64/sme2/acle-asm/maxnm_f64_x4.c      |  249 ++++
 .../aarch64/sme2/acle-asm/min_f16_x2.c        |  207 ++++
 .../aarch64/sme2/acle-asm/min_f16_x4.c        |  249 ++++
 .../aarch64/sme2/acle-asm/min_f32_x2.c        |  207 ++++
 .../aarch64/sme2/acle-asm/min_f32_x4.c        |  249 ++++
 .../aarch64/sme2/acle-asm/min_f64_x2.c        |  207 ++++
 .../aarch64/sme2/acle-asm/min_f64_x4.c        |  249 ++++
 .../aarch64/sme2/acle-asm/min_s16_x2.c        |  207 ++++
 .../aarch64/sme2/acle-asm/min_s16_x4.c        |  249 ++++
 .../aarch64/sme2/acle-asm/min_s32_x2.c        |  207 ++++
 .../aarch64/sme2/acle-asm/min_s32_x4.c        |  249 ++++
 .../aarch64/sme2/acle-asm/min_s64_x2.c        |  207 ++++
 .../aarch64/sme2/acle-asm/min_s64_x4.c        |  249 ++++
 .../aarch64/sme2/acle-asm/min_s8_x2.c         |  207 ++++
 .../aarch64/sme2/acle-asm/min_s8_x4.c         |  249 ++++
 .../aarch64/sme2/acle-asm/min_u16_x2.c        |  207 ++++
 .../aarch64/sme2/acle-asm/min_u16_x4.c        |  249 ++++
 .../aarch64/sme2/acle-asm/min_u32_x2.c        |  207 ++++
 .../aarch64/sme2/acle-asm/min_u32_x4.c        |  249 ++++
 .../aarch64/sme2/acle-asm/min_u64_x2.c        |  207 ++++
 .../aarch64/sme2/acle-asm/min_u64_x4.c        |  249 ++++
 .../aarch64/sme2/acle-asm/min_u8_x2.c         |  207 ++++
 .../aarch64/sme2/acle-asm/min_u8_x4.c         |  249 ++++
 .../aarch64/sme2/acle-asm/minnm_f16_x2.c      |  207 ++++
 .../aarch64/sme2/acle-asm/minnm_f16_x4.c      |  249 ++++
 .../aarch64/sme2/acle-asm/minnm_f32_x2.c      |  207 ++++
 .../aarch64/sme2/acle-asm/minnm_f32_x4.c      |  249 ++++
 .../aarch64/sme2/acle-asm/minnm_f64_x2.c      |  207 ++++
 .../aarch64/sme2/acle-asm/minnm_f64_x4.c      |  249 ++++
 .../sme2/acle-asm/mla_lane_za32_bf16_vg2x1.c  |  148 +++
 .../sme2/acle-asm/mla_lane_za32_bf16_vg2x2.c  |  112 ++
 .../sme2/acle-asm/mla_lane_za32_bf16_vg2x4.c  |  118 ++
 .../sme2/acle-asm/mla_lane_za32_f16_vg2x1.c   |  148 +++
 .../sme2/acle-asm/mla_lane_za32_f16_vg2x2.c   |  112 ++
 .../sme2/acle-asm/mla_lane_za32_f16_vg2x4.c   |  118 ++
 .../sme2/acle-asm/mla_lane_za32_f32_vg1x2.c   |  102 ++
 .../sme2/acle-asm/mla_lane_za32_f32_vg1x4.c   |  108 ++
 .../sme2/acle-asm/mla_lane_za32_s16_vg2x1.c   |  148 +++
 .../sme2/acle-asm/mla_lane_za32_s16_vg2x2.c   |  112 ++
 .../sme2/acle-asm/mla_lane_za32_s16_vg2x4.c   |  118 ++
 .../sme2/acle-asm/mla_lane_za32_s8_vg4x1.c    |  150 +++
 .../sme2/acle-asm/mla_lane_za32_s8_vg4x2.c    |  122 ++
 .../sme2/acle-asm/mla_lane_za32_s8_vg4x4.c    |  128 ++
 .../sme2/acle-asm/mla_lane_za32_u16_vg2x1.c   |  148 +++
 .../sme2/acle-asm/mla_lane_za32_u16_vg2x2.c   |  112 ++
 .../sme2/acle-asm/mla_lane_za32_u16_vg2x4.c   |  118 ++
 .../sme2/acle-asm/mla_lane_za32_u8_vg4x1.c    |  150 +++
 .../sme2/acle-asm/mla_lane_za32_u8_vg4x2.c    |  122 ++
 .../sme2/acle-asm/mla_lane_za32_u8_vg4x4.c    |  128 ++
 .../sme2/acle-asm/mla_lane_za64_f64_vg1x2.c   |  104 ++
 .../sme2/acle-asm/mla_lane_za64_f64_vg1x4.c   |  110 ++
 .../sme2/acle-asm/mla_lane_za64_s16_vg4x1.c   |  152 +++
 .../sme2/acle-asm/mla_lane_za64_s16_vg4x2.c   |  124 ++
 .../sme2/acle-asm/mla_lane_za64_s16_vg4x4.c   |  130 ++
 .../sme2/acle-asm/mla_lane_za64_u16_vg4x1.c   |  152 +++
 .../sme2/acle-asm/mla_lane_za64_u16_vg4x2.c   |  124 ++
 .../sme2/acle-asm/mla_lane_za64_u16_vg4x4.c   |  130 ++
 .../sme2/acle-asm/mla_za32_bf16_vg2x1.c       |  148 +++
 .../sme2/acle-asm/mla_za32_bf16_vg2x2.c       |  247 ++++
 .../sme2/acle-asm/mla_za32_bf16_vg2x4.c       |  258 ++++
 .../sme2/acle-asm/mla_za32_f16_vg2x1.c        |  148 +++
 .../sme2/acle-asm/mla_za32_f16_vg2x2.c        |  247 ++++
 .../sme2/acle-asm/mla_za32_f16_vg2x4.c        |  258 ++++
 .../sme2/acle-asm/mla_za32_f32_vg1x2.c        |  180 +++
 .../sme2/acle-asm/mla_za32_f32_vg1x4.c        |  172 +++
 .../sme2/acle-asm/mla_za32_s16_vg2x1.c        |  148 +++
 .../sme2/acle-asm/mla_za32_s16_vg2x2.c        |  247 ++++
 .../sme2/acle-asm/mla_za32_s16_vg2x4.c        |  258 ++++
 .../aarch64/sme2/acle-asm/mla_za32_s8_vg4x1.c |  149 +++
 .../aarch64/sme2/acle-asm/mla_za32_s8_vg4x2.c |  249 ++++
 .../aarch64/sme2/acle-asm/mla_za32_s8_vg4x4.c |  260 ++++
 .../sme2/acle-asm/mla_za32_u16_vg2x1.c        |  148 +++
 .../sme2/acle-asm/mla_za32_u16_vg2x2.c        |  247 ++++
 .../sme2/acle-asm/mla_za32_u16_vg2x4.c        |  258 ++++
 .../aarch64/sme2/acle-asm/mla_za32_u8_vg4x1.c |  149 +++
 .../aarch64/sme2/acle-asm/mla_za32_u8_vg4x2.c |  249 ++++
 .../aarch64/sme2/acle-asm/mla_za32_u8_vg4x4.c |  260 ++++
 .../sme2/acle-asm/mla_za64_f64_vg1x2.c        |  182 +++
 .../sme2/acle-asm/mla_za64_f64_vg1x4.c        |  174 +++
 .../sme2/acle-asm/mla_za64_s16_vg4x1.c        |  151 +++
 .../sme2/acle-asm/mla_za64_s16_vg4x2.c        |  251 ++++
 .../sme2/acle-asm/mla_za64_s16_vg4x4.c        |  262 ++++
 .../sme2/acle-asm/mla_za64_u16_vg4x1.c        |  151 +++
 .../sme2/acle-asm/mla_za64_u16_vg4x2.c        |  251 ++++
 .../sme2/acle-asm/mla_za64_u16_vg4x4.c        |  262 ++++
 .../sme2/acle-asm/mls_lane_za32_bf16_vg2x1.c  |  148 +++
 .../sme2/acle-asm/mls_lane_za32_bf16_vg2x2.c  |  112 ++
 .../sme2/acle-asm/mls_lane_za32_bf16_vg2x4.c  |  118 ++
 .../sme2/acle-asm/mls_lane_za32_f16_vg2x1.c   |  148 +++
 .../sme2/acle-asm/mls_lane_za32_f16_vg2x2.c   |  112 ++
 .../sme2/acle-asm/mls_lane_za32_f16_vg2x4.c   |  118 ++
 .../sme2/acle-asm/mls_lane_za32_f32_vg1x2.c   |  102 ++
 .../sme2/acle-asm/mls_lane_za32_f32_vg1x4.c   |  108 ++
 .../sme2/acle-asm/mls_lane_za32_s16_vg2x1.c   |  148 +++
 .../sme2/acle-asm/mls_lane_za32_s16_vg2x2.c   |  112 ++
 .../sme2/acle-asm/mls_lane_za32_s16_vg2x4.c   |  118 ++
 .../sme2/acle-asm/mls_lane_za32_s8_vg4x1.c    |  150 +++
 .../sme2/acle-asm/mls_lane_za32_s8_vg4x2.c    |  122 ++
 .../sme2/acle-asm/mls_lane_za32_s8_vg4x4.c    |  128 ++
 .../sme2/acle-asm/mls_lane_za32_u16_vg2x1.c   |  148 +++
 .../sme2/acle-asm/mls_lane_za32_u16_vg2x2.c   |  112 ++
 .../sme2/acle-asm/mls_lane_za32_u16_vg2x4.c   |  118 ++
 .../sme2/acle-asm/mls_lane_za32_u8_vg4x1.c    |  150 +++
 .../sme2/acle-asm/mls_lane_za32_u8_vg4x2.c    |  122 ++
 .../sme2/acle-asm/mls_lane_za32_u8_vg4x4.c    |  128 ++
 .../sme2/acle-asm/mls_lane_za64_f64_vg1x2.c   |  104 ++
 .../sme2/acle-asm/mls_lane_za64_f64_vg1x4.c   |  110 ++
 .../sme2/acle-asm/mls_lane_za64_s16_vg4x1.c   |  152 +++
 .../sme2/acle-asm/mls_lane_za64_s16_vg4x2.c   |  124 ++
 .../sme2/acle-asm/mls_lane_za64_s16_vg4x4.c   |  130 ++
 .../sme2/acle-asm/mls_lane_za64_u16_vg4x1.c   |  152 +++
 .../sme2/acle-asm/mls_lane_za64_u16_vg4x2.c   |  124 ++
 .../sme2/acle-asm/mls_lane_za64_u16_vg4x4.c   |  130 ++
 .../sme2/acle-asm/mls_za32_bf16_vg2x1.c       |  148 +++
 .../sme2/acle-asm/mls_za32_bf16_vg2x2.c       |  247 ++++
 .../sme2/acle-asm/mls_za32_bf16_vg2x4.c       |  258 ++++
 .../sme2/acle-asm/mls_za32_f16_vg2x1.c        |  148 +++
 .../sme2/acle-asm/mls_za32_f16_vg2x2.c        |  247 ++++
 .../sme2/acle-asm/mls_za32_f16_vg2x4.c        |  258 ++++
 .../sme2/acle-asm/mls_za32_f32_vg1x2.c        |  180 +++
 .../sme2/acle-asm/mls_za32_f32_vg1x4.c        |  172 +++
 .../sme2/acle-asm/mls_za32_s16_vg2x1.c        |  148 +++
 .../sme2/acle-asm/mls_za32_s16_vg2x2.c        |  247 ++++
 .../sme2/acle-asm/mls_za32_s16_vg2x4.c        |  258 ++++
 .../aarch64/sme2/acle-asm/mls_za32_s8_vg4x1.c |  149 +++
 .../aarch64/sme2/acle-asm/mls_za32_s8_vg4x2.c |  249 ++++
 .../aarch64/sme2/acle-asm/mls_za32_s8_vg4x4.c |  260 ++++
 .../sme2/acle-asm/mls_za32_u16_vg2x1.c        |  148 +++
 .../sme2/acle-asm/mls_za32_u16_vg2x2.c        |  247 ++++
 .../sme2/acle-asm/mls_za32_u16_vg2x4.c        |  258 ++++
 .../aarch64/sme2/acle-asm/mls_za32_u8_vg4x1.c |  149 +++
 .../aarch64/sme2/acle-asm/mls_za32_u8_vg4x2.c |  249 ++++
 .../aarch64/sme2/acle-asm/mls_za32_u8_vg4x4.c |  260 ++++
 .../sme2/acle-asm/mls_za64_f64_vg1x2.c        |  182 +++
 .../sme2/acle-asm/mls_za64_f64_vg1x4.c        |  174 +++
 .../sme2/acle-asm/mls_za64_s16_vg4x1.c        |  151 +++
 .../sme2/acle-asm/mls_za64_s16_vg4x2.c        |  251 ++++
 .../sme2/acle-asm/mls_za64_s16_vg4x4.c        |  262 ++++
 .../sme2/acle-asm/mls_za64_u16_vg4x1.c        |  151 +++
 .../sme2/acle-asm/mls_za64_u16_vg4x2.c        |  251 ++++
 .../sme2/acle-asm/mls_za64_u16_vg4x4.c        |  262 ++++
 .../aarch64/sme2/acle-asm/mopa_za32.c         |   48 +
 .../aarch64/sme2/acle-asm/mops_za32.c         |   48 +
 .../aarch64/sme2/acle-asm/pext_c16.c          |   50 +
 .../aarch64/sme2/acle-asm/pext_c16_x2.c       |   54 +
 .../aarch64/sme2/acle-asm/pext_c32.c          |   50 +
 .../aarch64/sme2/acle-asm/pext_c32_x2.c       |   54 +
 .../aarch64/sme2/acle-asm/pext_c64.c          |   50 +
 .../aarch64/sme2/acle-asm/pext_c64_x2.c       |   54 +
 .../aarch64/sme2/acle-asm/pext_c8.c           |   50 +
 .../aarch64/sme2/acle-asm/pext_c8_x2.c        |   54 +
 .../aarch64/sme2/acle-asm/pfalse_c.c          |   39 +
 .../aarch64/sme2/acle-asm/psel_b16.c          |   89 ++
 .../aarch64/sme2/acle-asm/psel_b32.c          |   89 ++
 .../aarch64/sme2/acle-asm/psel_b64.c          |   80 ++
 .../aarch64/sme2/acle-asm/psel_b8.c           |   89 ++
 .../aarch64/sme2/acle-asm/psel_c16.c          |   89 ++
 .../aarch64/sme2/acle-asm/psel_c32.c          |   89 ++
 .../aarch64/sme2/acle-asm/psel_c64.c          |   80 ++
 .../aarch64/sme2/acle-asm/psel_c8.c           |   89 ++
 .../aarch64/sme2/acle-asm/ptrue_c16.c         |   41 +
 .../aarch64/sme2/acle-asm/ptrue_c32.c         |   41 +
 .../aarch64/sme2/acle-asm/ptrue_c64.c         |   41 +
 .../aarch64/sme2/acle-asm/ptrue_c8.c          |   41 +
 .../aarch64/sme2/acle-asm/qcvt_s16_s32_x2.c   |   50 +
 .../aarch64/sme2/acle-asm/qcvt_s16_s64_x4.c   |   65 +
 .../aarch64/sme2/acle-asm/qcvt_s8_s32_x4.c    |   65 +
 .../aarch64/sme2/acle-asm/qcvt_u16_s32_x2.c   |   50 +
 .../aarch64/sme2/acle-asm/qcvt_u16_s64_x4.c   |   65 +
 .../aarch64/sme2/acle-asm/qcvt_u16_u32_x2.c   |   50 +
 .../aarch64/sme2/acle-asm/qcvt_u16_u64_x4.c   |   65 +
 .../aarch64/sme2/acle-asm/qcvt_u8_s32_x4.c    |   65 +
 .../aarch64/sme2/acle-asm/qcvt_u8_u32_x4.c    |   65 +
 .../aarch64/sme2/acle-asm/qcvtn_s16_s32_x2.c  |   50 +
 .../aarch64/sme2/acle-asm/qcvtn_s16_s64_x4.c  |   65 +
 .../aarch64/sme2/acle-asm/qcvtn_s8_s32_x4.c   |   65 +
 .../aarch64/sme2/acle-asm/qcvtn_u16_s32_x2.c  |   50 +
 .../aarch64/sme2/acle-asm/qcvtn_u16_s64_x4.c  |   65 +
 .../aarch64/sme2/acle-asm/qcvtn_u16_u32_x2.c  |   50 +
 .../aarch64/sme2/acle-asm/qcvtn_u16_u64_x4.c  |   65 +
 .../aarch64/sme2/acle-asm/qcvtn_u8_s32_x4.c   |   65 +
 .../aarch64/sme2/acle-asm/qcvtn_u8_u32_x4.c   |   65 +
 .../aarch64/sme2/acle-asm/qdmulh_s16_x2.c     |  207 ++++
 .../aarch64/sme2/acle-asm/qdmulh_s16_x4.c     |  249 ++++
 .../aarch64/sme2/acle-asm/qdmulh_s32_x2.c     |  207 ++++
 .../aarch64/sme2/acle-asm/qdmulh_s32_x4.c     |  249 ++++
 .../aarch64/sme2/acle-asm/qdmulh_s64_x2.c     |  207 ++++
 .../aarch64/sme2/acle-asm/qdmulh_s64_x4.c     |  249 ++++
 .../aarch64/sme2/acle-asm/qdmulh_s8_x2.c      |  207 ++++
 .../aarch64/sme2/acle-asm/qdmulh_s8_x4.c      |  249 ++++
 .../aarch64/sme2/acle-asm/qrshr_s16_x2.c      |   50 +
 .../aarch64/sme2/acle-asm/qrshr_s16_x4.c      |   65 +
 .../aarch64/sme2/acle-asm/qrshr_s8_x4.c       |   65 +
 .../aarch64/sme2/acle-asm/qrshr_u16_x2.c      |   50 +
 .../aarch64/sme2/acle-asm/qrshr_u16_x4.c      |   65 +
 .../aarch64/sme2/acle-asm/qrshr_u8_x4.c       |   65 +
 .../aarch64/sme2/acle-asm/qrshrn_s16_x2.c     |   50 +
 .../aarch64/sme2/acle-asm/qrshrn_s16_x4.c     |   65 +
 .../aarch64/sme2/acle-asm/qrshrn_s8_x4.c      |   65 +
 .../aarch64/sme2/acle-asm/qrshrn_u16_x2.c     |   50 +
 .../aarch64/sme2/acle-asm/qrshrn_u16_x4.c     |   65 +
 .../aarch64/sme2/acle-asm/qrshrn_u8_x4.c      |   65 +
 .../aarch64/sme2/acle-asm/qrshru_u16_x2.c     |   50 +
 .../aarch64/sme2/acle-asm/qrshru_u16_x4.c     |   65 +
 .../aarch64/sme2/acle-asm/qrshru_u8_x4.c      |   65 +
 .../aarch64/sme2/acle-asm/qrshrun_u16_x2.c    |   50 +
 .../aarch64/sme2/acle-asm/qrshrun_u16_x4.c    |   65 +
 .../aarch64/sme2/acle-asm/qrshrun_u8_x4.c     |   65 +
 .../aarch64/sme2/acle-asm/read_hor_za16_vg2.c |  140 +++
 .../aarch64/sme2/acle-asm/read_hor_za16_vg4.c |  138 +++
 .../aarch64/sme2/acle-asm/read_hor_za32_vg2.c |  112 ++
 .../aarch64/sme2/acle-asm/read_hor_za32_vg4.c |  129 ++
 .../aarch64/sme2/acle-asm/read_hor_za64_vg2.c |  113 ++
 .../aarch64/sme2/acle-asm/read_hor_za64_vg4.c |  129 ++
 .../aarch64/sme2/acle-asm/read_hor_za8_vg2.c  |  140 +++
 .../aarch64/sme2/acle-asm/read_hor_za8_vg4.c  |  156 +++
 .../aarch64/sme2/acle-asm/read_ver_za16_vg2.c |  140 +++
 .../aarch64/sme2/acle-asm/read_ver_za16_vg4.c |  138 +++
 .../aarch64/sme2/acle-asm/read_ver_za32_vg2.c |  112 ++
 .../aarch64/sme2/acle-asm/read_ver_za32_vg4.c |  129 ++
 .../aarch64/sme2/acle-asm/read_ver_za64_vg2.c |  113 ++
 .../aarch64/sme2/acle-asm/read_ver_za64_vg4.c |  129 ++
 .../aarch64/sme2/acle-asm/read_ver_za8_vg2.c  |  140 +++
 .../aarch64/sme2/acle-asm/read_ver_za8_vg4.c  |  156 +++
 .../aarch64/sme2/acle-asm/read_za16_vg1x2.c   |  122 ++
 .../aarch64/sme2/acle-asm/read_za16_vg1x4.c   |  137 +++
 .../aarch64/sme2/acle-asm/read_za32_vg1x2.c   |  122 ++
 .../aarch64/sme2/acle-asm/read_za32_vg1x4.c   |  137 +++
 .../aarch64/sme2/acle-asm/read_za64_vg1x2.c   |  122 ++
 .../aarch64/sme2/acle-asm/read_za64_vg1x4.c   |  137 +++
 .../aarch64/sme2/acle-asm/read_za8_vg1x2.c    |  122 ++
 .../aarch64/sme2/acle-asm/read_za8_vg1x4.c    |  137 +++
 .../aarch64/sme2/acle-asm/rinta_s32_x2.c      |   61 +
 .../aarch64/sme2/acle-asm/rinta_s32_x4.c      |   73 ++
 .../aarch64/sme2/acle-asm/rintm_u32_x2.c      |   61 +
 .../aarch64/sme2/acle-asm/rintm_u32_x4.c      |   73 ++
 .../aarch64/sme2/acle-asm/rintn_u32_x2.c      |   61 +
 .../aarch64/sme2/acle-asm/rintn_u32_x4.c      |   73 ++
 .../aarch64/sme2/acle-asm/rintp_u32_x2.c      |   61 +
 .../aarch64/sme2/acle-asm/rintp_u32_x4.c      |   73 ++
 .../aarch64/sme2/acle-asm/rshl_s16_x2.c       |  207 ++++
 .../aarch64/sme2/acle-asm/rshl_s16_x4.c       |  249 ++++
 .../aarch64/sme2/acle-asm/rshl_s32_x2.c       |  207 ++++
 .../aarch64/sme2/acle-asm/rshl_s32_x4.c       |  249 ++++
 .../aarch64/sme2/acle-asm/rshl_s64_x2.c       |  207 ++++
 .../aarch64/sme2/acle-asm/rshl_s64_x4.c       |  249 ++++
 .../aarch64/sme2/acle-asm/rshl_s8_x2.c        |  207 ++++
 .../aarch64/sme2/acle-asm/rshl_s8_x4.c        |  249 ++++
 .../aarch64/sme2/acle-asm/rshl_u16_x2.c       |  207 ++++
 .../aarch64/sme2/acle-asm/rshl_u16_x4.c       |  228 ++++
 .../aarch64/sme2/acle-asm/rshl_u32_x2.c       |  207 ++++
 .../aarch64/sme2/acle-asm/rshl_u32_x4.c       |  228 ++++
 .../aarch64/sme2/acle-asm/rshl_u64_x2.c       |  207 ++++
 .../aarch64/sme2/acle-asm/rshl_u64_x4.c       |  228 ++++
 .../aarch64/sme2/acle-asm/rshl_u8_x2.c        |  207 ++++
 .../aarch64/sme2/acle-asm/rshl_u8_x4.c        |  228 ++++
 .../aarch64/sme2/acle-asm/sel_bf16_x2.c       |   92 ++
 .../aarch64/sme2/acle-asm/sel_bf16_x4.c       |   92 ++
 .../aarch64/sme2/acle-asm/sel_f16_x2.c        |   92 ++
 .../aarch64/sme2/acle-asm/sel_f16_x4.c        |   92 ++
 .../aarch64/sme2/acle-asm/sel_f32_x2.c        |   92 ++
 .../aarch64/sme2/acle-asm/sel_f32_x4.c        |   92 ++
 .../aarch64/sme2/acle-asm/sel_f64_x2.c        |   92 ++
 .../aarch64/sme2/acle-asm/sel_f64_x4.c        |   92 ++
 .../aarch64/sme2/acle-asm/sel_s16_x2.c        |   92 ++
 .../aarch64/sme2/acle-asm/sel_s16_x4.c        |   92 ++
 .../aarch64/sme2/acle-asm/sel_s32_x2.c        |   92 ++
 .../aarch64/sme2/acle-asm/sel_s32_x4.c        |   92 ++
 .../aarch64/sme2/acle-asm/sel_s64_x2.c        |   92 ++
 .../aarch64/sme2/acle-asm/sel_s64_x4.c        |   92 ++
 .../aarch64/sme2/acle-asm/sel_s8_x2.c         |   92 ++
 .../aarch64/sme2/acle-asm/sel_s8_x4.c         |   92 ++
 .../aarch64/sme2/acle-asm/sel_u16_x2.c        |   92 ++
 .../aarch64/sme2/acle-asm/sel_u16_x4.c        |   92 ++
 .../aarch64/sme2/acle-asm/sel_u32_x2.c        |   92 ++
 .../aarch64/sme2/acle-asm/sel_u32_x4.c        |   92 ++
 .../aarch64/sme2/acle-asm/sel_u64_x2.c        |   92 ++
 .../aarch64/sme2/acle-asm/sel_u64_x4.c        |   92 ++
 .../aarch64/sme2/acle-asm/sel_u8_x2.c         |   92 ++
 .../aarch64/sme2/acle-asm/sel_u8_x4.c         |   92 ++
 .../aarch64/sme2/acle-asm/st1_bf16_x2.c       |  262 ++++
 .../aarch64/sme2/acle-asm/st1_bf16_x4.c       |  354 ++++++
 .../aarch64/sme2/acle-asm/st1_f16_x2.c        |  262 ++++
 .../aarch64/sme2/acle-asm/st1_f16_x4.c        |  354 ++++++
 .../aarch64/sme2/acle-asm/st1_f32_x2.c        |  262 ++++
 .../aarch64/sme2/acle-asm/st1_f32_x4.c        |  354 ++++++
 .../aarch64/sme2/acle-asm/st1_f64_x2.c        |  262 ++++
 .../aarch64/sme2/acle-asm/st1_f64_x4.c        |  354 ++++++
 .../aarch64/sme2/acle-asm/st1_s16_x2.c        |  262 ++++
 .../aarch64/sme2/acle-asm/st1_s16_x4.c        |  354 ++++++
 .../aarch64/sme2/acle-asm/st1_s32_x2.c        |  262 ++++
 .../aarch64/sme2/acle-asm/st1_s32_x4.c        |  354 ++++++
 .../aarch64/sme2/acle-asm/st1_s64_x2.c        |  262 ++++
 .../aarch64/sme2/acle-asm/st1_s64_x4.c        |  354 ++++++
 .../aarch64/sme2/acle-asm/st1_s8_x2.c         |  262 ++++
 .../aarch64/sme2/acle-asm/st1_s8_x4.c         |  354 ++++++
 .../aarch64/sme2/acle-asm/st1_u16_x2.c        |  262 ++++
 .../aarch64/sme2/acle-asm/st1_u16_x4.c        |  354 ++++++
 .../aarch64/sme2/acle-asm/st1_u32_x2.c        |  262 ++++
 .../aarch64/sme2/acle-asm/st1_u32_x4.c        |  354 ++++++
 .../aarch64/sme2/acle-asm/st1_u64_x2.c        |  262 ++++
 .../aarch64/sme2/acle-asm/st1_u64_x4.c        |  354 ++++++
 .../aarch64/sme2/acle-asm/st1_u8_x2.c         |  262 ++++
 .../aarch64/sme2/acle-asm/st1_u8_x4.c         |  354 ++++++
 .../aarch64/sme2/acle-asm/stnt1_bf16_x2.c     |  262 ++++
 .../aarch64/sme2/acle-asm/stnt1_bf16_x4.c     |  354 ++++++
 .../aarch64/sme2/acle-asm/stnt1_f16_x2.c      |  262 ++++
 .../aarch64/sme2/acle-asm/stnt1_f16_x4.c      |  354 ++++++
 .../aarch64/sme2/acle-asm/stnt1_f32_x2.c      |  262 ++++
 .../aarch64/sme2/acle-asm/stnt1_f32_x4.c      |  354 ++++++
 .../aarch64/sme2/acle-asm/stnt1_f64_x2.c      |  262 ++++
 .../aarch64/sme2/acle-asm/stnt1_f64_x4.c      |  354 ++++++
 .../aarch64/sme2/acle-asm/stnt1_s16_x2.c      |  262 ++++
 .../aarch64/sme2/acle-asm/stnt1_s16_x4.c      |  354 ++++++
 .../aarch64/sme2/acle-asm/stnt1_s32_x2.c      |  262 ++++
 .../aarch64/sme2/acle-asm/stnt1_s32_x4.c      |  354 ++++++
 .../aarch64/sme2/acle-asm/stnt1_s64_x2.c      |  262 ++++
 .../aarch64/sme2/acle-asm/stnt1_s64_x4.c      |  354 ++++++
 .../aarch64/sme2/acle-asm/stnt1_s8_x2.c       |  262 ++++
 .../aarch64/sme2/acle-asm/stnt1_s8_x4.c       |  354 ++++++
 .../aarch64/sme2/acle-asm/stnt1_u16_x2.c      |  262 ++++
 .../aarch64/sme2/acle-asm/stnt1_u16_x4.c      |  354 ++++++
 .../aarch64/sme2/acle-asm/stnt1_u32_x2.c      |  262 ++++
 .../aarch64/sme2/acle-asm/stnt1_u32_x4.c      |  354 ++++++
 .../aarch64/sme2/acle-asm/stnt1_u64_x2.c      |  262 ++++
 .../aarch64/sme2/acle-asm/stnt1_u64_x4.c      |  354 ++++++
 .../aarch64/sme2/acle-asm/stnt1_u8_x2.c       |  262 ++++
 .../aarch64/sme2/acle-asm/stnt1_u8_x4.c       |  354 ++++++
 .../gcc.target/aarch64/sme2/acle-asm/str_zt.c |   36 +
 .../sme2/acle-asm/sub_write_za32_s32_vg1x2.c  |  180 +++
 .../sme2/acle-asm/sub_write_za32_s32_vg1x4.c  |  172 +++
 .../sme2/acle-asm/sub_write_za32_u32_vg1x2.c  |  180 +++
 .../sme2/acle-asm/sub_write_za32_u32_vg1x4.c  |  172 +++
 .../sme2/acle-asm/sub_write_za64_s64_vg1x2.c  |  182 +++
 .../sme2/acle-asm/sub_write_za64_s64_vg1x4.c  |  174 +++
 .../sme2/acle-asm/sub_write_za64_u64_vg1x2.c  |  182 +++
 .../sme2/acle-asm/sub_write_za64_u64_vg1x4.c  |  174 +++
 .../sme2/acle-asm/sub_za32_f32_vg1x2.c        |  122 ++
 .../sme2/acle-asm/sub_za32_f32_vg1x4.c        |  137 +++
 .../sme2/acle-asm/sub_za32_s32_vg1x2.c        |  122 ++
 .../sme2/acle-asm/sub_za32_s32_vg1x4.c        |  137 +++
 .../sme2/acle-asm/sub_za32_u32_vg1x2.c        |  122 ++
 .../sme2/acle-asm/sub_za32_u32_vg1x4.c        |  137 +++
 .../sme2/acle-asm/sub_za64_f64_vg1x2.c        |  126 ++
 .../sme2/acle-asm/sub_za64_f64_vg1x4.c        |  141 +++
 .../sme2/acle-asm/sub_za64_s64_vg1x2.c        |  124 ++
 .../sme2/acle-asm/sub_za64_s64_vg1x4.c        |  139 +++
 .../sme2/acle-asm/sub_za64_u64_vg1x2.c        |  124 ++
 .../sme2/acle-asm/sub_za64_u64_vg1x4.c        |  139 +++
 .../sme2/acle-asm/sudot_lane_za32_s8_vg1x2.c  |  102 ++
 .../sme2/acle-asm/sudot_lane_za32_s8_vg1x4.c  |  108 ++
 .../sme2/acle-asm/sudot_za32_s8_vg1x2.c       |  243 ++++
 .../sme2/acle-asm/sudot_za32_s8_vg1x4.c       |  254 ++++
 .../sme2/acle-asm/suvdot_lane_za32_s8_vg1x4.c |  108 ++
 .../aarch64/sme2/acle-asm/test_sme2_acle.h    |  124 ++
 .../aarch64/sme2/acle-asm/unpk_s16_x2.c       |   50 +
 .../aarch64/sme2/acle-asm/unpk_s16_x4.c       |   76 ++
 .../aarch64/sme2/acle-asm/unpk_s32_x2.c       |   50 +
 .../aarch64/sme2/acle-asm/unpk_s32_x4.c       |   76 ++
 .../aarch64/sme2/acle-asm/unpk_s8_x2.c        |   50 +
 .../aarch64/sme2/acle-asm/unpk_s8_x4.c        |   76 ++
 .../aarch64/sme2/acle-asm/unpk_u16_x2.c       |   50 +
 .../aarch64/sme2/acle-asm/unpk_u16_x4.c       |   76 ++
 .../aarch64/sme2/acle-asm/unpk_u32_x2.c       |   50 +
 .../aarch64/sme2/acle-asm/unpk_u32_x4.c       |   76 ++
 .../aarch64/sme2/acle-asm/unpk_u8_x2.c        |   50 +
 .../aarch64/sme2/acle-asm/unpk_u8_x4.c        |   76 ++
 .../sme2/acle-asm/usdot_lane_za32_u8_vg1x2.c  |  102 ++
 .../sme2/acle-asm/usdot_lane_za32_u8_vg1x4.c  |  108 ++
 .../sme2/acle-asm/usdot_za32_u8_vg1x2.c       |  243 ++++
 .../sme2/acle-asm/usdot_za32_u8_vg1x4.c       |  254 ++++
 .../sme2/acle-asm/usvdot_lane_za32_u8_vg1x4.c |  108 ++
 .../aarch64/sme2/acle-asm/uzp_bf16_x2.c       |   77 ++
 .../aarch64/sme2/acle-asm/uzp_bf16_x4.c       |   73 ++
 .../aarch64/sme2/acle-asm/uzp_f16_x2.c        |   77 ++
 .../aarch64/sme2/acle-asm/uzp_f16_x4.c        |   73 ++
 .../aarch64/sme2/acle-asm/uzp_f32_x2.c        |   77 ++
 .../aarch64/sme2/acle-asm/uzp_f32_x4.c        |   73 ++
 .../aarch64/sme2/acle-asm/uzp_f64_x2.c        |   77 ++
 .../aarch64/sme2/acle-asm/uzp_f64_x4.c        |   73 ++
 .../aarch64/sme2/acle-asm/uzp_s16_x2.c        |   77 ++
 .../aarch64/sme2/acle-asm/uzp_s16_x4.c        |   73 ++
 .../aarch64/sme2/acle-asm/uzp_s32_x2.c        |   77 ++
 .../aarch64/sme2/acle-asm/uzp_s32_x4.c        |   73 ++
 .../aarch64/sme2/acle-asm/uzp_s64_x2.c        |   77 ++
 .../aarch64/sme2/acle-asm/uzp_s64_x4.c        |   73 ++
 .../aarch64/sme2/acle-asm/uzp_s8_x2.c         |   77 ++
 .../aarch64/sme2/acle-asm/uzp_s8_x4.c         |   73 ++
 .../aarch64/sme2/acle-asm/uzp_u16_x2.c        |   77 ++
 .../aarch64/sme2/acle-asm/uzp_u16_x4.c        |   73 ++
 .../aarch64/sme2/acle-asm/uzp_u32_x2.c        |   77 ++
 .../aarch64/sme2/acle-asm/uzp_u32_x4.c        |   73 ++
 .../aarch64/sme2/acle-asm/uzp_u64_x2.c        |   77 ++
 .../aarch64/sme2/acle-asm/uzp_u64_x4.c        |   73 ++
 .../aarch64/sme2/acle-asm/uzp_u8_x2.c         |   77 ++
 .../aarch64/sme2/acle-asm/uzp_u8_x4.c         |   73 ++
 .../aarch64/sme2/acle-asm/uzpq_bf16_x2.c      |   77 ++
 .../aarch64/sme2/acle-asm/uzpq_bf16_x4.c      |   73 ++
 .../aarch64/sme2/acle-asm/uzpq_f16_x2.c       |   77 ++
 .../aarch64/sme2/acle-asm/uzpq_f16_x4.c       |   73 ++
 .../aarch64/sme2/acle-asm/uzpq_f32_x2.c       |   77 ++
 .../aarch64/sme2/acle-asm/uzpq_f32_x4.c       |   73 ++
 .../aarch64/sme2/acle-asm/uzpq_f64_x2.c       |   77 ++
 .../aarch64/sme2/acle-asm/uzpq_f64_x4.c       |   73 ++
 .../aarch64/sme2/acle-asm/uzpq_s16_x2.c       |   77 ++
 .../aarch64/sme2/acle-asm/uzpq_s16_x4.c       |   73 ++
 .../aarch64/sme2/acle-asm/uzpq_s32_x2.c       |   77 ++
 .../aarch64/sme2/acle-asm/uzpq_s32_x4.c       |   73 ++
 .../aarch64/sme2/acle-asm/uzpq_s64_x2.c       |   77 ++
 .../aarch64/sme2/acle-asm/uzpq_s64_x4.c       |   73 ++
 .../aarch64/sme2/acle-asm/uzpq_s8_x2.c        |   77 ++
 .../aarch64/sme2/acle-asm/uzpq_s8_x4.c        |   73 ++
 .../aarch64/sme2/acle-asm/uzpq_u16_x2.c       |   77 ++
 .../aarch64/sme2/acle-asm/uzpq_u16_x4.c       |   73 ++
 .../aarch64/sme2/acle-asm/uzpq_u32_x2.c       |   77 ++
 .../aarch64/sme2/acle-asm/uzpq_u32_x4.c       |   73 ++
 .../aarch64/sme2/acle-asm/uzpq_u64_x2.c       |   77 ++
 .../aarch64/sme2/acle-asm/uzpq_u64_x4.c       |   73 ++
 .../aarch64/sme2/acle-asm/uzpq_u8_x2.c        |   77 ++
 .../aarch64/sme2/acle-asm/uzpq_u8_x4.c        |   73 ++
 .../sme2/acle-asm/vdot_lane_za32_bf16_vg1x2.c |  102 ++
 .../sme2/acle-asm/vdot_lane_za32_f16_vg1x2.c  |  102 ++
 .../sme2/acle-asm/vdot_lane_za32_s16_vg1x2.c  |  102 ++
 .../sme2/acle-asm/vdot_lane_za32_s8_vg1x4.c   |  108 ++
 .../sme2/acle-asm/vdot_lane_za32_u16_vg1x2.c  |  102 ++
 .../sme2/acle-asm/vdot_lane_za32_u8_vg1x4.c   |  108 ++
 .../sme2/acle-asm/vdot_lane_za64_s16_vg1x4.c  |  110 ++
 .../sme2/acle-asm/vdot_lane_za64_u16_vg1x4.c  |  110 ++
 .../aarch64/sme2/acle-asm/whilege_b16.c       |  119 ++
 .../aarch64/sme2/acle-asm/whilege_b32.c       |  119 ++
 .../aarch64/sme2/acle-asm/whilege_b64.c       |  119 ++
 .../aarch64/sme2/acle-asm/whilege_b8.c        |  119 ++
 .../aarch64/sme2/acle-asm/whilege_c16.c       |  117 ++
 .../aarch64/sme2/acle-asm/whilege_c32.c       |  117 ++
 .../aarch64/sme2/acle-asm/whilege_c64.c       |  117 ++
 .../aarch64/sme2/acle-asm/whilege_c8.c        |  117 ++
 .../aarch64/sme2/acle-asm/whilegt_b16.c       |  119 ++
 .../aarch64/sme2/acle-asm/whilegt_b32.c       |  119 ++
 .../aarch64/sme2/acle-asm/whilegt_b64.c       |  119 ++
 .../aarch64/sme2/acle-asm/whilegt_b8.c        |  119 ++
 .../aarch64/sme2/acle-asm/whilegt_c16.c       |  117 ++
 .../aarch64/sme2/acle-asm/whilegt_c32.c       |  117 ++
 .../aarch64/sme2/acle-asm/whilegt_c64.c       |  117 ++
 .../aarch64/sme2/acle-asm/whilegt_c8.c        |  117 ++
 .../aarch64/sme2/acle-asm/whilele_b16.c       |  119 ++
 .../aarch64/sme2/acle-asm/whilele_b32.c       |  119 ++
 .../aarch64/sme2/acle-asm/whilele_b64.c       |  119 ++
 .../aarch64/sme2/acle-asm/whilele_b8.c        |  119 ++
 .../aarch64/sme2/acle-asm/whilele_c16.c       |  117 ++
 .../aarch64/sme2/acle-asm/whilele_c32.c       |  117 ++
 .../aarch64/sme2/acle-asm/whilele_c64.c       |  117 ++
 .../aarch64/sme2/acle-asm/whilele_c8.c        |  117 ++
 .../aarch64/sme2/acle-asm/whilelt_b16.c       |  119 ++
 .../aarch64/sme2/acle-asm/whilelt_b32.c       |  119 ++
 .../aarch64/sme2/acle-asm/whilelt_b64.c       |  119 ++
 .../aarch64/sme2/acle-asm/whilelt_b8.c        |  119 ++
 .../aarch64/sme2/acle-asm/whilelt_c16.c       |  117 ++
 .../aarch64/sme2/acle-asm/whilelt_c32.c       |  117 ++
 .../aarch64/sme2/acle-asm/whilelt_c64.c       |  117 ++
 .../aarch64/sme2/acle-asm/whilelt_c8.c        |  117 ++
 .../sme2/acle-asm/write_hor_za16_vg2.c        |  140 +++
 .../sme2/acle-asm/write_hor_za16_vg4.c        |  138 +++
 .../sme2/acle-asm/write_hor_za32_vg2.c        |  112 ++
 .../sme2/acle-asm/write_hor_za32_vg4.c        |  129 ++
 .../sme2/acle-asm/write_hor_za64_vg2.c        |  113 ++
 .../sme2/acle-asm/write_hor_za64_vg4.c        |  129 ++
 .../aarch64/sme2/acle-asm/write_hor_za8_vg2.c |  140 +++
 .../aarch64/sme2/acle-asm/write_hor_za8_vg4.c |  156 +++
 .../sme2/acle-asm/write_ver_za16_vg2.c        |  140 +++
 .../sme2/acle-asm/write_ver_za16_vg4.c        |  138 +++
 .../sme2/acle-asm/write_ver_za32_vg2.c        |  112 ++
 .../sme2/acle-asm/write_ver_za32_vg4.c        |  129 ++
 .../sme2/acle-asm/write_ver_za64_vg2.c        |  113 ++
 .../sme2/acle-asm/write_ver_za64_vg4.c        |  129 ++
 .../aarch64/sme2/acle-asm/write_ver_za8_vg2.c |  140 +++
 .../aarch64/sme2/acle-asm/write_ver_za8_vg4.c |  156 +++
 .../aarch64/sme2/acle-asm/write_za16_vg1x2.c  |  122 ++
 .../aarch64/sme2/acle-asm/write_za16_vg1x4.c  |  137 +++
 .../aarch64/sme2/acle-asm/write_za32_vg1x2.c  |  122 ++
 .../aarch64/sme2/acle-asm/write_za32_vg1x4.c  |  137 +++
 .../aarch64/sme2/acle-asm/write_za64_vg1x2.c  |  122 ++
 .../aarch64/sme2/acle-asm/write_za64_vg1x4.c  |  137 +++
 .../aarch64/sme2/acle-asm/write_za8_vg1x2.c   |  122 ++
 .../aarch64/sme2/acle-asm/write_za8_vg1x4.c   |  137 +++
 .../aarch64/sme2/acle-asm/zero_zt.c           |   12 +
 .../aarch64/sme2/acle-asm/zip_bf16_x2.c       |   77 ++
 .../aarch64/sme2/acle-asm/zip_bf16_x4.c       |   73 ++
 .../aarch64/sme2/acle-asm/zip_f16_x2.c        |   77 ++
 .../aarch64/sme2/acle-asm/zip_f16_x4.c        |   73 ++
 .../aarch64/sme2/acle-asm/zip_f32_x2.c        |   77 ++
 .../aarch64/sme2/acle-asm/zip_f32_x4.c        |   73 ++
 .../aarch64/sme2/acle-asm/zip_f64_x2.c        |   77 ++
 .../aarch64/sme2/acle-asm/zip_f64_x4.c        |   73 ++
 .../aarch64/sme2/acle-asm/zip_s16_x2.c        |   77 ++
 .../aarch64/sme2/acle-asm/zip_s16_x4.c        |   73 ++
 .../aarch64/sme2/acle-asm/zip_s32_x2.c        |   77 ++
 .../aarch64/sme2/acle-asm/zip_s32_x4.c        |   73 ++
 .../aarch64/sme2/acle-asm/zip_s64_x2.c        |   77 ++
 .../aarch64/sme2/acle-asm/zip_s64_x4.c        |   73 ++
 .../aarch64/sme2/acle-asm/zip_s8_x2.c         |   77 ++
 .../aarch64/sme2/acle-asm/zip_s8_x4.c         |   73 ++
 .../aarch64/sme2/acle-asm/zip_u16_x2.c        |   77 ++
 .../aarch64/sme2/acle-asm/zip_u16_x4.c        |   73 ++
 .../aarch64/sme2/acle-asm/zip_u32_x2.c        |   77 ++
 .../aarch64/sme2/acle-asm/zip_u32_x4.c        |   73 ++
 .../aarch64/sme2/acle-asm/zip_u64_x2.c        |   77 ++
 .../aarch64/sme2/acle-asm/zip_u64_x4.c        |   73 ++
 .../aarch64/sme2/acle-asm/zip_u8_x2.c         |   77 ++
 .../aarch64/sme2/acle-asm/zip_u8_x4.c         |   73 ++
 .../aarch64/sme2/acle-asm/zipq_bf16_x2.c      |   77 ++
 .../aarch64/sme2/acle-asm/zipq_bf16_x4.c      |   73 ++
 .../aarch64/sme2/acle-asm/zipq_f16_x2.c       |   77 ++
 .../aarch64/sme2/acle-asm/zipq_f16_x4.c       |   73 ++
 .../aarch64/sme2/acle-asm/zipq_f32_x2.c       |   77 ++
 .../aarch64/sme2/acle-asm/zipq_f32_x4.c       |   73 ++
 .../aarch64/sme2/acle-asm/zipq_f64_x2.c       |   77 ++
 .../aarch64/sme2/acle-asm/zipq_f64_x4.c       |   73 ++
 .../aarch64/sme2/acle-asm/zipq_s16_x2.c       |   77 ++
 .../aarch64/sme2/acle-asm/zipq_s16_x4.c       |   73 ++
 .../aarch64/sme2/acle-asm/zipq_s32_x2.c       |   77 ++
 .../aarch64/sme2/acle-asm/zipq_s32_x4.c       |   73 ++
 .../aarch64/sme2/acle-asm/zipq_s64_x2.c       |   77 ++
 .../aarch64/sme2/acle-asm/zipq_s64_x4.c       |   73 ++
 .../aarch64/sme2/acle-asm/zipq_s8_x2.c        |   77 ++
 .../aarch64/sme2/acle-asm/zipq_s8_x4.c        |   73 ++
 .../aarch64/sme2/acle-asm/zipq_u16_x2.c       |   77 ++
 .../aarch64/sme2/acle-asm/zipq_u16_x4.c       |   73 ++
 .../aarch64/sme2/acle-asm/zipq_u32_x2.c       |   77 ++
 .../aarch64/sme2/acle-asm/zipq_u32_x4.c       |   73 ++
 .../aarch64/sme2/acle-asm/zipq_u64_x2.c       |   77 ++
 .../aarch64/sme2/acle-asm/zipq_u64_x4.c       |   73 ++
 .../aarch64/sme2/acle-asm/zipq_u8_x2.c        |   77 ++
 .../aarch64/sme2/acle-asm/zipq_u8_x4.c        |   73 ++
 .../aarch64/sve/acle/asm/test_sve_acle.h      |  230 +++-
 .../sve/acle/general-c/binary_za_m_1.c        |    1 -
 .../general-c/binary_za_slice_opt_single_1.c  |   76 ++
 .../general-c/binary_za_slice_opt_single_2.c  |   29 +
 .../general-c/binary_za_slice_opt_single_3.c  |   16 +
 .../aarch64/sve/acle/general-c/store_1.c      |    4 +-
 .../aarch64/sve/acle/general-c/store_2.c      |    4 +-
 .../sve/acle/general-c/ternary_qq_lane_1.c    |   30 +-
 .../sve/acle/general-c/ternary_qq_opt_n_2.c   |   12 +-
 .../sve/acle/general-c/unary_za_slice_1.c     |   54 +
 .../sve/acle/general-c/unary_za_slice_2.c     |   27 +
 .../sve/acle/general-c/unary_za_slice_3.c     |   16 +
 845 files changed, 120426 insertions(+), 216 deletions(-)


[-- Attachment #2: 0005-aarch64-Add-support-for-SME2-intrinsics.patch.xz --]
[-- Type: application/x-xz, Size: 168264 bytes --]

^ permalink raw reply	[flat|nested] 6+ messages in thread
