From: "Andre Vieira (lists)" <andre.simoesdiasvieira@arm.com>
To: gcc-patches@gcc.gnu.org
Cc: Richard Sandiford <richard.sandiford@arm.com>,
	Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
Subject: [PATCH 4/4][RFC] VLA Constructor
Date: Fri, 5 Aug 2022 13:58:16 +0100	[thread overview]
Message-ID: <3f90f079-8c12-2547-c925-a28779fdb267@arm.com> (raw)
In-Reply-To: <95d2de77-5b68-6d0b-ac99-ac1ca28835e2@arm.com>

[-- Attachment #1: Type: text/plain, Size: 153 bytes --]

This isn't really a 'PATCH' yet; it's something I was working on but had 
to put on hold. Feel free to re-use any bits of it, or trash all of it, 
if you'd like.
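
For context: with this patch the gimple folder for the svdupq_n_*
builtins no longer punts on non-constant arguments; it builds a
fixed-length CONSTRUCTOR and broadcasts it across the scalable result
with a VLA-mask VEC_PERM_EXPR.  A minimal example of the kind of source
this targets, adapted from the new dupq_opt_1.c test below (the expected
sequence is the one that test checks for; the function name is mine):

    #include <arm_sve.h>

    /* dupq_opt_1.c's test0 expects this to compile to
	 ins	v0.s[1], v1.s[0]
	 mov	z0.d, d0
       i.e. assemble the two lanes in an Advanced SIMD register and
       broadcast the 64-bit pair across the whole SVE vector.  */
    svfloat32_t
    dup_xy (float x, float y)
    {
      return svdupq_n_f32 (x, y, x, y);
    }
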

[-- Attachment #2: sve_const_dup_4.patch --]
[-- Type: text/plain, Size: 15627 bytes --]

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index 82f9eba5c397af04924bdebdc684a1d77682d3fd..08625aad7b1a8dc9c9f8c491cb13d8af0b46a946 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -842,13 +842,45 @@ public:
     for (unsigned int i = 0; i < nargs; ++i)
       {
 	tree elt = gimple_call_arg (f.call, i);
-	if (!CONSTANT_CLASS_P (elt))
-	  return NULL;
 	builder.quick_push (elt);
 	for (unsigned int j = 1; j < factor; ++j)
 	  builder.quick_push (build_zero_cst (TREE_TYPE (vec_type)));
       }
-    return gimple_build_assign (f.lhs, builder.build ());
+    builder.finalize ();
+    unsigned int n_elts
+      = builder.nelts_per_pattern () == 1 ? builder.npatterns ()
+					  : builder.full_nelts ().coeffs[0];
+
+    if (n_elts == 1)
+      return gimple_build_assign (f.lhs, build1 (VEC_DUPLICATE_EXPR, vec_type,
+						 builder.elt (0)));
+    tree list = NULL_TREE;
+    tree *pp = &list;
+    for (unsigned int i = 0; i < n_elts; ++i)
+      {
+	*pp = build_tree_list (NULL_TREE, builder.elt (i));
+	pp = &TREE_CHAIN (*pp);
+      }
+
+    poly_uint64 vec_len = TYPE_VECTOR_SUBPARTS (vec_type);
+    vec_perm_builder sel (vec_len, n_elts, 1);
+    for (unsigned int i = 0; i < n_elts; i++)
+      sel.quick_push (i);
+    vec_perm_indices indices (sel, 1, n_elts);
+
+    tree elt_type = TREE_TYPE (vec_type);
+
+    tree ctor_type = build_vector_type (elt_type, n_elts);
+    tree ctor = make_ssa_name_fn (cfun, ctor_type, 0);
+    gimple *ctor_stmt
+      = gimple_build_assign (ctor,
+			     build_constructor_from_list (ctor_type, list));
+    gsi_insert_before (f.gsi, ctor_stmt, GSI_SAME_STMT);
+
+    tree mask_type = build_vector_type (ssizetype, vec_len);
+    tree mask = vec_perm_indices_to_tree (mask_type, indices);
+    return gimple_build_assign (f.lhs, fold_build3 (VEC_PERM_EXPR, vec_type,
+						    ctor, ctor, mask));
   }
 
   rtx
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index bd60e65b0c3f05f1c931f03807170f3b9d699de5..dec935211e5a064239c858880a696e6ca3fe1ae2 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -2544,6 +2544,17 @@
   }
 )
 
+;; Duplicate an Advanced SIMD vector to fill an SVE vector (LE version).
+(define_insn "*aarch64_vec_duplicate_reg<mode>_le"
+  [(set (match_operand:SVE_FULL 0 "register_operand" "=w,w")
+	(vec_duplicate:SVE_FULL
+	  (match_operand:<VEL> 1 "register_operand" "w,r")))]
+  "TARGET_SVE && !BYTES_BIG_ENDIAN"
+  "@
+   mov\t%0.<Vetype>, %<vwcore>1
+   mov\t%0.<Vetype>, %<Vetype>1"
+)
+
 ;; Duplicate an Advanced SIMD vector to fill an SVE vector (BE version).
 ;; The SVE register layout puts memory lane N into (architectural)
 ;; register lane N, whereas the Advanced SIMD layout puts the memory
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index a08043e18d609e258ebfe033875201163d129aba..9b118e4101d0a5995a833769433be49321ab2151 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -6033,7 +6033,6 @@ rtx
 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
 {
   machine_mode src_mode = GET_MODE (src);
-  gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
   insn_code icode = (BYTES_BIG_ENDIAN
 		     ? code_for_aarch64_vec_duplicate_vq_be (mode)
 		     : code_for_aarch64_vec_duplicate_vq_le (mode));
@@ -21806,20 +21805,29 @@ aarch64_simd_make_constant (rtx vals)
 }
 
 static void
-aarch64_vec_duplicate (rtx target, machine_mode mode, machine_mode element_mode,
-		       int narrow_n_elts)
+aarch64_vec_duplicate (rtx target, rtx op, machine_mode mode,
+		       machine_mode element_mode, int narrow_n_elts)
 {
   poly_uint64 size = narrow_n_elts * GET_MODE_BITSIZE (element_mode);
-  scalar_mode i_mode = int_mode_for_size (size, 0).require ();
   machine_mode o_mode;
-  if (aarch64_sve_mode_p (mode))
-    o_mode = aarch64_full_sve_mode (i_mode).require ();
+  rtx input, output;
+  bool sve = aarch64_sve_mode_p (mode);
+  if (sve && known_eq (size, 128U))
+    {
+      o_mode = mode;
+      output = target;
+      input = op;
+    }
   else
-    o_mode
-      = aarch64_simd_container_mode (i_mode,
-				     GET_MODE_BITSIZE (mode));
-  rtx input = simplify_gen_subreg (i_mode, target, mode, 0);
-  rtx output = simplify_gen_subreg (o_mode, target, mode, 0);
+    {
+      scalar_mode i_mode = int_mode_for_size (size, 0).require ();
+      o_mode
+	= sve ? aarch64_full_sve_mode (i_mode).require ()
+	      : aarch64_simd_container_mode (i_mode,
+					     GET_MODE_BITSIZE (mode));
+      input = simplify_gen_subreg (i_mode, op, GET_MODE (op), 0);
+      output = simplify_gen_subreg (o_mode, target, mode, 0);
+    }
   aarch64_emit_move (output, gen_vec_duplicate (o_mode, input));
 }
 
@@ -21910,6 +21918,16 @@ aarch64_expand_vector_init (rtx target, rtx_vector_builder &v)
       return;
     }
 
+  /* We are constructing a VLS vector that we may later duplicate into a VLA
+     one.  TODO: maybe split this into Advanced SIMD and SVE versions?  */
+  machine_mode real_mode = mode;
+  rtx real_target = target;
+  if (aarch64_sve_mode_p (real_mode))
+    {
+      mode = aarch64_vq_mode (GET_MODE_INNER (real_mode)).require ();
+      target = simplify_gen_subreg (mode, target, real_mode, 0);
+    }
+
   enum insn_code icode = optab_handler (vec_set_optab, mode);
   gcc_assert (icode != CODE_FOR_nothing);
 
@@ -22000,8 +22018,9 @@ aarch64_expand_vector_init (rtx target, rtx_vector_builder &v)
 	  x = copy_to_mode_reg (inner_mode, x);
 	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
 	}
-	if (!known_eq (v.full_nelts (), n_elts))
-	  aarch64_vec_duplicate (target, mode, GET_MODE (v0), n_elts);
+      if (!known_eq (v.full_nelts (), n_elts))
+	aarch64_vec_duplicate (real_target, target, real_mode,
+			       GET_MODE (v0), n_elts);
       return;
     }
 
@@ -22048,7 +22066,7 @@ aarch64_expand_vector_init (rtx target, rtx_vector_builder &v)
       emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
     }
   if (!known_eq (v.full_nelts (), n_elts))
-    aarch64_vec_duplicate (target, mode, inner_mode, n_elts);
+    aarch64_vec_duplicate (real_target, target, real_mode, inner_mode, n_elts);
 }
 
 /* Emit RTL corresponding to:
@@ -23947,11 +23965,7 @@ aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
   if (BYTES_BIG_ENDIAN
       || !d->one_vector_p
       || d->vec_flags != VEC_SVE_DATA
-      || d->op_vec_flags != VEC_ADVSIMD
-      || d->perm.encoding ().nelts_per_pattern () != 1
-      || !known_eq (d->perm.encoding ().npatterns (),
-		    GET_MODE_NUNITS (d->op_mode))
-      || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128))
+      || d->perm.encoding ().nelts_per_pattern () != 1)
     return false;
 
   int npatterns = d->perm.encoding ().npatterns ();
@@ -23962,7 +23976,10 @@ aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
   if (d->testing_p)
     return true;
 
-  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
+  machine_mode mode = GET_MODE (d->target);
+  machine_mode element_mode = GET_MODE_INNER (mode);
+  aarch64_vec_duplicate (d->target, d->op0, mode, element_mode,
+			 d->perm.encoding ().npatterns ());
   return true;
 }
 
@@ -24194,6 +24211,15 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
   return ret;
 }
 
+/* Implement TARGET_VECTORIZE_VLA_CONSTRUCTOR.  */
+
+static bool
+aarch64_vectorize_vla_constructor (rtx target, rtx_vector_builder &builder)
+{
+  aarch64_expand_vector_init (target, builder);
+  return true;
+}
+
 /* Generate a byte permute mask for a register of mode MODE,
    which has NUNITS units.  */
 
@@ -27667,6 +27693,10 @@ aarch64_libgcc_floating_mode_supported_p
 #define TARGET_VECTORIZE_VEC_PERM_CONST \
   aarch64_vectorize_vec_perm_const
 
+#undef TARGET_VECTORIZE_VLA_CONSTRUCTOR
+#define TARGET_VECTORIZE_VLA_CONSTRUCTOR \
+  aarch64_vectorize_vla_constructor
+
 #undef TARGET_VECTORIZE_RELATED_MODE
 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
 #undef TARGET_VECTORIZE_GET_MASK_MODE
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index b0ea39884aa3ced5c0ccc1e792088aa66997ec3b..eda3f014984f62d96d7fe0b3c0c439905375f25a 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6112,6 +6112,11 @@ instruction pattern.  There is no need for the hook to handle these two
 implementation approaches itself.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_VLA_CONSTRUCTOR (rtx @var{target}, rtx_vector_builder @var{&builder})
+This hook is used to expand a variable-length (VLA) vector constructor
+into @var{target} from the elements in @var{builder}.  Return true on success.
+@end deftypefn
+
 @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
 This hook should return the decl of a function that implements the
 vectorized variant of the function with the @code{combined_fn} code
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index f869ddd5e5b8b7acbd8e9765fb103af24a1085b6..07f4f77877b18a23f6fd205a8dd8daf1a03c2923 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4164,6 +4164,8 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_VEC_PERM_CONST
 
+@hook TARGET_VECTORIZE_VLA_CONSTRUCTOR
+
 @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
 
 @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION
diff --git a/gcc/expr.cc b/gcc/expr.cc
index f9753d48245d56039206647be8576246a3b25ed3..b9eb550cac4c68464c95cffa8da19b3984b80782 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -10264,6 +10264,43 @@ expand_expr_real_2 (sepops ops, rtx target, machine_mode tmode,
 
     case VEC_PERM_EXPR:
       {
+	if (TREE_CODE (treeop2) == VECTOR_CST
+	    && targetm.vectorize.vla_constructor)
+	  {
+	    tree ctor0, ctor1;
+	    if (TREE_CODE (treeop0) == SSA_NAME
+		&& is_gimple_assign (SSA_NAME_DEF_STMT (treeop0)))
+	      ctor0 = gimple_assign_rhs1 (SSA_NAME_DEF_STMT (treeop0));
+	    else
+	      ctor0 = treeop0;
+	    if (TREE_CODE (treeop1) == SSA_NAME
+		&& is_gimple_assign (SSA_NAME_DEF_STMT (treeop1)))
+	      ctor1 = gimple_assign_rhs1 (SSA_NAME_DEF_STMT (treeop1));
+	    else
+	      ctor1 = treeop1;
+
+	    if (TREE_CODE (ctor0) == CONSTRUCTOR
+		&& TREE_CODE (ctor1) == CONSTRUCTOR)
+	      {
+		unsigned int nelts = vector_cst_encoded_nelts (treeop2);
+		unsigned int ctor_nelts = CONSTRUCTOR_NELTS (ctor0);
+		machine_mode mode = GET_MODE (target);
+		rtx_vector_builder builder (mode, nelts, 1);
+		for (unsigned int i = 0; i < nelts; ++i)
+		  {
+		    unsigned HOST_WIDE_INT index
+		      = tree_to_uhwi (VECTOR_CST_ENCODED_ELT (treeop2, i));
+		    tree op
+		      = index >= ctor_nelts
+			? CONSTRUCTOR_ELT (ctor1, index - ctor_nelts)->value
+			: CONSTRUCTOR_ELT (ctor0, index)->value;
+		    builder.quick_push (expand_normal (op));
+		  }
+		builder.finalize ();
+		if (targetm.vectorize.vla_constructor (target, builder))
+		  return target;
+	      }
+	  }
 	expand_operands (treeop0, treeop1, target, &op0, &op1, EXPAND_NORMAL);
 	vec_perm_builder sel;
 	if (TREE_CODE (treeop2) == VECTOR_CST
diff --git a/gcc/target.def b/gcc/target.def
index 2a7fa68f83dd15dcdd2c332e8431e6142ec7d305..3c219b6a90d9cc1a6393a3ebc24e54fcf14c6377 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1902,6 +1902,13 @@ implementation approaches itself.",
 	const vec_perm_indices &sel),
  NULL)
 
+DEFHOOK
+(vla_constructor,
+ "This hook is used to expand a variable-length (VLA) vector constructor\n\
+into @var{target} from the elements in @var{builder}.  Return true on success.",
+ bool, (rtx target, rtx_vector_builder &builder),
+ NULL)
+
 /* Return true if the target supports misaligned store/load of a
    specific factor denoted in the third parameter.  The last parameter
    is true if the access is defined in a packed struct.  */
diff --git a/gcc/target.h b/gcc/target.h
index d6fa6931499d15edff3e5af3e429540d001c7058..b46b8f0d7a9c52f6efe6acf10f589703cec3bd08 100644
--- a/gcc/target.h
+++ b/gcc/target.h
@@ -262,6 +262,8 @@ enum poly_value_estimate_kind
 extern bool verify_type_context (location_t, type_context_kind, const_tree,
 				 bool = false);
 
+class rtx_vector_builder;
+
 /* The target structure.  This holds all the backend hooks.  */
 #define DEFHOOKPOD(NAME, DOC, TYPE, INIT) TYPE NAME;
 #define DEFHOOK(NAME, DOC, TYPE, PARAMS, INIT) TYPE (* NAME) PARAMS;
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_opt_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_opt_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..01f652931555534f43e0487766c568c72a5df686
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_opt_1.c
@@ -0,0 +1,135 @@
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+#include <arm_sve.h>
+
+/*
+** test0:
+**	ins	v0.s\[1\], v1.s\[0\]
+**	mov	z0.d, d0
+**	ret
+*/
+svfloat32_t test0(float x, float y) {
+    return svdupq_n_f32(x, y, x, y);
+}
+/*
+** test1:
+**	mov	z0.s, s0
+**	ret
+*/
+
+svfloat32_t test1(float x) {
+    return svdupq_n_f32(x, x, x, x);
+}
+
+/*
+** test2:
+**	mov	z0.s, w0
+**	ret
+*/
+
+svint32_t test2(int x) {
+    return svdupq_n_s32(x, x, x, x);
+}
+
+/*
+** test3:
+**	sxth	w0, w0
+**	fmov	d0, x0
+**	ins	v0.h\[1\], w1
+**	ins	v0.h\[2\], w2
+**	ins	v0.h\[3\], w3
+**	mov	z0.d, d0
+**	ret
+*/
+
+svint16_t test3(short a, short b, short c, short d)
+{
+    return svdupq_n_s16(a, b, c, d, a, b, c, d);
+}
+
+/*
+** test4:
+**	dup	v0.4h, w0
+**	ins	v0.h\[1\], w1
+**	ins	v0.h\[3\], w1
+**	mov	z0.d, d0
+**	ret
+*/
+
+svint16_t test4(short a, short b)
+{
+    return svdupq_n_s16(a, b, a, b, a, b, a, b);
+}
+
+/*
+** test5:
+**	mov	z0.h, w0
+**	ret
+*/
+
+svint16_t test5(short a)
+{
+    return svdupq_n_s16(a, a, a, a, a, a, a, a);
+}
+/*
+** test6:
+**	sxtb	w0, w0
+**	fmov	d0, x0
+**	ins	v0.b\[1\], w1
+**	ins	v0.b\[2\], w2
+**	ins	v0.b\[3\], w3
+**	ins	v0.b\[4\], w4
+**	ins	v0.b\[5\], w5
+**	ins	v0.b\[6\], w6
+**	ins	v0.b\[7\], w7
+**	mov	z0.d, d0
+**	ret
+*/
+
+svint8_t test6(char a, char b, char c, char d, char e, char f, char g, char h)
+{
+    return svdupq_n_s8(a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h);
+}
+
+/*
+** test7:
+**	dup	v0.8b, w0
+**	ins	v0.b\[1\], w1
+**	ins	v0.b\[2\], w2
+**	ins	v0.b\[3\], w3
+**	mov	z0.s, s0
+**	ret
+*/
+
+svint8_t test7(char a, char b, char c, char d)
+{
+    return svdupq_n_s8(a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d);
+}
+
+
+// We can do better than this
+/*
+** test8:
+**	sxtb	w0, w0
+**	fmov	d0, x0
+**	ins	v0.d\[1\], x1
+**	ins	v0.b\[1\], w1
+**	mov	z0.h, h0
+**	ret
+*/
+
+svint8_t test8(char a, char b)
+{
+    return svdupq_n_s8(a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b);
+}
+
+/*
+** test9:
+**	mov	z0.b, w0
+**	ret
+*/
+
+svint8_t test9(char a)
+{
+    return svdupq_n_s8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a);
+}
diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
index 350129555a0c71c0896c4f1003163f3b3557c11b..eaae1eefe02af3f51073310e7d17c33286b2bead 100644
--- a/gcc/tree-vect-generic.cc
+++ b/gcc/tree-vect-generic.cc
@@ -1513,6 +1513,11 @@ lower_vec_perm (gimple_stmt_iterator *gsi)
   if (!TYPE_VECTOR_SUBPARTS (vect_type).is_constant (&elements))
     return;
 
+  /* A VEC_PERM_EXPR can have a VLA mask with VLS CONSTRUCTOR operands;
+     the result then has a VLA type, which we cannot lower here.  */
+  if (!TYPE_VECTOR_SUBPARTS (mask_type).is_constant ())
+    return;
+
   if (TREE_CODE (mask) == SSA_NAME)
     {
       gimple *def_stmt = SSA_NAME_DEF_STMT (mask);

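For anyone who wants to prototype the hook on another target, the wiring
is just the DEFHOOK above plus an expander that consumes the
rtx_vector_builder.  A rough sketch for a hypothetical target "foo"
(names are illustrative; only aarch64 is implemented in this patch, and
returning false makes expand_expr_real_2 fall back to the generic
VEC_PERM_EXPR expansion):

    /* In foo.cc -- illustrative only, not part of the patch.  */
    static bool
    foo_vectorize_vla_constructor (rtx target, rtx_vector_builder &builder)
    {
      /* Punt on encodings we cannot broadcast cheaply; the middle end
	 then expands the VEC_PERM_EXPR as before.  */
      if (builder.nelts_per_pattern () != 1)
	return false;
      /* foo_expand_vector_init is assumed to exist, a la aarch64's.  */
      foo_expand_vector_init (target, builder);
      return true;
    }

    #undef TARGET_VECTORIZE_VLA_CONSTRUCTOR
    #define TARGET_VECTORIZE_VLA_CONSTRUCTOR foo_vectorize_vla_constructor
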

Thread overview: 7+ messages
2022-08-05 12:50 [PATCH 0/4] aarch64: Improve codegen for dups and constructors Andre Vieira (lists)
2022-08-05 12:53 ` [PATCH 1/4] aarch64: encourage use of GPR input for SIMD inserts Andre Vieira (lists)
2022-08-05 12:55 ` [PATCH 2/4]aarch64: Change aarch64_expand_vector_init to use rtx_vector_builder Andre Vieira (lists)
2022-08-05 12:56 ` [PATCH 3/4] match.pd: Teach forwprop to handle VLA VEC_PERM_EXPRs with VLS CONSTRUCTORs as arguments Andre Vieira (lists)
2022-08-05 14:53   ` Prathamesh Kulkarni
2022-08-05 12:58 ` Andre Vieira (lists) [this message]
2022-08-08 12:12   ` [PATCH 4/4][RFC] VLA Constructor Richard Biener
