* [gcc r13-1055] PR96463: Optimise svld1rq from vectors for little endian AArch64 targets.
From: Prathamesh Kulkarni @ 2022-06-12  3:25 UTC
  To: gcc-cvs

https://gcc.gnu.org/g:494bec025002df422f2faa947138bf3643d80b54

commit r13-1055-g494bec025002df422f2faa947138bf3643d80b54
Author: Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
Date:   Sun Jun 12 08:50:16 2022 +0530

    PR96463: Optimise svld1rq from vectors for little endian AArch64 targets.
    
    The patch folds:
    lhs = svld1rq({-1, -1, ...}, rhs)
    into:
    tmp = mem_ref<vectype> [(elem_type * {ref-all}) rhs]
    lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>,
    which is then expanded using aarch64_expand_sve_dupq.
    
    Example:
    
    svint32_t
    foo (int32x4_t x)
    {
      return svld1rq (svptrue_b8 (), &x[0]);
    }
    
    code-gen:
    foo:
    .LFB4350:
            dup     z0.q, z0.q[0]
            ret
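    
    On a little-endian target, the folded GIMPLE for foo is expected to look
    roughly as follows (a sketch only; the SSA names and exact MEM_REF form
    are illustrative, not taken from an actual dump):
    
      _1 = MEM <int32x4_t> [(int32_t * {ref-all}) &x];
      _2 = VEC_PERM_EXPR <_1, _1, { 0, 1, 2, 3, ... }>;
      return _2;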
    
    The patch also relaxes type checking for VEC_PERM_EXPR by allowing the lhs
    and rhs to have different vector types, provided that:
    (1) rhs3 is constant and has an integer element type.
    (2) len(lhs) == len(rhs3) and len(rhs1) == len(rhs2).
    (3) lhs, rhs1 and rhs2 have the same element type.
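    
    For instance, in the foo example above the folded VEC_PERM_EXPR would
    have approximately:
      lhs  : svint32_t (VNx4SI mode), element type int32_t
      rhs1 : int32x4_t (V4SI mode),   element type int32_t
      rhs2 : int32x4_t (V4SI mode),   element type int32_t
      rhs3 : constant { 0, 1, 2, 3, ... } with ssizetype elements and the
             same number of elements as the lhs
    so all three conditions are satisfied even though the lhs and rhs vector
    types differ.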
    
    gcc/ChangeLog:
            PR target/96463
            * config/aarch64/aarch64-sve-builtins-base.cc: Include ssa.h.
            (svld1rq_impl::fold): Define.
            * config/aarch64/aarch64.cc (expand_vec_perm_d): Define new members
            op_mode and op_vec_flags.
            (aarch64_evpc_reencode): Initialize newd.op_mode and
            newd.op_vec_flags.
            (aarch64_evpc_sve_dup): New function.
            (aarch64_expand_vec_perm_const_1): Gate existing calls to
            aarch64_evpc_* functions under d->vmode == d->op_mode,
            and call aarch64_evpc_sve_dup.
            (aarch64_vectorize_vec_perm_const): Remove the early return for
            vmode != op_mode, and initialize d.op_mode and d.op_vec_flags.
            * tree-cfg.cc (verify_gimple_assign_ternary): Allow different
            vector types for lhs and rhs in VEC_PERM_EXPR if rhs3 is
            constant.
    
    gcc/testsuite/ChangeLog:
            PR target/96463
            * gcc.target/aarch64/sve/acle/general/pr96463-1.c: New test.
            * gcc.target/aarch64/sve/acle/general/pr96463-2.c: Likewise.

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins-base.cc    | 59 ++++++++++++++
 gcc/config/aarch64/aarch64.cc                      | 95 ++++++++++++++++------
 .../aarch64/sve/acle/general/pr96463-1.c           | 29 +++++++
 .../aarch64/sve/acle/general/pr96463-2.c           | 29 +++++++
 gcc/tree-cfg.cc                                    | 40 ++++++---
 5 files changed, 212 insertions(+), 40 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index bee410929bd..82f9eba5c39 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -44,6 +44,7 @@
 #include "aarch64-sve-builtins-shapes.h"
 #include "aarch64-sve-builtins-base.h"
 #include "aarch64-sve-builtins-functions.h"
+#include "ssa.h"
 
 using namespace aarch64_sve;
 
@@ -1207,6 +1208,64 @@ public:
     insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
     return e.use_contiguous_load_insn (icode);
   }
+
+  gimple *
+  fold (gimple_folder &f) const override
+  {
+    tree arg0 = gimple_call_arg (f.call, 0);
+    tree arg1 = gimple_call_arg (f.call, 1);
+
+    /* Transform:
+       lhs = svld1rq ({-1, -1, ... }, arg1)
+       into:
+       tmp = mem_ref<vectype> [(elem * {ref-all}) arg1]
+       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
+       on little endian target.
+       vectype is the corresponding ADVSIMD type.  */
+
+    if (!BYTES_BIG_ENDIAN
+	&& integer_all_onesp (arg0))
+      {
+	tree lhs = gimple_call_lhs (f.call);
+	tree lhs_type = TREE_TYPE (lhs);
+	poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
+	tree eltype = TREE_TYPE (lhs_type);
+
+	scalar_mode elmode = GET_MODE_INNER (TYPE_MODE (lhs_type));
+	machine_mode vq_mode = aarch64_vq_mode (elmode).require ();
+	tree vectype = build_vector_type_for_mode (eltype, vq_mode);
+
+	tree elt_ptr_type
+	  = build_pointer_type_for_mode (eltype, VOIDmode, true);
+	tree zero = build_zero_cst (elt_ptr_type);
+
+	/* Use element type alignment.  */
+	tree access_type
+	  = build_aligned_type (vectype, TYPE_ALIGN (eltype));
+
+	tree mem_ref_lhs = make_ssa_name_fn (cfun, access_type, 0);
+	tree mem_ref_op = fold_build2 (MEM_REF, access_type, arg1, zero);
+	gimple *mem_ref_stmt
+	  = gimple_build_assign (mem_ref_lhs, mem_ref_op);
+	gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
+
+	int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant ();
+	vec_perm_builder sel (lhs_len, source_nelts, 1);
+	for (int i = 0; i < source_nelts; i++)
+	  sel.quick_push (i);
+
+	vec_perm_indices indices (sel, 1, source_nelts);
+	gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type),
+						   TYPE_MODE (access_type),
+						   indices));
+	tree mask_type = build_vector_type (ssizetype, lhs_len);
+	tree mask = vec_perm_indices_to_tree (mask_type, indices);
+	return gimple_build_assign (lhs, VEC_PERM_EXPR,
+				    mem_ref_lhs, mem_ref_lhs, mask);
+      }
+
+    return NULL;
+  }
 };
 
 class svld1ro_impl : public load_replicate
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 5969d1f56c2..d21e041eccb 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23342,7 +23342,9 @@ struct expand_vec_perm_d
   rtx target, op0, op1;
   vec_perm_indices perm;
   machine_mode vmode;
+  machine_mode op_mode;
   unsigned int vec_flags;
+  unsigned int op_vec_flags;
   bool one_vector_p;
   bool testing_p;
 };
@@ -23577,6 +23579,8 @@ aarch64_evpc_reencode (struct expand_vec_perm_d *d)
 
   newd.vmode = new_mode;
   newd.vec_flags = VEC_ADVSIMD;
+  newd.op_mode = newd.vmode;
+  newd.op_vec_flags = newd.vec_flags;
   newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
   newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
   newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
@@ -23891,6 +23895,33 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Try to implement D using SVE dup instruction.  */
+
+static bool
+aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
+{
+  if (BYTES_BIG_ENDIAN
+      || !d->one_vector_p
+      || d->vec_flags != VEC_SVE_DATA
+      || d->op_vec_flags != VEC_ADVSIMD
+      || d->perm.encoding ().nelts_per_pattern () != 1
+      || !known_eq (d->perm.encoding ().npatterns (),
+		    GET_MODE_NUNITS (d->op_mode))
+      || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128))
+    return false;
+
+  int npatterns = d->perm.encoding ().npatterns ();
+  for (int i = 0; i < npatterns; i++)
+    if (!known_eq (d->perm[i], i))
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
+  return true;
+}
+
 /* Try to implement D using SVE SEL instruction.  */
 
 static bool
@@ -24014,6 +24045,8 @@ aarch64_evpc_ins (struct expand_vec_perm_d *d)
 static bool
 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 {
+  gcc_assert (d->op_mode != E_VOIDmode);
+
   /* The pattern matching functions above are written to look for a small
      number to begin the sequence (0, 1, N/2).  If we begin with an index
      from the second operand, we can swap the operands.  */
@@ -24030,30 +24063,39 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
        || d->vec_flags == VEC_SVE_PRED)
       && known_gt (nelt, 1))
     {
-      if (aarch64_evpc_rev_local (d))
-	return true;
-      else if (aarch64_evpc_rev_global (d))
-	return true;
-      else if (aarch64_evpc_ext (d))
-	return true;
-      else if (aarch64_evpc_dup (d))
-	return true;
-      else if (aarch64_evpc_zip (d))
-	return true;
-      else if (aarch64_evpc_uzp (d))
-	return true;
-      else if (aarch64_evpc_trn (d))
-	return true;
-      else if (aarch64_evpc_sel (d))
-	return true;
-      else if (aarch64_evpc_ins (d))
-	return true;
-      else if (aarch64_evpc_reencode (d))
-	return true;
-      if (d->vec_flags == VEC_SVE_DATA)
-	return aarch64_evpc_sve_tbl (d);
-      else if (d->vec_flags == VEC_ADVSIMD)
-	return aarch64_evpc_tbl (d);
+      if (d->vmode == d->op_mode)
+	{
+	  if (aarch64_evpc_rev_local (d))
+	    return true;
+	  else if (aarch64_evpc_rev_global (d))
+	    return true;
+	  else if (aarch64_evpc_ext (d))
+	    return true;
+	  else if (aarch64_evpc_dup (d))
+	    return true;
+	  else if (aarch64_evpc_zip (d))
+	    return true;
+	  else if (aarch64_evpc_uzp (d))
+	    return true;
+	  else if (aarch64_evpc_trn (d))
+	    return true;
+	  else if (aarch64_evpc_sel (d))
+	    return true;
+	  else if (aarch64_evpc_ins (d))
+	    return true;
+	  else if (aarch64_evpc_reencode (d))
+	    return true;
+
+	  if (d->vec_flags == VEC_SVE_DATA)
+	    return aarch64_evpc_sve_tbl (d);
+	  else if (d->vec_flags == VEC_ADVSIMD)
+	    return aarch64_evpc_tbl (d);
+	}
+      else
+	{
+	  if (aarch64_evpc_sve_dup (d))
+	    return true;
+	}
     }
   return false;
 }
@@ -24065,9 +24107,6 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
 				  rtx target, rtx op0, rtx op1,
 				  const vec_perm_indices &sel)
 {
-  if (vmode != op_mode)
-    return false;
-
   struct expand_vec_perm_d d;
 
   /* Check whether the mask can be applied to a single vector.  */
@@ -24091,6 +24130,8 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
 		     sel.nelts_per_input ());
   d.vmode = vmode;
   d.vec_flags = aarch64_classify_vector_mode (d.vmode);
+  d.op_mode = op_mode;
+  d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
   d.target = target;
   d.op0 = op0 ? force_reg (vmode, op0) : NULL_RTX;
   if (op0 == op1)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c
new file mode 100644
index 00000000000..b68f43cc3ba
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include "arm_neon.h"
+#include "arm_sve.h"
+
+#define TEST(ret_type, param_type, suffix) \
+ret_type test_##suffix(param_type x) \
+{ \
+  return svld1rq_##suffix (svptrue_b8 (), &x[0]); \
+}
+
+TEST(svint8_t, int8x16_t, s8)
+TEST(svint16_t, int16x8_t, s16)
+TEST(svint32_t, int32x4_t, s32)
+TEST(svint64_t, int64x2_t, s64)
+
+TEST(svuint8_t, uint8x16_t, u8)
+TEST(svuint16_t, uint16x8_t, u16)
+TEST(svuint32_t, uint32x4_t, u32)
+TEST(svuint64_t, uint64x2_t, u64)
+
+TEST(svfloat16_t, float16x8_t, f16)
+TEST(svfloat32_t, float32x4_t, f32)
+TEST(svfloat64_t, float64x2_t, f64)
+
+TEST(svbfloat16_t, bfloat16x8_t, bf16)
+
+/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 12 { target aarch64_little_endian } } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c
new file mode 100644
index 00000000000..196de3f5e0a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include "arm_neon.h"
+#include "arm_sve.h"
+
+#define TEST(ret_type, param_type, suffix) \
+ret_type test_##suffix(param_type *x) \
+{ \
+  return svld1rq_##suffix (svptrue_b8 (), &x[0]); \
+}
+
+TEST(svint8_t, int8_t, s8)
+TEST(svint16_t, int16_t, s16)
+TEST(svint32_t, int32_t, s32)
+TEST(svint64_t, int64_t, s64)
+
+TEST(svuint8_t, uint8_t, u8)
+TEST(svuint16_t, uint16_t, u16)
+TEST(svuint32_t, uint32_t, u32)
+TEST(svuint64_t, uint64_t, u64)
+
+TEST(svfloat16_t, float16_t, f16)
+TEST(svfloat32_t, float32_t, f32)
+TEST(svfloat64_t, float64_t, f64)
+
+TEST(svbfloat16_t, bfloat16_t, bf16)
+
+/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 12 { target aarch64_little_endian } } } */
diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc
index 8de1b144a42..9e5d84a9805 100644
--- a/gcc/tree-cfg.cc
+++ b/gcc/tree-cfg.cc
@@ -4297,18 +4297,14 @@ verify_gimple_assign_ternary (gassign *stmt)
       break;
 
     case VEC_PERM_EXPR:
-      if (!useless_type_conversion_p (lhs_type, rhs1_type)
-	  || !useless_type_conversion_p (lhs_type, rhs2_type))
-	{
-	  error ("type mismatch in %qs", code_name);
-	  debug_generic_expr (lhs_type);
-	  debug_generic_expr (rhs1_type);
-	  debug_generic_expr (rhs2_type);
-	  debug_generic_expr (rhs3_type);
-	  return true;
-	}
+      /* If permute is constant, then we allow for lhs and rhs
+	 to have different vector types, provided:
+	 (1) lhs, rhs1, rhs2 have same element type.
+	 (2) rhs3 vector is constant and has integer element type.
+	 (3) len(lhs) == len(rhs3) && len(rhs1) == len(rhs2).  */
 
-      if (TREE_CODE (rhs1_type) != VECTOR_TYPE
+      if (TREE_CODE (lhs_type) != VECTOR_TYPE
+	  || TREE_CODE (rhs1_type) != VECTOR_TYPE
 	  || TREE_CODE (rhs2_type) != VECTOR_TYPE
 	  || TREE_CODE (rhs3_type) != VECTOR_TYPE)
 	{
@@ -4320,10 +4316,28 @@ verify_gimple_assign_ternary (gassign *stmt)
 	  return true;
 	}
 
+      /* If rhs3 is constant, we allow lhs, rhs1 and rhs2 to be different vector types,
+	 as long as lhs, rhs1 and rhs2 have same element type.  */
+      if (TREE_CONSTANT (rhs3)
+	  ? (!useless_type_conversion_p (TREE_TYPE (lhs_type), TREE_TYPE (rhs1_type))
+	     || !useless_type_conversion_p (TREE_TYPE (lhs_type), TREE_TYPE (rhs2_type)))
+	  : (!useless_type_conversion_p (lhs_type, rhs1_type)
+	     || !useless_type_conversion_p (lhs_type, rhs2_type)))
+	{
+	    error ("type mismatch in %qs", code_name);
+	    debug_generic_expr (lhs_type);
+	    debug_generic_expr (rhs1_type);
+	    debug_generic_expr (rhs2_type);
+	    debug_generic_expr (rhs3_type);
+	    return true;
+	}
+
+      /* If rhs3 is constant, relax the check len(rhs2) == len(rhs3).  */
       if (maybe_ne (TYPE_VECTOR_SUBPARTS (rhs1_type),
 		    TYPE_VECTOR_SUBPARTS (rhs2_type))
-	  || maybe_ne (TYPE_VECTOR_SUBPARTS (rhs2_type),
-		       TYPE_VECTOR_SUBPARTS (rhs3_type))
+	  || (!TREE_CONSTANT(rhs3)
+	      && maybe_ne (TYPE_VECTOR_SUBPARTS (rhs2_type),
+			   TYPE_VECTOR_SUBPARTS (rhs3_type)))
 	  || maybe_ne (TYPE_VECTOR_SUBPARTS (rhs3_type),
 		       TYPE_VECTOR_SUBPARTS (lhs_type)))
 	{

