public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH 1/3] arm: Add movmisalign patterns for MVE (PR target/97875)
@ 2020-12-17 17:48 Christophe Lyon
  2020-12-17 17:48 ` [PATCH 2/3] arm: Auto-vectorization for MVE: vshl Christophe Lyon
                   ` (3 more replies)
  0 siblings, 4 replies; 16+ messages in thread
From: Christophe Lyon @ 2020-12-17 17:48 UTC (permalink / raw)
  To: gcc-patches

This patch adds new movmisalign<mode>_mve_load and store patterns for
MVE to help vectorization. They are very similar to their Neon
counterparts, but use different iterators and instructions.

Indeed MVE supports fewer vector modes than Neon, so we use
the MVE_VLD_ST iterator where Neon uses VQX.

Since the supported modes are different from the ones valid for
arithmetic operators, we introduce two new sets of macros:

ARM_HAVE_NEON_<MODE>_LDST
  true if Neon has vector load/store instructions for <MODE>

ARM_HAVE_<MODE>_LDST
  true if any vector extension has vector load/store instructions for <MODE>

We move the movmisalign<mode> expander from neon.md to vec-common.md, and
replace the TARGET_NEON enabler with ARM_HAVE_<MODE>_LDST.

The patch also updates the mve-vneg.c test to scan for the better code
generation when loading and storing the vectors involved: it checks
that no 'orr' instruction is generated to cope with misalignment at
runtime.
This test was chosen among the other mve tests, but any other should
be OK. Using a plain vector copy loop (dest[i] = a[i]) is not a good
test because the compiler chooses to use memcpy.

For instance we now generate:
test_vneg_s32x4:
	vldrw.32       q3, [r1]
	vneg.s32  q3, q3
	vstrw.32       q3, [r0]
	bx      lr

instead of:
test_vneg_s32x4:
	orr     r3, r1, r0
	lsls    r3, r3, #28
	bne     .L15
	vldrw.32	q3, [r1]
	vneg.s32  q3, q3
	vstrw.32	q3, [r0]
	bx      lr
	.L15:
	push    {r4, r5}
	ldrd    r2, r3, [r1, #8]
	ldrd    r5, r4, [r1]
	rsbs    r2, r2, #0
	rsbs    r5, r5, #0
	rsbs    r4, r4, #0
	rsbs    r3, r3, #0
	strd    r5, r4, [r0]
	pop     {r4, r5}
	strd    r2, r3, [r0, #8]
	bx      lr

2020-12-15  Christophe Lyon  <christophe.lyon@linaro.org>

	PR target/97875
	gcc/
	* config/arm/arm.h (ARM_HAVE_NEON_V8QI_LDST): New macro.
	(ARM_HAVE_NEON_V16QI_LDST, ARM_HAVE_NEON_V4HI_LDST): Likewise.
	(ARM_HAVE_NEON_V8HI_LDST, ARM_HAVE_NEON_V2SI_LDST): Likewise.
	(ARM_HAVE_NEON_V4SI_LDST, ARM_HAVE_NEON_V4HF_LDST): Likewise.
	(ARM_HAVE_NEON_V8HF_LDST, ARM_HAVE_NEON_V4BF_LDST): Likewise.
	(ARM_HAVE_NEON_V8BF_LDST, ARM_HAVE_NEON_V2SF_LDST): Likewise.
	(ARM_HAVE_NEON_V4SF_LDST, ARM_HAVE_NEON_DI_LDST): Likewise.
	(ARM_HAVE_NEON_V2DI_LDST): Likewise.
	(ARM_HAVE_V8QI_LDST, ARM_HAVE_V16QI_LDST): Likewise.
	(ARM_HAVE_V4HI_LDST, ARM_HAVE_V8HI_LDST): Likewise.
	(ARM_HAVE_V2SI_LDST, ARM_HAVE_V4SI_LDST, ARM_HAVE_V4HF_LDST): Likewise.
	(ARM_HAVE_V8HF_LDST, ARM_HAVE_V4BF_LDST, ARM_HAVE_V8BF_LDST): Likewise.
	(ARM_HAVE_V2SF_LDST, ARM_HAVE_V4SF_LDST, ARM_HAVE_DI_LDST): Likewise.
	(ARM_HAVE_V2DI_LDST): Likewise.
	* config/arm/mve.md (*movmisalign<mode>_mve_store): New pattern.
	(*movmisalign<mode>_mve_load): New pattern.
	* config/arm/neon.md (movmisalign<mode>): Move to ...
	* config/arm/vec-common.md: ... here.

	PR target/97875
	gcc/testsuite/
	* gcc.target/arm/simd/mve-vneg.c: Update test.
---
 gcc/config/arm/arm.h                         | 40 ++++++++++++++++++++++++++++
 gcc/config/arm/mve.md                        | 25 +++++++++++++++++
 gcc/config/arm/neon.md                       | 25 -----------------
 gcc/config/arm/vec-common.md                 | 24 +++++++++++++++++
 gcc/testsuite/gcc.target/arm/simd/mve-vneg.c |  3 +++
 5 files changed, 92 insertions(+), 25 deletions(-)

diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
index 4a63d33..d44e0c6 100644
--- a/gcc/config/arm/arm.h
+++ b/gcc/config/arm/arm.h
@@ -1151,6 +1151,46 @@ extern const int arm_arch_cde_coproc_bits[];
 #define ARM_HAVE_V8HF_ARITH (ARM_HAVE_NEON_V8HF_ARITH || TARGET_HAVE_MVE_FLOAT)
 #define ARM_HAVE_V4SF_ARITH (ARM_HAVE_NEON_V4SF_ARITH || TARGET_HAVE_MVE_FLOAT)
 
+/* The conditions under which vector modes are supported by load/store
+   instructions using Neon.  */
+
+#define ARM_HAVE_NEON_V8QI_LDST TARGET_NEON
+#define ARM_HAVE_NEON_V16QI_LDST TARGET_NEON
+#define ARM_HAVE_NEON_V4HI_LDST TARGET_NEON
+#define ARM_HAVE_NEON_V8HI_LDST TARGET_NEON
+#define ARM_HAVE_NEON_V2SI_LDST TARGET_NEON
+#define ARM_HAVE_NEON_V4SI_LDST TARGET_NEON
+#define ARM_HAVE_NEON_V4HF_LDST TARGET_NEON_FP16INST
+#define ARM_HAVE_NEON_V8HF_LDST TARGET_NEON_FP16INST
+#define ARM_HAVE_NEON_V4BF_LDST TARGET_BF16_SIMD
+#define ARM_HAVE_NEON_V8BF_LDST TARGET_BF16_SIMD
+#define ARM_HAVE_NEON_V2SF_LDST TARGET_NEON
+#define ARM_HAVE_NEON_V4SF_LDST TARGET_NEON
+#define ARM_HAVE_NEON_DI_LDST TARGET_NEON
+#define ARM_HAVE_NEON_V2DI_LDST TARGET_NEON
+
+/* The conditions under which vector modes are supported by load/store
+   instructions by any vector extension.  */
+
+#define ARM_HAVE_V8QI_LDST (ARM_HAVE_NEON_V8QI_LDST || TARGET_REALLY_IWMMXT)
+#define ARM_HAVE_V4HI_LDST (ARM_HAVE_NEON_V4HI_LDST || TARGET_REALLY_IWMMXT)
+#define ARM_HAVE_V2SI_LDST (ARM_HAVE_NEON_V2SI_LDST || TARGET_REALLY_IWMMXT)
+
+#define ARM_HAVE_V16QI_LDST (ARM_HAVE_NEON_V16QI_LDST || TARGET_HAVE_MVE)
+#define ARM_HAVE_V8HI_LDST (ARM_HAVE_NEON_V8HI_LDST || TARGET_HAVE_MVE)
+#define ARM_HAVE_V4SI_LDST (ARM_HAVE_NEON_V4SI_LDST || TARGET_HAVE_MVE)
+#define ARM_HAVE_DI_LDST ARM_HAVE_NEON_DI_LDST
+#define ARM_HAVE_V2DI_LDST ARM_HAVE_NEON_V2DI_LDST
+
+#define ARM_HAVE_V4HF_LDST ARM_HAVE_NEON_V4HF_LDST
+#define ARM_HAVE_V2SF_LDST ARM_HAVE_NEON_V2SF_LDST
+
+#define ARM_HAVE_V4BF_LDST ARM_HAVE_NEON_V4BF_LDST
+#define ARM_HAVE_V8BF_LDST ARM_HAVE_NEON_V8BF_LDST
+
+#define ARM_HAVE_V8HF_LDST (ARM_HAVE_NEON_V8HF_LDST || TARGET_HAVE_MVE_FLOAT)
+#define ARM_HAVE_V4SF_LDST (ARM_HAVE_NEON_V4SF_LDST || TARGET_HAVE_MVE_FLOAT)
+
 /* The register numbers in sequence, for passing to arm_gen_load_multiple.  */
 extern int arm_regs_in_sequence[];
 
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index b4c5a1e2..673a83c 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -10937,3 +10937,28 @@ (define_insn "arm_vcx3q<a>_p_v16qi"
   [(set_attr "type" "coproc")
    (set_attr "length" "8")]
 )
+
+(define_insn "*movmisalign<mode>_mve_store"
+  [(set (match_operand:MVE_VLD_ST 0 "neon_permissive_struct_operand"	     "=Um")
+	(unspec:MVE_VLD_ST [(match_operand:MVE_VLD_ST 1 "s_register_operand" " w")]
+	 UNSPEC_MISALIGNED_ACCESS))]
+  "(TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode))
+   || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE (<MODE>mode))
+   && !BYTES_BIG_ENDIAN && unaligned_access"
+  "vstr<V_sz_elem1>.<V_sz_elem>\t%q1, %E0"
+  [(set_attr "type" "mve_store")
+   (set_attr "length" "4")]
+)
+
+
+(define_insn "*movmisalign<mode>_mve_load"
+  [(set (match_operand:MVE_VLD_ST 0 "s_register_operand"				 "=w")
+	(unspec:MVE_VLD_ST [(match_operand:MVE_VLD_ST 1 "neon_permissive_struct_operand" " Um")]
+	 UNSPEC_MISALIGNED_ACCESS))]
+  "(TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode))
+   || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE (<MODE>mode))
+   && !BYTES_BIG_ENDIAN && unaligned_access"
+  "vldr<V_sz_elem1>.<V_sz_elem>\t%q0, %E1"
+  [(set_attr "type" "mve_load")
+   (set_attr "length" "4")]
+)
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index d2e92ba..50220be 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -280,31 +280,6 @@ (define_split
   neon_disambiguate_copy (operands, dest, src, 4);
 })
 
-(define_expand "movmisalign<mode>"
-  [(set (match_operand:VDQX 0 "neon_perm_struct_or_reg_operand")
-	(unspec:VDQX [(match_operand:VDQX 1 "neon_perm_struct_or_reg_operand")]
-		     UNSPEC_MISALIGNED_ACCESS))]
-  "TARGET_NEON && !BYTES_BIG_ENDIAN && unaligned_access"
-{
-  rtx adjust_mem;
-  /* This pattern is not permitted to fail during expansion: if both arguments
-     are non-registers (e.g. memory := constant, which can be created by the
-     auto-vectorizer), force operand 1 into a register.  */
-  if (!s_register_operand (operands[0], <MODE>mode)
-      && !s_register_operand (operands[1], <MODE>mode))
-    operands[1] = force_reg (<MODE>mode, operands[1]);
-
-  if (s_register_operand (operands[0], <MODE>mode))
-    adjust_mem = operands[1];
-  else
-    adjust_mem = operands[0];
-
-  /* Legitimize address.  */
-  if (!neon_vector_mem_operand (adjust_mem, 2, true))
-    XEXP (adjust_mem, 0) = force_reg (Pmode, XEXP (adjust_mem, 0));
-
-})
-
 (define_insn "*movmisalign<mode>_neon_store"
   [(set (match_operand:VDX 0 "neon_permissive_struct_operand"	"=Um")
 	(unspec:VDX [(match_operand:VDX 1 "s_register_operand" " w")]
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index 2d0932b..f6a79e2 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -205,3 +205,27 @@ (define_expand "neg<mode>2"
 	(neg:VDQWH (match_operand:VDQWH 1 "s_register_operand" "")))]
   "ARM_HAVE_<MODE>_ARITH"
 )
+
+(define_expand "movmisalign<mode>"
+  [(set (match_operand:VDQX 0 "neon_perm_struct_or_reg_operand")
+	(unspec:VDQX [(match_operand:VDQX 1 "neon_perm_struct_or_reg_operand")]
+	 UNSPEC_MISALIGNED_ACCESS))]
+  "ARM_HAVE_<MODE>_LDST && !BYTES_BIG_ENDIAN && unaligned_access"
+{
+  rtx adjust_mem;
+  /* This pattern is not permitted to fail during expansion: if both arguments
+     are non-registers (e.g. memory := constant, which can be created by the
+     auto-vectorizer), force operand 1 into a register.  */
+  if (!s_register_operand (operands[0], <MODE>mode)
+      && !s_register_operand (operands[1], <MODE>mode))
+    operands[1] = force_reg (<MODE>mode, operands[1]);
+
+  if (s_register_operand (operands[0], <MODE>mode))
+    adjust_mem = operands[1];
+  else
+    adjust_mem = operands[0];
+
+  /* Legitimize address.  */
+  if (!neon_vector_mem_operand (adjust_mem, 2, true))
+    XEXP (adjust_mem, 0) = force_reg (Pmode, XEXP (adjust_mem, 0));
+})
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vneg.c b/gcc/testsuite/gcc.target/arm/simd/mve-vneg.c
index afd0d60..7945a06 100644
--- a/gcc/testsuite/gcc.target/arm/simd/mve-vneg.c
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-vneg.c
@@ -47,3 +47,6 @@ FUNC(f, float, 16, 8, -, vneg)
    functions above.  */
 /* { dg-final { scan-assembler-times {vneg.s[0-9]+  q[0-9]+, q[0-9]+} 6 } } */
 /* { dg-final { scan-assembler-times {vneg.f[0-9]+  q[0-9]+, q[0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {vldr[bhw].[0-9]+\tq[0-9]+} 8 } } */
+/* { dg-final { scan-assembler-times {vstr[bhw].[0-9]+\tq[0-9]+} 8 } } */
+/* { dg-final { scan-assembler-not {orr\tr[0-9]+, r[0-9]+, r[0-9]+} } } */
-- 
2.7.4


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 2/3] arm: Auto-vectorization for MVE: vshl
  2020-12-17 17:48 [PATCH 1/3] arm: Add movmisalign patterns for MVE (PR target/97875) Christophe Lyon
@ 2020-12-17 17:48 ` Christophe Lyon
  2020-12-30 10:34   ` Christophe Lyon
  2021-01-15  9:42   ` Kyrylo Tkachov
  2020-12-17 17:48 ` [PATCH 3/3] arm: Auto-vectorization for MVE: vshr Christophe Lyon
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 16+ messages in thread
From: Christophe Lyon @ 2020-12-17 17:48 UTC (permalink / raw)
  To: gcc-patches

This patch enables MVE vshlq instructions for auto-vectorization.

The existing mve_vshlq_n_<supf><mode> is kept, as it takes a single
immediate as second operand, and is used by arm_mve.h.

We move the vashl<mode>3 insn from neon.md to an expander in
vec-common.md, and the mve_vshlq_<supf><mode> insn from mve.md to
vec-common.md, adding the second alternative from neon.md.

mve_vshlq_<supf><mode> will be used by a later patch enabling
vectorization for vshr, as a unified version of
ashl<mode>3_[signed|unsigned] from neon.md. Keeping the use of unspec
VSHLQ makes it possible to generate both 's' and 'u' variants.

It is not clear whether the neon_shift_[reg|imm]<q> attribute is still
suitable, since this insn is also used for MVE.

I kept the mve_vshlq_<supf><mode> naming instead of renaming it to
ashl3_<supf>_<mode> as discussed because the reference in
arm_mve_builtins.def automatically inserts the "mve_" prefix and I
didn't want to make a special case for this.

I haven't yet found why the v16qi and v8hi tests are not vectorized.
With dest[i] = a[i] << b[i] and:
  {
    int i;
    unsigned int i.24_1;
    unsigned int _2;
    int16_t * _3;
    short int _4;
    int _5;
    int16_t * _6;
    short int _7;
    int _8;
    int _9;
    int16_t * _10;
    short int _11;
    unsigned int ivtmp_42;
    unsigned int ivtmp_43;

    <bb 2> [local count: 119292720]:

    <bb 3> [local count: 954449105]:
    i.24_1 = (unsigned int) i_23;
    _2 = i.24_1 * 2;
    _3 = a_15(D) + _2;
    _4 = *_3;
    _5 = (int) _4;
    _6 = b_16(D) + _2;
    _7 = *_6;
    _8 = (int) _7;
    _9 = _5 << _8;
    _10 = dest_17(D) + _2;
    _11 = (short int) _9;
    *_10 = _11;
    i_19 = i_23 + 1;
    ivtmp_42 = ivtmp_43 - 1;
    if (ivtmp_42 != 0)
      goto <bb 5>; [87.50%]
    else
      goto <bb 4>; [12.50%]

    <bb 5> [local count: 835156386]:
    goto <bb 3>; [100.00%]

    <bb 4> [local count: 119292720]:
    return;

  }
the vectorizer says:
mve-vshl.c:37:96: note:   ==> examining statement: _5 = (int) _4;
mve-vshl.c:37:96: note:   vect_is_simple_use: operand *_3, type of def: internal
mve-vshl.c:37:96: note:   vect_is_simple_use: vectype vector(8) short int
mve-vshl.c:37:96: missed:   conversion not supported by target.
mve-vshl.c:37:96: note:   vect_is_simple_use: operand *_3, type of def: internal
mve-vshl.c:37:96: note:   vect_is_simple_use: vectype vector(8) short int
mve-vshl.c:37:96: note:   vect_is_simple_use: operand *_3, type of def: internal
mve-vshl.c:37:96: note:   vect_is_simple_use: vectype vector(8) short int
mve-vshl.c:37:117: missed:   not vectorized: relevant stmt not supported: _5 = (int) _4;
mve-vshl.c:37:96: missed:  bad operation or unsupported loop bound.
mve-vshl.c:37:96: note:  ***** Analysis failed with vector mode V8HI

2020-12-03  Christophe Lyon  <christophe.lyon@linaro.org>

	gcc/
	* config/arm/mve.md (mve_vshlq_<supf><mode>): Move to
	vec-common.md.
	* config/arm/neon.md (vashl<mode>3): Delete.
	* config/arm/vec-common.md (mve_vshlq_<supf><mode>): New.
	(vashl<mode>3): New expander.

	gcc/testsuite/
	* gcc.target/arm/simd/mve-vshl.c: Add tests for vshl.
---
 gcc/config/arm/mve.md                        | 13 +-----
 gcc/config/arm/neon.md                       | 19 ---------
 gcc/config/arm/vec-common.md                 | 30 ++++++++++++++
 gcc/testsuite/gcc.target/arm/simd/mve-vshl.c | 62 ++++++++++++++++++++++++++++
 4 files changed, 93 insertions(+), 31 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vshl.c

diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 673a83c..8bdb451 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -822,18 +822,7 @@ (define_insn "mve_vcmpneq_<supf><mode>"
 
 ;;
 ;; [vshlq_s, vshlq_u])
-;;
-(define_insn "mve_vshlq_<supf><mode>"
-  [
-   (set (match_operand:MVE_2 0 "s_register_operand" "=w")
-	(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")
-		       (match_operand:MVE_2 2 "s_register_operand" "w")]
-	 VSHLQ))
-  ]
-  "TARGET_HAVE_MVE"
-  "vshl.<supf>%#<V_sz_elem>\t%q0, %q1, %q2"
-  [(set_attr "type" "mve_move")
-])
+;; See vec-common.md
 
 ;;
 ;; [vabdq_s, vabdq_u])
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 50220be..ac9bf74 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -845,25 +845,6 @@ (define_insn "*smax<mode>3_neon"
 ; generic vectorizer code.  It ends up creating a V2DI constructor with
 ; SImode elements.
 
-(define_insn "vashl<mode>3"
-  [(set (match_operand:VDQIW 0 "s_register_operand" "=w,w")
-	(ashift:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w,w")
-		      (match_operand:VDQIW 2 "imm_lshift_or_reg_neon" "w,Dm")))]
-  "TARGET_NEON"
-  {
-    switch (which_alternative)
-      {
-        case 0: return "vshl.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2";
-        case 1: return neon_output_shift_immediate ("vshl", 'i', &operands[2],
-                         			    <MODE>mode,
-						    VALID_NEON_QREG_MODE (<MODE>mode),
-						    true);
-        default: gcc_unreachable ();
-      }
-  }
-  [(set_attr "type" "neon_shift_reg<q>, neon_shift_imm<q>")]
-)
-
 (define_insn "vashr<mode>3_imm"
   [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
 	(ashiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w")
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index f6a79e2..3a282f0 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -229,3 +229,33 @@ (define_expand "movmisalign<mode>"
   if (!neon_vector_mem_operand (adjust_mem, 2, true))
     XEXP (adjust_mem, 0) = force_reg (Pmode, XEXP (adjust_mem, 0));
 })
+
+(define_insn "mve_vshlq_<supf><mode>"
+  [(set (match_operand:VDQIW 0 "s_register_operand" "=w,w")
+	(unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w,w")
+		       (match_operand:VDQIW 2 "imm_lshift_or_reg_neon" "w,Dm")]
+	 VSHLQ))]
+  "ARM_HAVE_<MODE>_ARITH"
+{
+  switch (which_alternative)
+    {
+      case 0: return "vshl.<supf>%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2";
+      case 1: return neon_output_shift_immediate ("vshl", 'i', &operands[2],
+						  <MODE>mode,
+						  VALID_NEON_QREG_MODE (<MODE>mode),
+						  true);
+      default: gcc_unreachable ();
+    }
+}
+  [(set_attr "type" "neon_shift_reg<q>, neon_shift_imm<q>")]
+)
+
+(define_expand "vashl<mode>3"
+  [(set (match_operand:VDQIW 0 "s_register_operand" "")
+	(ashift:VDQIW (match_operand:VDQIW 1 "s_register_operand" "")
+		      (match_operand:VDQIW 2 "imm_lshift_or_reg_neon" "")))]
+  "ARM_HAVE_<MODE>_ARITH"
+{
+  emit_insn (gen_mve_vshlq_u<mode> (operands[0], operands[1], operands[2]));
+  DONE;
+})
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c b/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
new file mode 100644
index 0000000..7a06449
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
@@ -0,0 +1,62 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O3" } */
+
+#include <stdint.h>
+
+#define FUNC(SIGN, TYPE, BITS, NB, OP, NAME)				\
+  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t * __restrict__ dest, TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \
+    int i;								\
+    for (i=0; i<NB; i++) {						\
+      dest[i] = a[i] OP b[i];						\
+    }									\
+}
+
+#define FUNC_IMM(SIGN, TYPE, BITS, NB, OP, NAME)				\
+  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t * __restrict__ dest, TYPE##BITS##_t *a) { \
+    int i;								\
+    for (i=0; i<NB; i++) {						\
+      dest[i] = a[i] OP 5;						\
+    }									\
+}
+
+/* 64-bit vectors.  */
+FUNC(s, int, 32, 2, <<, vshl)
+FUNC(u, uint, 32, 2, <<, vshl)
+FUNC(s, int, 16, 4, <<, vshl)
+FUNC(u, uint, 16, 4, <<, vshl)
+FUNC(s, int, 8, 8, <<, vshl)
+FUNC(u, uint, 8, 8, <<, vshl)
+
+/* 128-bit vectors.  */
+FUNC(s, int, 32, 4, <<, vshl)
+FUNC(u, uint, 32, 4, <<, vshl)
+FUNC(s, int, 16, 8, <<, vshl)  /* FIXME: not vectorized */
+FUNC(u, uint, 16, 8, <<, vshl) /* FIXME: not vectorized */
+FUNC(s, int, 8, 16, <<, vshl)  /* FIXME: not vectorized */
+FUNC(u, uint, 8, 16, <<, vshl) /* FIXME: not vectorized */
+
+/* 64-bit vectors.  */
+FUNC_IMM(s, int, 32, 2, <<, vshlimm)
+FUNC_IMM(u, uint, 32, 2, <<, vshlimm)
+FUNC_IMM(s, int, 16, 4, <<, vshlimm)
+FUNC_IMM(u, uint, 16, 4, <<, vshlimm)
+FUNC_IMM(s, int, 8, 8, <<, vshlimm)
+FUNC_IMM(u, uint, 8, 8, <<, vshlimm)
+
+/* 128-bit vectors.  */
+FUNC_IMM(s, int, 32, 4, <<, vshlimm)
+FUNC_IMM(u, uint, 32, 4, <<, vshlimm)
+FUNC_IMM(s, int, 16, 8, <<, vshlimm)
+FUNC_IMM(u, uint, 16, 8, <<, vshlimm)
+FUNC_IMM(s, int, 8, 16, <<, vshlimm)
+FUNC_IMM(u, uint, 8, 16, <<, vshlimm)
+
+/* MVE has only 128-bit vectors, so we can vectorize only half of the
+   functions above.  */
+/* We only emit vshl.u, which is equivalent to vshl.s anyway.  */
+/* { dg-final { scan-assembler-times {vshl.u[0-9]+\tq[0-9]+, q[0-9]+} 2 } } */
+
+/* We emit vshl.i when the shift amount is an immediate.  */
+/* { dg-final { scan-assembler-times {vshl.i[0-9]+\tq[0-9]+, q[0-9]+} 6 } } */
-- 
2.7.4


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 3/3] arm: Auto-vectorization for MVE: vshr
  2020-12-17 17:48 [PATCH 1/3] arm: Add movmisalign patterns for MVE (PR target/97875) Christophe Lyon
  2020-12-17 17:48 ` [PATCH 2/3] arm: Auto-vectorization for MVE: vshl Christophe Lyon
@ 2020-12-17 17:48 ` Christophe Lyon
  2020-12-30 10:34   ` Christophe Lyon
  2021-01-15  9:44   ` Kyrylo Tkachov
  2020-12-30 10:33 ` [PATCH 1/3] arm: Add movmisalign patterns for MVE (PR target/97875) Christophe Lyon
  2021-01-08  9:50 ` Kyrylo Tkachov
  3 siblings, 2 replies; 16+ messages in thread
From: Christophe Lyon @ 2020-12-17 17:48 UTC (permalink / raw)
  To: gcc-patches

This patch enables MVE vshr instructions for auto-vectorization.  New
MVE patterns are introduced that take a vector of constants as second
operand, all constants being equal.

The existing mve_vshrq_n_<supf><mode> is kept, as it takes a single
immediate as second operand, and is used by arm_mve.h.

The vashr<mode>3 and vlshr<mode>3 expanders are moved from neon.md to
vec-common.md, updated to rely on the normal expansion scheme to
generate shifts by immediate.

2020-12-03  Christophe Lyon  <christophe.lyon@linaro.org>

	gcc/
	* config/arm/mve.md (mve_vshrq_n_s<mode>_imm): New entry.
	(mve_vshrq_n_u<mode>_imm): Likewise.
	* config/arm/neon.md (vashr<mode>3, vlshr<mode>3): Move to ...
	* config/arm/vec-common.md: ... here.

	gcc/testsuite/
	* gcc.target/arm/simd/mve-vshr.c: Add tests for vshr.
---
 gcc/config/arm/mve.md                        | 34 ++++++++++++++++
 gcc/config/arm/neon.md                       | 34 ----------------
 gcc/config/arm/vec-common.md                 | 38 +++++++++++++++++-
 gcc/testsuite/gcc.target/arm/simd/mve-vshr.c | 59 ++++++++++++++++++++++++++++
 4 files changed, 130 insertions(+), 35 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vshr.c

diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 8bdb451..eea8b20 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -763,6 +763,7 @@ (define_insn "mve_vcreateq_<supf><mode>"
 ;;
 ;; [vshrq_n_s, vshrq_n_u])
 ;;
+;; Version that takes an immediate as operand 2.
 (define_insn "mve_vshrq_n_<supf><mode>"
   [
    (set (match_operand:MVE_2 0 "s_register_operand" "=w")
@@ -775,6 +776,39 @@ (define_insn "mve_vshrq_n_<supf><mode>"
   [(set_attr "type" "mve_move")
 ])
 
+;; Versions that take constant vectors as operand 2 (with all elements
+;; equal).
+(define_insn "mve_vshrq_n_s<mode>_imm"
+  [
+   (set (match_operand:MVE_2 0 "s_register_operand" "=w")
+	(ashiftrt:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w")
+			(match_operand:MVE_2 2 "imm_for_neon_rshift_operand" "i")))
+  ]
+  "TARGET_HAVE_MVE"
+  {
+    return neon_output_shift_immediate ("vshr", 's', &operands[2],
+					<MODE>mode,
+					VALID_NEON_QREG_MODE (<MODE>mode),
+					true);
+  }
+  [(set_attr "type" "mve_move")
+])
+(define_insn "mve_vshrq_n_u<mode>_imm"
+  [
+   (set (match_operand:MVE_2 0 "s_register_operand" "=w")
+	(lshiftrt:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w")
+			(match_operand:MVE_2 2 "imm_for_neon_rshift_operand" "i")))
+  ]
+  "TARGET_HAVE_MVE"
+  {
+    return neon_output_shift_immediate ("vshr", 'u', &operands[2],
+					<MODE>mode,
+					VALID_NEON_QREG_MODE (<MODE>mode),
+					true);
+  }
+  [(set_attr "type" "mve_move")
+])
+
 ;;
 ;; [vcvtq_n_from_f_s, vcvtq_n_from_f_u])
 ;;
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index ac9bf74..a0e8d7a 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -899,40 +899,6 @@ (define_insn "ashl<mode>3_unsigned"
   [(set_attr "type" "neon_shift_reg<q>")]
 )
 
-(define_expand "vashr<mode>3"
-  [(set (match_operand:VDQIW 0 "s_register_operand")
-	(ashiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand")
-			(match_operand:VDQIW 2 "imm_rshift_or_reg_neon")))]
-  "TARGET_NEON"
-{
-  if (s_register_operand (operands[2], <MODE>mode))
-    {
-      rtx neg = gen_reg_rtx (<MODE>mode);
-      emit_insn (gen_neon_neg<mode>2 (neg, operands[2]));
-      emit_insn (gen_ashl<mode>3_signed (operands[0], operands[1], neg));
-    }
-  else
-    emit_insn (gen_vashr<mode>3_imm (operands[0], operands[1], operands[2]));
-  DONE;
-})
-
-(define_expand "vlshr<mode>3"
-  [(set (match_operand:VDQIW 0 "s_register_operand")
-	(lshiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand")
-			(match_operand:VDQIW 2 "imm_rshift_or_reg_neon")))]
-  "TARGET_NEON"
-{
-  if (s_register_operand (operands[2], <MODE>mode))
-    {
-      rtx neg = gen_reg_rtx (<MODE>mode);
-      emit_insn (gen_neon_neg<mode>2 (neg, operands[2]));
-      emit_insn (gen_ashl<mode>3_unsigned (operands[0], operands[1], neg));
-    }
-  else
-    emit_insn (gen_vlshr<mode>3_imm (operands[0], operands[1], operands[2]));
-  DONE;
-})
-
 ;; 64-bit shifts
 
 ;; This pattern loads a 32-bit shift count into a 64-bit NEON register,
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index 3a282f0..e126557 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -258,4 +258,40 @@ (define_expand "vashl<mode>3"
 {
   emit_insn (gen_mve_vshlq_u<mode> (operands[0], operands[1], operands[2]));
   DONE;
-})
\ No newline at end of file
+})
+
+;; When operand 2 is an immediate, use the normal expansion to match
+;; gen_vashr<mode>3_imm for Neon and gen_mve_vshrq_n_s<mode>_imm for
+;; MVE.
+(define_expand "vashr<mode>3"
+  [(set (match_operand:VDQIW 0 "s_register_operand")
+	(ashiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand")
+			(match_operand:VDQIW 2 "imm_rshift_or_reg_neon")))]
+  "ARM_HAVE_<MODE>_ARITH"
+{
+  if (s_register_operand (operands[2], <MODE>mode))
+    {
+      rtx neg = gen_reg_rtx (<MODE>mode);
+      emit_insn (gen_neg<mode>2 (neg, operands[2]));
+      emit_insn (gen_mve_vshlq_s<mode> (operands[0], operands[1], neg));
+      DONE;
+    }
+})
+
+;; When operand 2 is an immediate, use the normal expansion to match
+;; gen_vlshr<mode>3_imm for Neon and gen_mve_vshrq_n_u<mode>_imm for
+;; MVE.
+(define_expand "vlshr<mode>3"
+  [(set (match_operand:VDQIW 0 "s_register_operand")
+	(lshiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand")
+			(match_operand:VDQIW 2 "imm_rshift_or_reg_neon")))]
+  "ARM_HAVE_<MODE>_ARITH"
+{
+  if (s_register_operand (operands[2], <MODE>mode))
+    {
+      rtx neg = gen_reg_rtx (<MODE>mode);
+      emit_insn (gen_neg<mode>2 (neg, operands[2]));
+      emit_insn (gen_mve_vshlq_u<mode> (operands[0], operands[1], neg));
+      DONE;
+    }
+})
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vshr.c b/gcc/testsuite/gcc.target/arm/simd/mve-vshr.c
new file mode 100644
index 0000000..d4e658c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-vshr.c
@@ -0,0 +1,59 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O3" } */
+
+#include <stdint.h>
+
+#define FUNC(SIGN, TYPE, BITS, NB, OP, NAME)				\
+  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t * __restrict__ dest, TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \
+    int i;								\
+    for (i=0; i<NB; i++) {						\
+      dest[i] = a[i] OP b[i];						\
+    }									\
+}
+
+#define FUNC_IMM(SIGN, TYPE, BITS, NB, OP, NAME)				\
+  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t * __restrict__ dest, TYPE##BITS##_t *a) { \
+    int i;								\
+    for (i=0; i<NB; i++) {						\
+      dest[i] = a[i] OP 5;						\
+    }									\
+}
+
+/* 64-bit vectors.  */
+FUNC(s, int, 32, 2, >>, vshr)
+FUNC(u, uint, 32, 2, >>, vshr)
+FUNC(s, int, 16, 4, >>, vshr)
+FUNC(u, uint, 16, 4, >>, vshr)
+FUNC(s, int, 8, 8, >>, vshr)
+FUNC(u, uint, 8, 8, >>, vshr)
+
+/* 128-bit vectors.  */
+FUNC(s, int, 32, 4, >>, vshr)
+FUNC(u, uint, 32, 4, >>, vshr)
+FUNC(s, int, 16, 8, >>, vshr)
+FUNC(u, uint, 16, 8, >>, vshr)
+FUNC(s, int, 8, 16, >>, vshr)
+FUNC(u, uint, 8, 16, >>, vshr)
+
+/* 64-bit vectors.  */
+FUNC_IMM(s, int, 32, 2, >>, vshrimm)
+FUNC_IMM(u, uint, 32, 2, >>, vshrimm)
+FUNC_IMM(s, int, 16, 4, >>, vshrimm)
+FUNC_IMM(u, uint, 16, 4, >>, vshrimm)
+FUNC_IMM(s, int, 8, 8, >>, vshrimm)
+FUNC_IMM(u, uint, 8, 8, >>, vshrimm)
+
+/* 128-bit vectors.  */
+FUNC_IMM(s, int, 32, 4, >>, vshrimm)
+FUNC_IMM(u, uint, 32, 4, >>, vshrimm)
+FUNC_IMM(s, int, 16, 8, >>, vshrimm)
+FUNC_IMM(u, uint, 16, 8, >>, vshrimm)
+FUNC_IMM(s, int, 8, 16, >>, vshrimm)
+FUNC_IMM(u, uint, 8, 16, >>, vshrimm)
+
+/* MVE has only 128-bit vectors, so we can vectorize only half of the
+   functions above.  */
+/* { dg-final { scan-assembler-times {vshr.s[0-9]+\tq[0-9]+, q[0-9]+} 3 } } */
+/* { dg-final { scan-assembler-times {vshr.u[0-9]+\tq[0-9]+, q[0-9]+} 3 } } */
-- 
2.7.4


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/3] arm: Add movmisalign patterns for MVE (PR target/97875)
  2020-12-17 17:48 [PATCH 1/3] arm: Add movmisalign patterns for MVE (PR target/97875) Christophe Lyon
  2020-12-17 17:48 ` [PATCH 2/3] arm: Auto-vectorization for MVE: vshl Christophe Lyon
  2020-12-17 17:48 ` [PATCH 3/3] arm: Auto-vectorization for MVE: vshr Christophe Lyon
@ 2020-12-30 10:33 ` Christophe Lyon
  2021-01-07 12:20   ` Christophe Lyon
  2021-01-08  9:50 ` Kyrylo Tkachov
  3 siblings, 1 reply; 16+ messages in thread
From: Christophe Lyon @ 2020-12-30 10:33 UTC (permalink / raw)
  To: gcc Patches

ping?

On Thu, 17 Dec 2020 at 18:48, Christophe Lyon
<christophe.lyon@linaro.org> wrote:
>
> This patch adds new movmisalign<mode>_mve_load and store patterns for
> MVE to help vectorization. They are very similar to their Neon
> counterparts, but use different iterators and instructions.
>
> Indeed MVE supports fewer vector modes than Neon, so we use
> the MVE_VLD_ST iterator where Neon uses VQX.
>
> Since the supported modes are different from the ones valid for
> arithmetic operators, we introduce two new sets of macros:
>
> ARM_HAVE_NEON_<MODE>_LDST
>   true if Neon has vector load/store instructions for <MODE>
>
> ARM_HAVE_<MODE>_LDST
>   true if any vector extension has vector load/store instructions for <MODE>
>
> We move the movmisalign<mode> expander from neon.md to vec-common.md, and
> replace the TARGET_NEON enabler with ARM_HAVE_<MODE>_LDST.
>
> The patch also updates the mve-vneg.c test to scan for the better code
> generation when loading and storing the vectors involved: it checks
> that no 'orr' instruction is generated to cope with misalignment at
> runtime.
> This test was chosen among the other mve tests, but any other should
> be OK. Using a plain vector copy loop (dest[i] = a[i]) is not a good
> test because the compiler chooses to use memcpy.
>
> For instance we now generate:
> test_vneg_s32x4:
>         vldrw.32       q3, [r1]
>         vneg.s32  q3, q3
>         vstrw.32       q3, [r0]
>         bx      lr
>
> instead of:
> test_vneg_s32x4:
>         orr     r3, r1, r0
>         lsls    r3, r3, #28
>         bne     .L15
>         vldrw.32        q3, [r1]
>         vneg.s32  q3, q3
>         vstrw.32        q3, [r0]
>         bx      lr
>         .L15:
>         push    {r4, r5}
>         ldrd    r2, r3, [r1, #8]
>         ldrd    r5, r4, [r1]
>         rsbs    r2, r2, #0
>         rsbs    r5, r5, #0
>         rsbs    r4, r4, #0
>         rsbs    r3, r3, #0
>         strd    r5, r4, [r0]
>         pop     {r4, r5}
>         strd    r2, r3, [r0, #8]
>         bx      lr
>
> 2020-12-15  Christophe Lyon  <christophe.lyon@linaro.org>
>
>         PR target/97875
>         gcc/
>         * config/arm/arm.h (ARM_HAVE_NEON_V8QI_LDST): New macro.
>         (ARM_HAVE_NEON_V16QI_LDST, ARM_HAVE_NEON_V4HI_LDST): Likewise.
>         (ARM_HAVE_NEON_V8HI_LDST, ARM_HAVE_NEON_V2SI_LDST): Likewise.
>         (ARM_HAVE_NEON_V4SI_LDST, ARM_HAVE_NEON_V4HF_LDST): Likewise.
>         (ARM_HAVE_NEON_V8HF_LDST, ARM_HAVE_NEON_V4BF_LDST): Likewise.
>         (ARM_HAVE_NEON_V8BF_LDST, ARM_HAVE_NEON_V2SF_LDST): Likewise.
>         (ARM_HAVE_NEON_V4SF_LDST, ARM_HAVE_NEON_DI_LDST): Likewise.
>         (ARM_HAVE_NEON_V2DI_LDST): Likewise.
>         (ARM_HAVE_V8QI_LDST, ARM_HAVE_V16QI_LDST): Likewise.
>         (ARM_HAVE_V4HI_LDST, ARM_HAVE_V8HI_LDST): Likewise.
>         (ARM_HAVE_V2SI_LDST, ARM_HAVE_V4SI_LDST, ARM_HAVE_V4HF_LDST): Likewise.
>         (ARM_HAVE_V8HF_LDST, ARM_HAVE_V4BF_LDST, ARM_HAVE_V8BF_LDST): Likewise.
>         (ARM_HAVE_V2SF_LDST, ARM_HAVE_V4SF_LDST, ARM_HAVE_DI_LDST): Likewise.
>         (ARM_HAVE_V2DI_LDST): Likewise.
>         * config/arm/mve.md (*movmisalign<mode>_mve_store): New pattern.
>         (*movmisalign<mode>_mve_load): New pattern.
>         * config/arm/neon.md (movmisalign<mode>): Move to ...
>         * config/arm/vec-common.md: ... here.
>
>         PR target/97875
>         gcc/testsuite/
>         * gcc.target/arm/simd/mve-vneg.c: Update test.
> ---
>  gcc/config/arm/arm.h                         | 40 ++++++++++++++++++++++++++++
>  gcc/config/arm/mve.md                        | 25 +++++++++++++++++
>  gcc/config/arm/neon.md                       | 25 -----------------
>  gcc/config/arm/vec-common.md                 | 24 +++++++++++++++++
>  gcc/testsuite/gcc.target/arm/simd/mve-vneg.c |  3 +++
>  5 files changed, 92 insertions(+), 25 deletions(-)
>
> diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
> index 4a63d33..d44e0c6 100644
> --- a/gcc/config/arm/arm.h
> +++ b/gcc/config/arm/arm.h
> @@ -1151,6 +1151,46 @@ extern const int arm_arch_cde_coproc_bits[];
>  #define ARM_HAVE_V8HF_ARITH (ARM_HAVE_NEON_V8HF_ARITH || TARGET_HAVE_MVE_FLOAT)
>  #define ARM_HAVE_V4SF_ARITH (ARM_HAVE_NEON_V4SF_ARITH || TARGET_HAVE_MVE_FLOAT)
>
> +/* The conditions under which vector modes are supported by load/store
> +   instructions using Neon.  */
> +
> +#define ARM_HAVE_NEON_V8QI_LDST TARGET_NEON
> +#define ARM_HAVE_NEON_V16QI_LDST TARGET_NEON
> +#define ARM_HAVE_NEON_V4HI_LDST TARGET_NEON
> +#define ARM_HAVE_NEON_V8HI_LDST TARGET_NEON
> +#define ARM_HAVE_NEON_V2SI_LDST TARGET_NEON
> +#define ARM_HAVE_NEON_V4SI_LDST TARGET_NEON
> +#define ARM_HAVE_NEON_V4HF_LDST TARGET_NEON_FP16INST
> +#define ARM_HAVE_NEON_V8HF_LDST TARGET_NEON_FP16INST
> +#define ARM_HAVE_NEON_V4BF_LDST TARGET_BF16_SIMD
> +#define ARM_HAVE_NEON_V8BF_LDST TARGET_BF16_SIMD
> +#define ARM_HAVE_NEON_V2SF_LDST TARGET_NEON
> +#define ARM_HAVE_NEON_V4SF_LDST TARGET_NEON
> +#define ARM_HAVE_NEON_DI_LDST TARGET_NEON
> +#define ARM_HAVE_NEON_V2DI_LDST TARGET_NEON
> +
> +/* The conditions under which vector modes are supported by load/store
> +   instructions by any vector extension.  */
> +
> +#define ARM_HAVE_V8QI_LDST (ARM_HAVE_NEON_V8QI_LDST || TARGET_REALLY_IWMMXT)
> +#define ARM_HAVE_V4HI_LDST (ARM_HAVE_NEON_V4HI_LDST || TARGET_REALLY_IWMMXT)
> +#define ARM_HAVE_V2SI_LDST (ARM_HAVE_NEON_V2SI_LDST || TARGET_REALLY_IWMMXT)
> +
> +#define ARM_HAVE_V16QI_LDST (ARM_HAVE_NEON_V16QI_LDST || TARGET_HAVE_MVE)
> +#define ARM_HAVE_V8HI_LDST (ARM_HAVE_NEON_V8HI_LDST || TARGET_HAVE_MVE)
> +#define ARM_HAVE_V4SI_LDST (ARM_HAVE_NEON_V4SI_LDST || TARGET_HAVE_MVE)
> +#define ARM_HAVE_DI_LDST ARM_HAVE_NEON_DI_LDST
> +#define ARM_HAVE_V2DI_LDST ARM_HAVE_NEON_V2DI_LDST
> +
> +#define ARM_HAVE_V4HF_LDST ARM_HAVE_NEON_V4HF_LDST
> +#define ARM_HAVE_V2SF_LDST ARM_HAVE_NEON_V2SF_LDST
> +
> +#define ARM_HAVE_V4BF_LDST ARM_HAVE_NEON_V4BF_LDST
> +#define ARM_HAVE_V8BF_LDST ARM_HAVE_NEON_V8BF_LDST
> +
> +#define ARM_HAVE_V8HF_LDST (ARM_HAVE_NEON_V8HF_LDST || TARGET_HAVE_MVE_FLOAT)
> +#define ARM_HAVE_V4SF_LDST (ARM_HAVE_NEON_V4SF_LDST || TARGET_HAVE_MVE_FLOAT)
> +
>  /* The register numbers in sequence, for passing to arm_gen_load_multiple.  */
>  extern int arm_regs_in_sequence[];
>
> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> index b4c5a1e2..673a83c 100644
> --- a/gcc/config/arm/mve.md
> +++ b/gcc/config/arm/mve.md
> @@ -10937,3 +10937,28 @@ (define_insn "arm_vcx3q<a>_p_v16qi"
>    [(set_attr "type" "coproc")
>     (set_attr "length" "8")]
>  )
> +
> +(define_insn "*movmisalign<mode>_mve_store"
> +  [(set (match_operand:MVE_VLD_ST 0 "neon_permissive_struct_operand"        "=Um")
> +       (unspec:MVE_VLD_ST [(match_operand:MVE_VLD_ST 1 "s_register_operand" " w")]
> +        UNSPEC_MISALIGNED_ACCESS))]
> +  "(TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode))
> +   || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE (<MODE>mode))
> +   && !BYTES_BIG_ENDIAN && unaligned_access"
> +  "vstr<V_sz_elem1>.<V_sz_elem>\t%q1, %E0"
> +  [(set_attr "type" "mve_store")
> +   (set_attr "length" "4")]
> +)
> +
> +
> +(define_insn "*movmisalign<mode>_mve_load"
> +  [(set (match_operand:MVE_VLD_ST 0 "s_register_operand"                                "=w")
> +       (unspec:MVE_VLD_ST [(match_operand:MVE_VLD_ST 1 "neon_permissive_struct_operand" " Um")]
> +        UNSPEC_MISALIGNED_ACCESS))]
> +  "(TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode))
> +   || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE (<MODE>mode))
> +   && !BYTES_BIG_ENDIAN && unaligned_access"
> +  "vldr<V_sz_elem1>.<V_sz_elem>\t%q0, %E1"
> +  [(set_attr "type" "mve_load")
> +   (set_attr "length" "4")]
> +)
> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index d2e92ba..50220be 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -280,31 +280,6 @@ (define_split
>    neon_disambiguate_copy (operands, dest, src, 4);
>  })
>
> -(define_expand "movmisalign<mode>"
> -  [(set (match_operand:VDQX 0 "neon_perm_struct_or_reg_operand")
> -       (unspec:VDQX [(match_operand:VDQX 1 "neon_perm_struct_or_reg_operand")]
> -                    UNSPEC_MISALIGNED_ACCESS))]
> -  "TARGET_NEON && !BYTES_BIG_ENDIAN && unaligned_access"
> -{
> -  rtx adjust_mem;
> -  /* This pattern is not permitted to fail during expansion: if both arguments
> -     are non-registers (e.g. memory := constant, which can be created by the
> -     auto-vectorizer), force operand 1 into a register.  */
> -  if (!s_register_operand (operands[0], <MODE>mode)
> -      && !s_register_operand (operands[1], <MODE>mode))
> -    operands[1] = force_reg (<MODE>mode, operands[1]);
> -
> -  if (s_register_operand (operands[0], <MODE>mode))
> -    adjust_mem = operands[1];
> -  else
> -    adjust_mem = operands[0];
> -
> -  /* Legitimize address.  */
> -  if (!neon_vector_mem_operand (adjust_mem, 2, true))
> -    XEXP (adjust_mem, 0) = force_reg (Pmode, XEXP (adjust_mem, 0));
> -
> -})
> -
>  (define_insn "*movmisalign<mode>_neon_store"
>    [(set (match_operand:VDX 0 "neon_permissive_struct_operand"  "=Um")
>         (unspec:VDX [(match_operand:VDX 1 "s_register_operand" " w")]
> diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> index 2d0932b..f6a79e2 100644
> --- a/gcc/config/arm/vec-common.md
> +++ b/gcc/config/arm/vec-common.md
> @@ -205,3 +205,27 @@ (define_expand "neg<mode>2"
>         (neg:VDQWH (match_operand:VDQWH 1 "s_register_operand" "")))]
>    "ARM_HAVE_<MODE>_ARITH"
>  )
> +
> +(define_expand "movmisalign<mode>"
> +  [(set (match_operand:VDQX 0 "neon_perm_struct_or_reg_operand")
> +       (unspec:VDQX [(match_operand:VDQX 1 "neon_perm_struct_or_reg_operand")]
> +        UNSPEC_MISALIGNED_ACCESS))]
> +  "ARM_HAVE_<MODE>_LDST && !BYTES_BIG_ENDIAN && unaligned_access"
> +{
> +  rtx adjust_mem;
> +  /* This pattern is not permitted to fail during expansion: if both arguments
> +     are non-registers (e.g. memory := constant, which can be created by the
> +     auto-vectorizer), force operand 1 into a register.  */
> +  if (!s_register_operand (operands[0], <MODE>mode)
> +      && !s_register_operand (operands[1], <MODE>mode))
> +    operands[1] = force_reg (<MODE>mode, operands[1]);
> +
> +  if (s_register_operand (operands[0], <MODE>mode))
> +    adjust_mem = operands[1];
> +  else
> +    adjust_mem = operands[0];
> +
> +  /* Legitimize address.  */
> +  if (!neon_vector_mem_operand (adjust_mem, 2, true))
> +    XEXP (adjust_mem, 0) = force_reg (Pmode, XEXP (adjust_mem, 0));
> +})
> diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vneg.c b/gcc/testsuite/gcc.target/arm/simd/mve-vneg.c
> index afd0d60..7945a06 100644
> --- a/gcc/testsuite/gcc.target/arm/simd/mve-vneg.c
> +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vneg.c
> @@ -47,3 +47,6 @@ FUNC(f, float, 16, 8, -, vneg)
>     functions above.  */
>  /* { dg-final { scan-assembler-times {vneg.s[0-9]+  q[0-9]+, q[0-9]+} 6 } } */
>  /* { dg-final { scan-assembler-times {vneg.f[0-9]+  q[0-9]+, q[0-9]+} 2 } } */
> +/* { dg-final { scan-assembler-times {vldr[bhw].[0-9]+\tq[0-9]+} 8 } } */
> +/* { dg-final { scan-assembler-times {vstr[bhw].[0-9]+\tq[0-9]+} 8 } } */
> +/* { dg-final { scan-assembler-not {orr\tr[0-9]+, r[0-9]+, r[0-9]+} } } */
> --
> 2.7.4
>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/3] arm: Auto-vectorization for MVE: vshl
  2020-12-17 17:48 ` [PATCH 2/3] arm: Auto-vectorization for MVE: vshl Christophe Lyon
@ 2020-12-30 10:34   ` Christophe Lyon
  2021-01-07 12:20     ` Christophe Lyon
  2021-01-15  9:42   ` Kyrylo Tkachov
  1 sibling, 1 reply; 16+ messages in thread
From: Christophe Lyon @ 2020-12-30 10:34 UTC (permalink / raw)
  To: gcc Patches

ping?

On Thu, 17 Dec 2020 at 18:48, Christophe Lyon
<christophe.lyon@linaro.org> wrote:
>
> This patch enables MVE vshlq instructions for auto-vectorization.
>
> The existing mve_vshlq_n_<supf><mode> is kept, as it takes a single
> immediate as second operand, and is used by arm_mve.h.
>
> We move the vashl<mode>3 insn from neon.md to an expander in
> vec-common.md, and the mve_vshlq_<supf><mode> insn from mve.md to
> vec-common.md, adding the second alternative fron neon.md.
>
> mve_vshlq_<supf><mode> will be used by a later patch enabling
> vectorization for vshr, as a unified version of
> ashl<mode>3_[signed|unsigned] from neon.md. Keeping the use of unspec
> VSHLQ makes it possible to generate both 's' and 'u' variants.
>
> It is not clear whether the neon_shift_[reg|imm]<q> attribute is still
> suitable, since this insn is also used for MVE.
>
> I kept the mve_vshlq_<supf><mode> naming instead of renaming it to
> ashl3_<supf>_<mode> as discussed because the reference in
> arm_mve_builtins.def automatically inserts the "mve_" prefix and I
> didn't want to make a special case for this.
>
> I haven't yet found why the v16qi and v8hi tests are not vectorized.
> With dest[i] = a[i] << b[i] and:
>   {
>     int i;
>     unsigned int i.24_1;
>     unsigned int _2;
>     int16_t * _3;
>     short int _4;
>     int _5;
>     int16_t * _6;
>     short int _7;
>     int _8;
>     int _9;
>     int16_t * _10;
>     short int _11;
>     unsigned int ivtmp_42;
>     unsigned int ivtmp_43;
>
>     <bb 2> [local count: 119292720]:
>
>     <bb 3> [local count: 954449105]:
>     i.24_1 = (unsigned int) i_23;
>     _2 = i.24_1 * 2;
>     _3 = a_15(D) + _2;
>     _4 = *_3;
>     _5 = (int) _4;
>     _6 = b_16(D) + _2;
>     _7 = *_6;
>     _8 = (int) _7;
>     _9 = _5 << _8;
>     _10 = dest_17(D) + _2;
>     _11 = (short int) _9;
>     *_10 = _11;
>     i_19 = i_23 + 1;
>     ivtmp_42 = ivtmp_43 - 1;
>     if (ivtmp_42 != 0)
>       goto <bb 5>; [87.50%]
>     else
>       goto <bb 4>; [12.50%]
>
>     <bb 5> [local count: 835156386]:
>     goto <bb 3>; [100.00%]
>
>     <bb 4> [local count: 119292720]:
>     return;
>
>   }
> the vectorizer says:
> mve-vshl.c:37:96: note:   ==> examining statement: _5 = (int) _4;
> mve-vshl.c:37:96: note:   vect_is_simple_use: operand *_3, type of def: internal
> mve-vshl.c:37:96: note:   vect_is_simple_use: vectype vector(8) short int
> mve-vshl.c:37:96: missed:   conversion not supported by target.
> mve-vshl.c:37:96: note:   vect_is_simple_use: operand *_3, type of def: internal
> mve-vshl.c:37:96: note:   vect_is_simple_use: vectype vector(8) short int
> mve-vshl.c:37:96: note:   vect_is_simple_use: operand *_3, type of def: internal
> mve-vshl.c:37:96: note:   vect_is_simple_use: vectype vector(8) short int
> mve-vshl.c:37:117: missed:   not vectorized: relevant stmt not supported: _5 = (int) _4;
> mve-vshl.c:37:96: missed:  bad operation or unsupported loop bound.
> mve-vshl.c:37:96: note:  ***** Analysis failed with vector mode V8HI
>
> 2020-12-03  Christophe Lyon  <christophe.lyon@linaro.org>
>
>         gcc/
>         * config/arm/mve.md (mve_vshlq_<supf><mode>): Move to
>         vec-common.md.
>         * config/arm/neon.md (vashl<mode>3): Delete.
>         * config/arm/vec-common.md (mve_vshlq_<supf><mode>): New.
>         (vashl<mode>3): New expander.
>
>         gcc/testsuite/
>         * gcc.target/arm/simd/mve-vshl.c: Add tests for vshl.
> ---
>  gcc/config/arm/mve.md                        | 13 +-----
>  gcc/config/arm/neon.md                       | 19 ---------
>  gcc/config/arm/vec-common.md                 | 30 ++++++++++++++
>  gcc/testsuite/gcc.target/arm/simd/mve-vshl.c | 62 ++++++++++++++++++++++++++++
>  4 files changed, 93 insertions(+), 31 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
>
> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> index 673a83c..8bdb451 100644
> --- a/gcc/config/arm/mve.md
> +++ b/gcc/config/arm/mve.md
> @@ -822,18 +822,7 @@ (define_insn "mve_vcmpneq_<supf><mode>"
>
>  ;;
>  ;; [vshlq_s, vshlq_u])
> -;;
> -(define_insn "mve_vshlq_<supf><mode>"
> -  [
> -   (set (match_operand:MVE_2 0 "s_register_operand" "=w")
> -       (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")
> -                      (match_operand:MVE_2 2 "s_register_operand" "w")]
> -        VSHLQ))
> -  ]
> -  "TARGET_HAVE_MVE"
> -  "vshl.<supf>%#<V_sz_elem>\t%q0, %q1, %q2"
> -  [(set_attr "type" "mve_move")
> -])
> +;; See vec-common.md
>
>  ;;
>  ;; [vabdq_s, vabdq_u])
> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index 50220be..ac9bf74 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -845,25 +845,6 @@ (define_insn "*smax<mode>3_neon"
>  ; generic vectorizer code.  It ends up creating a V2DI constructor with
>  ; SImode elements.
>
> -(define_insn "vashl<mode>3"
> -  [(set (match_operand:VDQIW 0 "s_register_operand" "=w,w")
> -       (ashift:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w,w")
> -                     (match_operand:VDQIW 2 "imm_lshift_or_reg_neon" "w,Dm")))]
> -  "TARGET_NEON"
> -  {
> -    switch (which_alternative)
> -      {
> -        case 0: return "vshl.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2";
> -        case 1: return neon_output_shift_immediate ("vshl", 'i', &operands[2],
> -                                                   <MODE>mode,
> -                                                   VALID_NEON_QREG_MODE (<MODE>mode),
> -                                                   true);
> -        default: gcc_unreachable ();
> -      }
> -  }
> -  [(set_attr "type" "neon_shift_reg<q>, neon_shift_imm<q>")]
> -)
> -
>  (define_insn "vashr<mode>3_imm"
>    [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
>         (ashiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w")
> diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> index f6a79e2..3a282f0 100644
> --- a/gcc/config/arm/vec-common.md
> +++ b/gcc/config/arm/vec-common.md
> @@ -229,3 +229,33 @@ (define_expand "movmisalign<mode>"
>    if (!neon_vector_mem_operand (adjust_mem, 2, true))
>      XEXP (adjust_mem, 0) = force_reg (Pmode, XEXP (adjust_mem, 0));
>  })
> +
> +(define_insn "mve_vshlq_<supf><mode>"
> +  [(set (match_operand:VDQIW 0 "s_register_operand" "=w,w")
> +       (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w,w")
> +                      (match_operand:VDQIW 2 "imm_lshift_or_reg_neon" "w,Dm")]
> +        VSHLQ))]
> +  "ARM_HAVE_<MODE>_ARITH"
> +{
> +  switch (which_alternative)
> +    {
> +      case 0: return "vshl.<supf>%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2";
> +      case 1: return neon_output_shift_immediate ("vshl", 'i', &operands[2],
> +                                                 <MODE>mode,
> +                                                 VALID_NEON_QREG_MODE (<MODE>mode),
> +                                                 true);
> +      default: gcc_unreachable ();
> +    }
> +}
> +  [(set_attr "type" "neon_shift_reg<q>, neon_shift_imm<q>")]
> +)
> +
> +(define_expand "vashl<mode>3"
> +  [(set (match_operand:VDQIW 0 "s_register_operand" "")
> +       (ashift:VDQIW (match_operand:VDQIW 1 "s_register_operand" "")
> +                     (match_operand:VDQIW 2 "imm_lshift_or_reg_neon" "")))]
> +  "ARM_HAVE_<MODE>_ARITH"
> +{
> +  emit_insn (gen_mve_vshlq_u<mode> (operands[0], operands[1], operands[2]));
> +  DONE;
> +})
> \ No newline at end of file
> diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c b/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
> new file mode 100644
> index 0000000..7a06449
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
> @@ -0,0 +1,62 @@
> +/* { dg-do assemble } */
> +/* { dg-require-effective-target arm_v8_1m_mve_ok } */
> +/* { dg-add-options arm_v8_1m_mve } */
> +/* { dg-additional-options "-O3" } */
> +
> +#include <stdint.h>
> +
> +#define FUNC(SIGN, TYPE, BITS, NB, OP, NAME)                           \
> +  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t * __restrict__ dest, TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \
> +    int i;                                                             \
> +    for (i=0; i<NB; i++) {                                             \
> +      dest[i] = a[i] OP b[i];                                          \
> +    }                                                                  \
> +}
> +
> +#define FUNC_IMM(SIGN, TYPE, BITS, NB, OP, NAME)                               \
> +  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t * __restrict__ dest, TYPE##BITS##_t *a) { \
> +    int i;                                                             \
> +    for (i=0; i<NB; i++) {                                             \
> +      dest[i] = a[i] OP 5;                                             \
> +    }                                                                  \
> +}
> +
> +/* 64-bit vectors.  */
> +FUNC(s, int, 32, 2, <<, vshl)
> +FUNC(u, uint, 32, 2, <<, vshl)
> +FUNC(s, int, 16, 4, <<, vshl)
> +FUNC(u, uint, 16, 4, <<, vshl)
> +FUNC(s, int, 8, 8, <<, vshl)
> +FUNC(u, uint, 8, 8, <<, vshl)
> +
> +/* 128-bit vectors.  */
> +FUNC(s, int, 32, 4, <<, vshl)
> +FUNC(u, uint, 32, 4, <<, vshl)
> +FUNC(s, int, 16, 8, <<, vshl)  /* FIXME: not vectorized */
> +FUNC(u, uint, 16, 8, <<, vshl) /* FIXME: not vectorized */
> +FUNC(s, int, 8, 16, <<, vshl)  /* FIXME: not vectorized */
> +FUNC(u, uint, 8, 16, <<, vshl) /* FIXME: not vectorized */
> +
> +/* 64-bit vectors.  */
> +FUNC_IMM(s, int, 32, 2, <<, vshlimm)
> +FUNC_IMM(u, uint, 32, 2, <<, vshlimm)
> +FUNC_IMM(s, int, 16, 4, <<, vshlimm)
> +FUNC_IMM(u, uint, 16, 4, <<, vshlimm)
> +FUNC_IMM(s, int, 8, 8, <<, vshlimm)
> +FUNC_IMM(u, uint, 8, 8, <<, vshlimm)
> +
> +/* 128-bit vectors.  */
> +FUNC_IMM(s, int, 32, 4, <<, vshlimm)
> +FUNC_IMM(u, uint, 32, 4, <<, vshlimm)
> +FUNC_IMM(s, int, 16, 8, <<, vshlimm)
> +FUNC_IMM(u, uint, 16, 8, <<, vshlimm)
> +FUNC_IMM(s, int, 8, 16, <<, vshlimm)
> +FUNC_IMM(u, uint, 8, 16, <<, vshlimm)
> +
> +/* MVE has only 128-bit vectors, so we can vectorize only half of the
> +   functions above.  */
> +/* We only emit vshl.u, which is equivalent to vshl.s anyway.  */
> +/* { dg-final { scan-assembler-times {vshl.u[0-9]+\tq[0-9]+, q[0-9]+} 2 } } */
> +
> +/* We emit vshl.i when the shift amount is an immediate.  */
> +/* { dg-final { scan-assembler-times {vshl.i[0-9]+\tq[0-9]+, q[0-9]+} 6 } } */
> --
> 2.7.4
>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/3] arm: Auto-vectorization for MVE: vshr
  2020-12-17 17:48 ` [PATCH 3/3] arm: Auto-vectorization for MVE: vshr Christophe Lyon
@ 2020-12-30 10:34   ` Christophe Lyon
  2021-01-07 12:20     ` Christophe Lyon
  2021-01-15  9:44   ` Kyrylo Tkachov
  1 sibling, 1 reply; 16+ messages in thread
From: Christophe Lyon @ 2020-12-30 10:34 UTC (permalink / raw)
  To: gcc Patches

ping?

On Thu, 17 Dec 2020 at 18:48, Christophe Lyon
<christophe.lyon@linaro.org> wrote:
>
> This patch enables MVE vshr instructions for auto-vectorization.  New
> MVE patterns are introduced that take a vector of constants as second
> operand, all constants being equal.
>
> The existing mve_vshrq_n_<supf><mode> is kept, as it takes a single
> immediate as second operand, and is used by arm_mve.h.
>
> The vashr<mode>3 and vlshr<mode>3 expanders are moved from neon.md to
> vec-common.md, updated to rely on the normal expansion scheme to
> generate shifts by immediate.
>
> 2020-12-03  Christophe Lyon  <christophe.lyon@linaro.org>
>
>         gcc/
>         * config/arm/mve.md (mve_vshrq_n_s<mode>_imm): New entry.
>         (mve_vshrq_n_u<mode>_imm): Likewise.
>         * config/arm/neon.md (vashr<mode>3, vlshr<mode>3): Move to ...
>         * config/arm/vec-common.md: ... here.
>
>         gcc/testsuite/
>         * gcc.target/arm/simd/mve-vshr.c: Add tests for vshr.
> ---
>  gcc/config/arm/mve.md                        | 34 ++++++++++++++++
>  gcc/config/arm/neon.md                       | 34 ----------------
>  gcc/config/arm/vec-common.md                 | 38 +++++++++++++++++-
>  gcc/testsuite/gcc.target/arm/simd/mve-vshr.c | 59 ++++++++++++++++++++++++++++
>  4 files changed, 130 insertions(+), 35 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vshr.c
>
> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> index 8bdb451..eea8b20 100644
> --- a/gcc/config/arm/mve.md
> +++ b/gcc/config/arm/mve.md
> @@ -763,6 +763,7 @@ (define_insn "mve_vcreateq_<supf><mode>"
>  ;;
>  ;; [vshrq_n_s, vshrq_n_u])
>  ;;
> +;; Version that takes an immediate as operand 2.
>  (define_insn "mve_vshrq_n_<supf><mode>"
>    [
>     (set (match_operand:MVE_2 0 "s_register_operand" "=w")
> @@ -775,6 +776,39 @@ (define_insn "mve_vshrq_n_<supf><mode>"
>    [(set_attr "type" "mve_move")
>  ])
>
> +;; Versions that take constant vectors as operand 2 (with all elements
> +;; equal).
> +(define_insn "mve_vshrq_n_s<mode>_imm"
> +  [
> +   (set (match_operand:MVE_2 0 "s_register_operand" "=w")
> +       (ashiftrt:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w")
> +                       (match_operand:MVE_2 2 "imm_for_neon_rshift_operand" "i")))
> +  ]
> +  "TARGET_HAVE_MVE"
> +  {
> +    return neon_output_shift_immediate ("vshr", 's', &operands[2],
> +                                       <MODE>mode,
> +                                       VALID_NEON_QREG_MODE (<MODE>mode),
> +                                       true);
> +  }
> +  [(set_attr "type" "mve_move")
> +])
> +(define_insn "mve_vshrq_n_u<mode>_imm"
> +  [
> +   (set (match_operand:MVE_2 0 "s_register_operand" "=w")
> +       (lshiftrt:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w")
> +                       (match_operand:MVE_2 2 "imm_for_neon_rshift_operand" "i")))
> +  ]
> +  "TARGET_HAVE_MVE"
> +  {
> +    return neon_output_shift_immediate ("vshr", 'u', &operands[2],
> +                                       <MODE>mode,
> +                                       VALID_NEON_QREG_MODE (<MODE>mode),
> +                                       true);
> +  }
> +  [(set_attr "type" "mve_move")
> +])
> +
>  ;;
>  ;; [vcvtq_n_from_f_s, vcvtq_n_from_f_u])
>  ;;
> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index ac9bf74..a0e8d7a 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -899,40 +899,6 @@ (define_insn "ashl<mode>3_unsigned"
>    [(set_attr "type" "neon_shift_reg<q>")]
>  )
>
> -(define_expand "vashr<mode>3"
> -  [(set (match_operand:VDQIW 0 "s_register_operand")
> -       (ashiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand")
> -                       (match_operand:VDQIW 2 "imm_rshift_or_reg_neon")))]
> -  "TARGET_NEON"
> -{
> -  if (s_register_operand (operands[2], <MODE>mode))
> -    {
> -      rtx neg = gen_reg_rtx (<MODE>mode);
> -      emit_insn (gen_neon_neg<mode>2 (neg, operands[2]));
> -      emit_insn (gen_ashl<mode>3_signed (operands[0], operands[1], neg));
> -    }
> -  else
> -    emit_insn (gen_vashr<mode>3_imm (operands[0], operands[1], operands[2]));
> -  DONE;
> -})
> -
> -(define_expand "vlshr<mode>3"
> -  [(set (match_operand:VDQIW 0 "s_register_operand")
> -       (lshiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand")
> -                       (match_operand:VDQIW 2 "imm_rshift_or_reg_neon")))]
> -  "TARGET_NEON"
> -{
> -  if (s_register_operand (operands[2], <MODE>mode))
> -    {
> -      rtx neg = gen_reg_rtx (<MODE>mode);
> -      emit_insn (gen_neon_neg<mode>2 (neg, operands[2]));
> -      emit_insn (gen_ashl<mode>3_unsigned (operands[0], operands[1], neg));
> -    }
> -  else
> -    emit_insn (gen_vlshr<mode>3_imm (operands[0], operands[1], operands[2]));
> -  DONE;
> -})
> -
>  ;; 64-bit shifts
>
>  ;; This pattern loads a 32-bit shift count into a 64-bit NEON register,
> diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> index 3a282f0..e126557 100644
> --- a/gcc/config/arm/vec-common.md
> +++ b/gcc/config/arm/vec-common.md
> @@ -258,4 +258,40 @@ (define_expand "vashl<mode>3"
>  {
>    emit_insn (gen_mve_vshlq_u<mode> (operands[0], operands[1], operands[2]));
>    DONE;
> -})
> \ No newline at end of file
> +})
> +
> +;; When operand 2 is an immediate, use the normal expansion to match
> +;; gen_vashr<mode>3_imm for Neon and gen_mve_vshrq_n_s<mode>_imm for
> +;; MVE.
> +(define_expand "vashr<mode>3"
> +  [(set (match_operand:VDQIW 0 "s_register_operand")
> +       (ashiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand")
> +                       (match_operand:VDQIW 2 "imm_rshift_or_reg_neon")))]
> +  "ARM_HAVE_<MODE>_ARITH"
> +{
> +  if (s_register_operand (operands[2], <MODE>mode))
> +    {
> +      rtx neg = gen_reg_rtx (<MODE>mode);
> +      emit_insn (gen_neg<mode>2 (neg, operands[2]));
> +      emit_insn (gen_mve_vshlq_s<mode> (operands[0], operands[1], neg));
> +      DONE;
> +    }
> +})
> +
> +;; When operand 2 is an immediate, use the normal expansion to match
> +;; gen_vashr<mode>3_imm for Neon and gen_mve_vshrq_n_u<mode>_imm for
> +;; MVE.
> +(define_expand "vlshr<mode>3"
> +  [(set (match_operand:VDQIW 0 "s_register_operand")
> +       (lshiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand")
> +                       (match_operand:VDQIW 2 "imm_rshift_or_reg_neon")))]
> +  "ARM_HAVE_<MODE>_ARITH"
> +{
> +  if (s_register_operand (operands[2], <MODE>mode))
> +    {
> +      rtx neg = gen_reg_rtx (<MODE>mode);
> +      emit_insn (gen_neg<mode>2 (neg, operands[2]));
> +      emit_insn (gen_mve_vshlq_u<mode> (operands[0], operands[1], neg));
> +      DONE;
> +    }
> +})
> diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vshr.c b/gcc/testsuite/gcc.target/arm/simd/mve-vshr.c
> new file mode 100644
> index 0000000..d4e658c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vshr.c
> @@ -0,0 +1,59 @@
> +/* { dg-do assemble } */
> +/* { dg-require-effective-target arm_v8_1m_mve_ok } */
> +/* { dg-add-options arm_v8_1m_mve } */
> +/* { dg-additional-options "-O3" } */
> +
> +#include <stdint.h>
> +
> +#define FUNC(SIGN, TYPE, BITS, NB, OP, NAME)                           \
> +  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t * __restrict__ dest, TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \
> +    int i;                                                             \
> +    for (i=0; i<NB; i++) {                                             \
> +      dest[i] = a[i] OP b[i];                                          \
> +    }                                                                  \
> +}
> +
> +#define FUNC_IMM(SIGN, TYPE, BITS, NB, OP, NAME)                               \
> +  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t * __restrict__ dest, TYPE##BITS##_t *a) { \
> +    int i;                                                             \
> +    for (i=0; i<NB; i++) {                                             \
> +      dest[i] = a[i] OP 5;                                             \
> +    }                                                                  \
> +}
> +
> +/* 64-bit vectors.  */
> +FUNC(s, int, 32, 2, >>, vshr)
> +FUNC(u, uint, 32, 2, >>, vshr)
> +FUNC(s, int, 16, 4, >>, vshr)
> +FUNC(u, uint, 16, 4, >>, vshr)
> +FUNC(s, int, 8, 8, >>, vshr)
> +FUNC(u, uint, 8, 8, >>, vshr)
> +
> +/* 128-bit vectors.  */
> +FUNC(s, int, 32, 4, >>, vshr)
> +FUNC(u, uint, 32, 4, >>, vshr)
> +FUNC(s, int, 16, 8, >>, vshr)
> +FUNC(u, uint, 16, 8, >>, vshr)
> +FUNC(s, int, 8, 16, >>, vshr)
> +FUNC(u, uint, 8, 16, >>, vshr)
> +
> +/* 64-bit vectors.  */
> +FUNC_IMM(s, int, 32, 2, >>, vshrimm)
> +FUNC_IMM(u, uint, 32, 2, >>, vshrimm)
> +FUNC_IMM(s, int, 16, 4, >>, vshrimm)
> +FUNC_IMM(u, uint, 16, 4, >>, vshrimm)
> +FUNC_IMM(s, int, 8, 8, >>, vshrimm)
> +FUNC_IMM(u, uint, 8, 8, >>, vshrimm)
> +
> +/* 128-bit vectors.  */
> +FUNC_IMM(s, int, 32, 4, >>, vshrimm)
> +FUNC_IMM(u, uint, 32, 4, >>, vshrimm)
> +FUNC_IMM(s, int, 16, 8, >>, vshrimm)
> +FUNC_IMM(u, uint, 16, 8, >>, vshrimm)
> +FUNC_IMM(s, int, 8, 16, >>, vshrimm)
> +FUNC_IMM(u, uint, 8, 16, >>, vshrimm)
> +
> +/* MVE has only 128-bit vectors, so we can vectorize only half of the
> +   functions above.  */
> +/* { dg-final { scan-assembler-times {vshr.s[0-9]+\tq[0-9]+, q[0-9]+} 3 } } */
> +/* { dg-final { scan-assembler-times {vshr.u[0-9]+\tq[0-9]+, q[0-9]+} 3 } } */
> --
> 2.7.4
>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/3] arm: Add movmisalign patterns for MVE (PR target/97875)
  2020-12-30 10:33 ` [PATCH 1/3] arm: Add movmisalign patterns for MVE (PR target/97875) Christophe Lyon
@ 2021-01-07 12:20   ` Christophe Lyon
  0 siblings, 0 replies; 16+ messages in thread
From: Christophe Lyon @ 2021-01-07 12:20 UTC (permalink / raw)
  To: gcc Patches

ping^2?

On Wed, 30 Dec 2020 at 11:33, Christophe Lyon
<christophe.lyon@linaro.org> wrote:
>
> ping?
>
> On Thu, 17 Dec 2020 at 18:48, Christophe Lyon
> <christophe.lyon@linaro.org> wrote:
> >
> > This patch adds new movmisalign<mode>_mve_load and store patterns for
> > MVE to help vectorization. They are very similar to their Neon
> > counterparts, but use different iterators and instructions.
> >
> > Indeed MVE supports fewer vector modes than Neon, so we use
> > the MVE_VLD_ST iterator where Neon uses VQX.
> >
> > Since the supported modes are different from the ones valid for
> > arithmetic operators, we introduce two new sets of macros:
> >
> > ARM_HAVE_NEON_<MODE>_LDST
> >   true if Neon has vector load/store instructions for <MODE>
> >
> > ARM_HAVE_<MODE>_LDST
> >   true if any vector extension has vector load/store instructions for <MODE>
> >
> > We move the movmisalign<mode> expander from neon.md to vec-common.md, and
> > replace the TARGET_NEON enabler with ARM_HAVE_<MODE>_LDST.
> >
> > The patch also updates the mve-vneg.c test to scan for the better code
> > generation when loading and storing the vectors involved: it checks
> > that no 'orr' instruction is generated to cope with misalignment at
> > runtime.
> > This test was chosen among the other mve tests, but any other should
> > be OK. Using a plain vector copy loop (dest[i] = a[i]) is not a good
> > test because the compiler chooses to use memcpy.
> >
> > For instance we now generate:
> > test_vneg_s32x4:
> >         vldrw.32       q3, [r1]
> >         vneg.s32  q3, q3
> >         vstrw.32       q3, [r0]
> >         bx      lr
> >
> > instead of:
> > test_vneg_s32x4:
> >         orr     r3, r1, r0
> >         lsls    r3, r3, #28
> >         bne     .L15
> >         vldrw.32        q3, [r1]
> >         vneg.s32  q3, q3
> >         vstrw.32        q3, [r0]
> >         bx      lr
> >         .L15:
> >         push    {r4, r5}
> >         ldrd    r2, r3, [r1, #8]
> >         ldrd    r5, r4, [r1]
> >         rsbs    r2, r2, #0
> >         rsbs    r5, r5, #0
> >         rsbs    r4, r4, #0
> >         rsbs    r3, r3, #0
> >         strd    r5, r4, [r0]
> >         pop     {r4, r5}
> >         strd    r2, r3, [r0, #8]
> >         bx      lr
> >
> > 2020-12-15  Christophe Lyon  <christophe.lyon@linaro.org>
> >
> >         PR target/97875
> >         gcc/
> >         * config/arm/arm.h (ARM_HAVE_NEON_V8QI_LDST): New macro.
> >         (ARM_HAVE_NEON_V16QI_LDST, ARM_HAVE_NEON_V4HI_LDST): Likewise.
> >         (ARM_HAVE_NEON_V8HI_LDST, ARM_HAVE_NEON_V2SI_LDST): Likewise.
> >         (ARM_HAVE_NEON_V4SI_LDST, ARM_HAVE_NEON_V4HF_LDST): Likewise.
> >         (ARM_HAVE_NEON_V8HF_LDST, ARM_HAVE_NEON_V4BF_LDST): Likewise.
> >         (ARM_HAVE_NEON_V8BF_LDST, ARM_HAVE_NEON_V2SF_LDST): Likewise.
> >         (ARM_HAVE_NEON_V4SF_LDST, ARM_HAVE_NEON_DI_LDST): Likewise.
> >         (ARM_HAVE_NEON_V2DI_LDST): Likewise.
> >         (ARM_HAVE_V8QI_LDST, ARM_HAVE_V16QI_LDST): Likewise.
> >         (ARM_HAVE_V4HI_LDST, ARM_HAVE_V8HI_LDST): Likewise.
> >         (ARM_HAVE_V2SI_LDST, ARM_HAVE_V4SI_LDST, ARM_HAVE_V4HF_LDST): Likewise.
> >         (ARM_HAVE_V8HF_LDST, ARM_HAVE_V4BF_LDST, ARM_HAVE_V8BF_LDST): Likewise.
> >         (ARM_HAVE_V2SF_LDST, ARM_HAVE_V4SF_LDST, ARM_HAVE_DI_LDST): Likewise.
> >         (ARM_HAVE_V2DI_LDST): Likewise.
> >         * config/arm/mve.md (*movmisalign<mode>_mve_store): New pattern.
> >         (*movmisalign<mode>_mve_load): New pattern.
> >         * config/arm/neon.md (movmisalign<mode>): Move to ...
> >         * config/arm/vec-common.md: ... here.
> >
> >         PR target/97875
> >         gcc/testsuite/
> >         * gcc.target/arm/simd/mve-vneg.c: Update test.
> > ---
> >  gcc/config/arm/arm.h                         | 40 ++++++++++++++++++++++++++++
> >  gcc/config/arm/mve.md                        | 25 +++++++++++++++++
> >  gcc/config/arm/neon.md                       | 25 -----------------
> >  gcc/config/arm/vec-common.md                 | 24 +++++++++++++++++
> >  gcc/testsuite/gcc.target/arm/simd/mve-vneg.c |  3 +++
> >  5 files changed, 92 insertions(+), 25 deletions(-)
> >
> > diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
> > index 4a63d33..d44e0c6 100644
> > --- a/gcc/config/arm/arm.h
> > +++ b/gcc/config/arm/arm.h
> > @@ -1151,6 +1151,46 @@ extern const int arm_arch_cde_coproc_bits[];
> >  #define ARM_HAVE_V8HF_ARITH (ARM_HAVE_NEON_V8HF_ARITH || TARGET_HAVE_MVE_FLOAT)
> >  #define ARM_HAVE_V4SF_ARITH (ARM_HAVE_NEON_V4SF_ARITH || TARGET_HAVE_MVE_FLOAT)
> >
> > +/* The conditions under which vector modes are supported by load/store
> > +   instructions using Neon.  */
> > +
> > +#define ARM_HAVE_NEON_V8QI_LDST TARGET_NEON
> > +#define ARM_HAVE_NEON_V16QI_LDST TARGET_NEON
> > +#define ARM_HAVE_NEON_V4HI_LDST TARGET_NEON
> > +#define ARM_HAVE_NEON_V8HI_LDST TARGET_NEON
> > +#define ARM_HAVE_NEON_V2SI_LDST TARGET_NEON
> > +#define ARM_HAVE_NEON_V4SI_LDST TARGET_NEON
> > +#define ARM_HAVE_NEON_V4HF_LDST TARGET_NEON_FP16INST
> > +#define ARM_HAVE_NEON_V8HF_LDST TARGET_NEON_FP16INST
> > +#define ARM_HAVE_NEON_V4BF_LDST TARGET_BF16_SIMD
> > +#define ARM_HAVE_NEON_V8BF_LDST TARGET_BF16_SIMD
> > +#define ARM_HAVE_NEON_V2SF_LDST TARGET_NEON
> > +#define ARM_HAVE_NEON_V4SF_LDST TARGET_NEON
> > +#define ARM_HAVE_NEON_DI_LDST TARGET_NEON
> > +#define ARM_HAVE_NEON_V2DI_LDST TARGET_NEON
> > +
> > +/* The conditions under which vector modes are supported by load/store
> > +   instructions by any vector extension.  */
> > +
> > +#define ARM_HAVE_V8QI_LDST (ARM_HAVE_NEON_V8QI_LDST || TARGET_REALLY_IWMMXT)
> > +#define ARM_HAVE_V4HI_LDST (ARM_HAVE_NEON_V4HI_LDST || TARGET_REALLY_IWMMXT)
> > +#define ARM_HAVE_V2SI_LDST (ARM_HAVE_NEON_V2SI_LDST || TARGET_REALLY_IWMMXT)
> > +
> > +#define ARM_HAVE_V16QI_LDST (ARM_HAVE_NEON_V16QI_LDST || TARGET_HAVE_MVE)
> > +#define ARM_HAVE_V8HI_LDST (ARM_HAVE_NEON_V8HI_LDST || TARGET_HAVE_MVE)
> > +#define ARM_HAVE_V4SI_LDST (ARM_HAVE_NEON_V4SI_LDST || TARGET_HAVE_MVE)
> > +#define ARM_HAVE_DI_LDST ARM_HAVE_NEON_DI_LDST
> > +#define ARM_HAVE_V2DI_LDST ARM_HAVE_NEON_V2DI_LDST
> > +
> > +#define ARM_HAVE_V4HF_LDST ARM_HAVE_NEON_V4HF_LDST
> > +#define ARM_HAVE_V2SF_LDST ARM_HAVE_NEON_V2SF_LDST
> > +
> > +#define ARM_HAVE_V4BF_LDST ARM_HAVE_NEON_V4BF_LDST
> > +#define ARM_HAVE_V8BF_LDST ARM_HAVE_NEON_V8BF_LDST
> > +
> > +#define ARM_HAVE_V8HF_LDST (ARM_HAVE_NEON_V8HF_LDST || TARGET_HAVE_MVE_FLOAT)
> > +#define ARM_HAVE_V4SF_LDST (ARM_HAVE_NEON_V4SF_LDST || TARGET_HAVE_MVE_FLOAT)
> > +
> >  /* The register numbers in sequence, for passing to arm_gen_load_multiple.  */
> >  extern int arm_regs_in_sequence[];
> >
> > diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> > index b4c5a1e2..673a83c 100644
> > --- a/gcc/config/arm/mve.md
> > +++ b/gcc/config/arm/mve.md
> > @@ -10937,3 +10937,28 @@ (define_insn "arm_vcx3q<a>_p_v16qi"
> >    [(set_attr "type" "coproc")
> >     (set_attr "length" "8")]
> >  )
> > +
> > +(define_insn "*movmisalign<mode>_mve_store"
> > +  [(set (match_operand:MVE_VLD_ST 0 "neon_permissive_struct_operand"        "=Um")
> > +       (unspec:MVE_VLD_ST [(match_operand:MVE_VLD_ST 1 "s_register_operand" " w")]
> > +        UNSPEC_MISALIGNED_ACCESS))]
> > +  "(TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode))
> > +   || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE (<MODE>mode))
> > +   && !BYTES_BIG_ENDIAN && unaligned_access"
> > +  "vstr<V_sz_elem1>.<V_sz_elem>\t%q1, %E0"
> > +  [(set_attr "type" "mve_store")
> > +   (set_attr "length" "4")]
> > +)
> > +
> > +
> > +(define_insn "*movmisalign<mode>_mve_load"
> > +  [(set (match_operand:MVE_VLD_ST 0 "s_register_operand"                                "=w")
> > +       (unspec:MVE_VLD_ST [(match_operand:MVE_VLD_ST 1 "neon_permissive_struct_operand" " Um")]
> > +        UNSPEC_MISALIGNED_ACCESS))]
> > +  "(TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode))
> > +   || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE (<MODE>mode))
> > +   && !BYTES_BIG_ENDIAN && unaligned_access"
> > +  "vldr<V_sz_elem1>.<V_sz_elem>\t%q0, %E1"
> > +  [(set_attr "type" "mve_load")
> > +   (set_attr "length" "4")]
> > +)
> > diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> > index d2e92ba..50220be 100644
> > --- a/gcc/config/arm/neon.md
> > +++ b/gcc/config/arm/neon.md
> > @@ -280,31 +280,6 @@ (define_split
> >    neon_disambiguate_copy (operands, dest, src, 4);
> >  })
> >
> > -(define_expand "movmisalign<mode>"
> > -  [(set (match_operand:VDQX 0 "neon_perm_struct_or_reg_operand")
> > -       (unspec:VDQX [(match_operand:VDQX 1 "neon_perm_struct_or_reg_operand")]
> > -                    UNSPEC_MISALIGNED_ACCESS))]
> > -  "TARGET_NEON && !BYTES_BIG_ENDIAN && unaligned_access"
> > -{
> > -  rtx adjust_mem;
> > -  /* This pattern is not permitted to fail during expansion: if both arguments
> > -     are non-registers (e.g. memory := constant, which can be created by the
> > -     auto-vectorizer), force operand 1 into a register.  */
> > -  if (!s_register_operand (operands[0], <MODE>mode)
> > -      && !s_register_operand (operands[1], <MODE>mode))
> > -    operands[1] = force_reg (<MODE>mode, operands[1]);
> > -
> > -  if (s_register_operand (operands[0], <MODE>mode))
> > -    adjust_mem = operands[1];
> > -  else
> > -    adjust_mem = operands[0];
> > -
> > -  /* Legitimize address.  */
> > -  if (!neon_vector_mem_operand (adjust_mem, 2, true))
> > -    XEXP (adjust_mem, 0) = force_reg (Pmode, XEXP (adjust_mem, 0));
> > -
> > -})
> > -
> >  (define_insn "*movmisalign<mode>_neon_store"
> >    [(set (match_operand:VDX 0 "neon_permissive_struct_operand"  "=Um")
> >         (unspec:VDX [(match_operand:VDX 1 "s_register_operand" " w")]
> > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> > index 2d0932b..f6a79e2 100644
> > --- a/gcc/config/arm/vec-common.md
> > +++ b/gcc/config/arm/vec-common.md
> > @@ -205,3 +205,27 @@ (define_expand "neg<mode>2"
> >         (neg:VDQWH (match_operand:VDQWH 1 "s_register_operand" "")))]
> >    "ARM_HAVE_<MODE>_ARITH"
> >  )
> > +
> > +(define_expand "movmisalign<mode>"
> > +  [(set (match_operand:VDQX 0 "neon_perm_struct_or_reg_operand")
> > +       (unspec:VDQX [(match_operand:VDQX 1 "neon_perm_struct_or_reg_operand")]
> > +        UNSPEC_MISALIGNED_ACCESS))]
> > +  "ARM_HAVE_<MODE>_LDST && !BYTES_BIG_ENDIAN && unaligned_access"
> > +{
> > +  rtx adjust_mem;
> > +  /* This pattern is not permitted to fail during expansion: if both arguments
> > +     are non-registers (e.g. memory := constant, which can be created by the
> > +     auto-vectorizer), force operand 1 into a register.  */
> > +  if (!s_register_operand (operands[0], <MODE>mode)
> > +      && !s_register_operand (operands[1], <MODE>mode))
> > +    operands[1] = force_reg (<MODE>mode, operands[1]);
> > +
> > +  if (s_register_operand (operands[0], <MODE>mode))
> > +    adjust_mem = operands[1];
> > +  else
> > +    adjust_mem = operands[0];
> > +
> > +  /* Legitimize address.  */
> > +  if (!neon_vector_mem_operand (adjust_mem, 2, true))
> > +    XEXP (adjust_mem, 0) = force_reg (Pmode, XEXP (adjust_mem, 0));
> > +})
> > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vneg.c b/gcc/testsuite/gcc.target/arm/simd/mve-vneg.c
> > index afd0d60..7945a06 100644
> > --- a/gcc/testsuite/gcc.target/arm/simd/mve-vneg.c
> > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vneg.c
> > @@ -47,3 +47,6 @@ FUNC(f, float, 16, 8, -, vneg)
> >     functions above.  */
> >  /* { dg-final { scan-assembler-times {vneg.s[0-9]+  q[0-9]+, q[0-9]+} 6 } } */
> >  /* { dg-final { scan-assembler-times {vneg.f[0-9]+  q[0-9]+, q[0-9]+} 2 } } */
> > +/* { dg-final { scan-assembler-times {vldr[bhw].[0-9]+\tq[0-9]+} 8 } } */
> > +/* { dg-final { scan-assembler-times {vstr[bhw].[0-9]+\tq[0-9]+} 8 } } */
> > +/* { dg-final { scan-assembler-not {orr\tr[0-9]+, r[0-9]+, r[0-9]+} } } */
> > --
> > 2.7.4
> >

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/3] arm: Auto-vectorization for MVE: vshl
  2020-12-30 10:34   ` Christophe Lyon
@ 2021-01-07 12:20     ` Christophe Lyon
  2021-01-15  9:29       ` Christophe Lyon
  0 siblings, 1 reply; 16+ messages in thread
From: Christophe Lyon @ 2021-01-07 12:20 UTC (permalink / raw)
  To: gcc Patches

ping^2?

On Wed, 30 Dec 2020 at 11:34, Christophe Lyon
<christophe.lyon@linaro.org> wrote:
>
> ping?
>
> On Thu, 17 Dec 2020 at 18:48, Christophe Lyon
> <christophe.lyon@linaro.org> wrote:
> >
> > This patch enables MVE vshlq instructions for auto-vectorization.
> >
> > The existing mve_vshlq_n_<supf><mode> is kept, as it takes a single
> > immediate as second operand, and is used by arm_mve.h.
> >
> > We move the vashl<mode>3 insn from neon.md to an expander in
> > vec-common.md, and the mve_vshlq_<supf><mode> insn from mve.md to
> > vec-common.md, adding the second alternative from neon.md.
> >
> > mve_vshlq_<supf><mode> will be used by a later patch enabling
> > vectorization for vshr, as a unified version of
> > ashl<mode>3_[signed|unsigned] from neon.md. Keeping the use of unspec
> > VSHLQ enables to generate both 's' and 'u' variants.
> >
> > It is not clear whether the neon_shift_[reg|imm]<q> attribute is still
> > suitable, since this insn is also used for MVE.
> >
> > I kept the mve_vshlq_<supf><mode> naming instead of renaming it to
> > ashl3_<supf>_<mode> as discussed because the reference in
> > arm_mve_builtins.def automatically inserts the "mve_" prefix and I
> > didn't want to make a special case for this.
> >
> > I haven't yet found why the v16qi and v8hi tests are not vectorized.
> > With dest[i] = a[i] << b[i] and:
> >   {
> >     int i;
> >     unsigned int i.24_1;
> >     unsigned int _2;
> >     int16_t * _3;
> >     short int _4;
> >     int _5;
> >     int16_t * _6;
> >     short int _7;
> >     int _8;
> >     int _9;
> >     int16_t * _10;
> >     short int _11;
> >     unsigned int ivtmp_42;
> >     unsigned int ivtmp_43;
> >
> >     <bb 2> [local count: 119292720]:
> >
> >     <bb 3> [local count: 954449105]:
> >     i.24_1 = (unsigned int) i_23;
> >     _2 = i.24_1 * 2;
> >     _3 = a_15(D) + _2;
> >     _4 = *_3;
> >     _5 = (int) _4;
> >     _6 = b_16(D) + _2;
> >     _7 = *_6;
> >     _8 = (int) _7;
> >     _9 = _5 << _8;
> >     _10 = dest_17(D) + _2;
> >     _11 = (short int) _9;
> >     *_10 = _11;
> >     i_19 = i_23 + 1;
> >     ivtmp_42 = ivtmp_43 - 1;
> >     if (ivtmp_42 != 0)
> >       goto <bb 5>; [87.50%]
> >     else
> >       goto <bb 4>; [12.50%]
> >
> >     <bb 5> [local count: 835156386]:
> >     goto <bb 3>; [100.00%]
> >
> >     <bb 4> [local count: 119292720]:
> >     return;
> >
> >   }
> > the vectorizer says:
> > mve-vshl.c:37:96: note:   ==> examining statement: _5 = (int) _4;
> > mve-vshl.c:37:96: note:   vect_is_simple_use: operand *_3, type of def: internal
> > mve-vshl.c:37:96: note:   vect_is_simple_use: vectype vector(8) short int
> > mve-vshl.c:37:96: missed:   conversion not supported by target.
> > mve-vshl.c:37:96: note:   vect_is_simple_use: operand *_3, type of def: internal
> > mve-vshl.c:37:96: note:   vect_is_simple_use: vectype vector(8) short int
> > mve-vshl.c:37:96: note:   vect_is_simple_use: operand *_3, type of def: internal
> > mve-vshl.c:37:96: note:   vect_is_simple_use: vectype vector(8) short int
> > mve-vshl.c:37:117: missed:   not vectorized: relevant stmt not supported: _5 = (int) _4;
> > mve-vshl.c:37:96: missed:  bad operation or unsupported loop bound.
> > mve-vshl.c:37:96: note:  ***** Analysis failed with vector mode V8HI
> >
> > 2020-12-03  Christophe Lyon  <christophe.lyon@linaro.org>
> >
> >         gcc/
> >         * config/arm/mve.md (mve_vshlq_<supf><mode>): Move to
> >         vec-common.md.
> >         * config/arm/neon.md (vashl<mode>3): Delete.
> >         * config/arm/vec-common.md (mve_vshlq_<supf><mode>): New.
> >         (vashl<mode>3): New expander.
> >
> >         gcc/testsuite/
> >         * gcc.target/arm/simd/mve-vshl.c: Add tests for vshl.
> > ---
> >  gcc/config/arm/mve.md                        | 13 +-----
> >  gcc/config/arm/neon.md                       | 19 ---------
> >  gcc/config/arm/vec-common.md                 | 30 ++++++++++++++
> >  gcc/testsuite/gcc.target/arm/simd/mve-vshl.c | 62 ++++++++++++++++++++++++++++
> >  4 files changed, 93 insertions(+), 31 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
> >
> > diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> > index 673a83c..8bdb451 100644
> > --- a/gcc/config/arm/mve.md
> > +++ b/gcc/config/arm/mve.md
> > @@ -822,18 +822,7 @@ (define_insn "mve_vcmpneq_<supf><mode>"
> >
> >  ;;
> >  ;; [vshlq_s, vshlq_u])
> > -;;
> > -(define_insn "mve_vshlq_<supf><mode>"
> > -  [
> > -   (set (match_operand:MVE_2 0 "s_register_operand" "=w")
> > -       (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")
> > -                      (match_operand:MVE_2 2 "s_register_operand" "w")]
> > -        VSHLQ))
> > -  ]
> > -  "TARGET_HAVE_MVE"
> > -  "vshl.<supf>%#<V_sz_elem>\t%q0, %q1, %q2"
> > -  [(set_attr "type" "mve_move")
> > -])
> > +;; See vec-common.md
> >
> >  ;;
> >  ;; [vabdq_s, vabdq_u])
> > diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> > index 50220be..ac9bf74 100644
> > --- a/gcc/config/arm/neon.md
> > +++ b/gcc/config/arm/neon.md
> > @@ -845,25 +845,6 @@ (define_insn "*smax<mode>3_neon"
> >  ; generic vectorizer code.  It ends up creating a V2DI constructor with
> >  ; SImode elements.
> >
> > -(define_insn "vashl<mode>3"
> > -  [(set (match_operand:VDQIW 0 "s_register_operand" "=w,w")
> > -       (ashift:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w,w")
> > -                     (match_operand:VDQIW 2 "imm_lshift_or_reg_neon" "w,Dm")))]
> > -  "TARGET_NEON"
> > -  {
> > -    switch (which_alternative)
> > -      {
> > -        case 0: return "vshl.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2";
> > -        case 1: return neon_output_shift_immediate ("vshl", 'i', &operands[2],
> > -                                                   <MODE>mode,
> > -                                                   VALID_NEON_QREG_MODE (<MODE>mode),
> > -                                                   true);
> > -        default: gcc_unreachable ();
> > -      }
> > -  }
> > -  [(set_attr "type" "neon_shift_reg<q>, neon_shift_imm<q>")]
> > -)
> > -
> >  (define_insn "vashr<mode>3_imm"
> >    [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
> >         (ashiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w")
> > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> > index f6a79e2..3a282f0 100644
> > --- a/gcc/config/arm/vec-common.md
> > +++ b/gcc/config/arm/vec-common.md
> > @@ -229,3 +229,33 @@ (define_expand "movmisalign<mode>"
> >    if (!neon_vector_mem_operand (adjust_mem, 2, true))
> >      XEXP (adjust_mem, 0) = force_reg (Pmode, XEXP (adjust_mem, 0));
> >  })
> > +
> > +(define_insn "mve_vshlq_<supf><mode>"
> > +  [(set (match_operand:VDQIW 0 "s_register_operand" "=w,w")
> > +       (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w,w")
> > +                      (match_operand:VDQIW 2 "imm_lshift_or_reg_neon" "w,Dm")]
> > +        VSHLQ))]
> > +  "ARM_HAVE_<MODE>_ARITH"
> > +{
> > +  switch (which_alternative)
> > +    {
> > +      case 0: return "vshl.<supf>%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2";
> > +      case 1: return neon_output_shift_immediate ("vshl", 'i', &operands[2],
> > +                                                 <MODE>mode,
> > +                                                 VALID_NEON_QREG_MODE (<MODE>mode),
> > +                                                 true);
> > +      default: gcc_unreachable ();
> > +    }
> > +}
> > +  [(set_attr "type" "neon_shift_reg<q>, neon_shift_imm<q>")]
> > +)
> > +
> > +(define_expand "vashl<mode>3"
> > +  [(set (match_operand:VDQIW 0 "s_register_operand" "")
> > +       (ashift:VDQIW (match_operand:VDQIW 1 "s_register_operand" "")
> > +                     (match_operand:VDQIW 2 "imm_lshift_or_reg_neon" "")))]
> > +  "ARM_HAVE_<MODE>_ARITH"
> > +{
> > +  emit_insn (gen_mve_vshlq_u<mode> (operands[0], operands[1], operands[2]));
> > +  DONE;
> > +})
> > \ No newline at end of file
> > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c b/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
> > new file mode 100644
> > index 0000000..7a06449
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
> > @@ -0,0 +1,62 @@
> > +/* { dg-do assemble } */
> > +/* { dg-require-effective-target arm_v8_1m_mve_ok } */
> > +/* { dg-add-options arm_v8_1m_mve } */
> > +/* { dg-additional-options "-O3" } */
> > +
> > +#include <stdint.h>
> > +
> > +#define FUNC(SIGN, TYPE, BITS, NB, OP, NAME)                           \
> > +  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t * __restrict__ dest, TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \
> > +    int i;                                                             \
> > +    for (i=0; i<NB; i++) {                                             \
> > +      dest[i] = a[i] OP b[i];                                          \
> > +    }                                                                  \
> > +}
> > +
> > +#define FUNC_IMM(SIGN, TYPE, BITS, NB, OP, NAME)                               \
> > +  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t * __restrict__ dest, TYPE##BITS##_t *a) { \
> > +    int i;                                                             \
> > +    for (i=0; i<NB; i++) {                                             \
> > +      dest[i] = a[i] OP 5;                                             \
> > +    }                                                                  \
> > +}
> > +
> > +/* 64-bit vectors.  */
> > +FUNC(s, int, 32, 2, <<, vshl)
> > +FUNC(u, uint, 32, 2, <<, vshl)
> > +FUNC(s, int, 16, 4, <<, vshl)
> > +FUNC(u, uint, 16, 4, <<, vshl)
> > +FUNC(s, int, 8, 8, <<, vshl)
> > +FUNC(u, uint, 8, 8, <<, vshl)
> > +
> > +/* 128-bit vectors.  */
> > +FUNC(s, int, 32, 4, <<, vshl)
> > +FUNC(u, uint, 32, 4, <<, vshl)
> > +FUNC(s, int, 16, 8, <<, vshl)  /* FIXME: not vectorized */
> > +FUNC(u, uint, 16, 8, <<, vshl) /* FIXME: not vectorized */
> > +FUNC(s, int, 8, 16, <<, vshl)  /* FIXME: not vectorized */
> > +FUNC(u, uint, 8, 16, <<, vshl) /* FIXME: not vectorized */
> > +
> > +/* 64-bit vectors.  */
> > +FUNC_IMM(s, int, 32, 2, <<, vshlimm)
> > +FUNC_IMM(u, uint, 32, 2, <<, vshlimm)
> > +FUNC_IMM(s, int, 16, 4, <<, vshlimm)
> > +FUNC_IMM(u, uint, 16, 4, <<, vshlimm)
> > +FUNC_IMM(s, int, 8, 8, <<, vshlimm)
> > +FUNC_IMM(u, uint, 8, 8, <<, vshlimm)
> > +
> > +/* 128-bit vectors.  */
> > +FUNC_IMM(s, int, 32, 4, <<, vshlimm)
> > +FUNC_IMM(u, uint, 32, 4, <<, vshlimm)
> > +FUNC_IMM(s, int, 16, 8, <<, vshlimm)
> > +FUNC_IMM(u, uint, 16, 8, <<, vshlimm)
> > +FUNC_IMM(s, int, 8, 16, <<, vshlimm)
> > +FUNC_IMM(u, uint, 8, 16, <<, vshlimm)
> > +
> > +/* MVE has only 128-bit vectors, so we can vectorize only half of the
> > +   functions above.  */
> > +/* We only emit vshl.u, which is equivalent to vshl.s anyway.  */
> > +/* { dg-final { scan-assembler-times {vshl.u[0-9]+\tq[0-9]+, q[0-9]+} 2 } } */
> > +
> > +/* We emit vshl.i when the shift amount is an immediate.  */
> > +/* { dg-final { scan-assembler-times {vshl.i[0-9]+\tq[0-9]+, q[0-9]+} 6 } } */
> > --
> > 2.7.4
> >

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/3] arm: Auto-vectorization for MVE: vshr
  2020-12-30 10:34   ` Christophe Lyon
@ 2021-01-07 12:20     ` Christophe Lyon
  2021-01-15  9:29       ` Christophe Lyon
  0 siblings, 1 reply; 16+ messages in thread
From: Christophe Lyon @ 2021-01-07 12:20 UTC (permalink / raw)
  To: gcc Patches

ping^2?

On Wed, 30 Dec 2020 at 11:34, Christophe Lyon
<christophe.lyon@linaro.org> wrote:
>
> ping?
>
> On Thu, 17 Dec 2020 at 18:48, Christophe Lyon
> <christophe.lyon@linaro.org> wrote:
> >
> > This patch enables MVE vshr instructions for auto-vectorization.  New
> > MVE patterns are introduced that take a vector of constants as second
> > operand, all constants being equal.
> >
> > The existing mve_vshrq_n_<supf><mode> is kept, as it takes a single
> > immediate as second operand, and is used by arm_mve.h.
> >
> > The vashr<mode>3 and vlshr<mode>3 expanders are moved from neon.md to
> > vec-common.md, updated to rely on the normal expansion scheme to
> > generate shifts by immediate.
> >
> > 2020-12-03  Christophe Lyon  <christophe.lyon@linaro.org>
> >
> >         gcc/
> >         * config/arm/mve.md (mve_vshrq_n_s<mode>_imm): New entry.
> >         (mve_vshrq_n_u<mode>_imm): Likewise.
> >         * config/arm/neon.md (vashr<mode>3, vlshr<mode>3): Move to ...
> >         * config/arm/vec-common.md: ... here.
> >
> >         gcc/testsuite/
> >         * gcc.target/arm/simd/mve-vshr.c: Add tests for vshr.
> > ---
> >  gcc/config/arm/mve.md                        | 34 ++++++++++++++++
> >  gcc/config/arm/neon.md                       | 34 ----------------
> >  gcc/config/arm/vec-common.md                 | 38 +++++++++++++++++-
> >  gcc/testsuite/gcc.target/arm/simd/mve-vshr.c | 59 ++++++++++++++++++++++++++++
> >  4 files changed, 130 insertions(+), 35 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vshr.c
> >
> > diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> > index 8bdb451..eea8b20 100644
> > --- a/gcc/config/arm/mve.md
> > +++ b/gcc/config/arm/mve.md
> > @@ -763,6 +763,7 @@ (define_insn "mve_vcreateq_<supf><mode>"
> >  ;;
> >  ;; [vshrq_n_s, vshrq_n_u])
> >  ;;
> > +;; Version that takes an immediate as operand 2.
> >  (define_insn "mve_vshrq_n_<supf><mode>"
> >    [
> >     (set (match_operand:MVE_2 0 "s_register_operand" "=w")
> > @@ -775,6 +776,39 @@ (define_insn "mve_vshrq_n_<supf><mode>"
> >    [(set_attr "type" "mve_move")
> >  ])
> >
> > +;; Versions that take constant vectors as operand 2 (with all elements
> > +;; equal).
> > +(define_insn "mve_vshrq_n_s<mode>_imm"
> > +  [
> > +   (set (match_operand:MVE_2 0 "s_register_operand" "=w")
> > +       (ashiftrt:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w")
> > +                       (match_operand:MVE_2 2 "imm_for_neon_rshift_operand" "i")))
> > +  ]
> > +  "TARGET_HAVE_MVE"
> > +  {
> > +    return neon_output_shift_immediate ("vshr", 's', &operands[2],
> > +                                       <MODE>mode,
> > +                                       VALID_NEON_QREG_MODE (<MODE>mode),
> > +                                       true);
> > +  }
> > +  [(set_attr "type" "mve_move")
> > +])
> > +(define_insn "mve_vshrq_n_u<mode>_imm"
> > +  [
> > +   (set (match_operand:MVE_2 0 "s_register_operand" "=w")
> > +       (lshiftrt:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w")
> > +                       (match_operand:MVE_2 2 "imm_for_neon_rshift_operand" "i")))
> > +  ]
> > +  "TARGET_HAVE_MVE"
> > +  {
> > +    return neon_output_shift_immediate ("vshr", 'u', &operands[2],
> > +                                       <MODE>mode,
> > +                                       VALID_NEON_QREG_MODE (<MODE>mode),
> > +                                       true);
> > +  }
> > +  [(set_attr "type" "mve_move")
> > +])
> > +
> >  ;;
> >  ;; [vcvtq_n_from_f_s, vcvtq_n_from_f_u])
> >  ;;
> > diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> > index ac9bf74..a0e8d7a 100644
> > --- a/gcc/config/arm/neon.md
> > +++ b/gcc/config/arm/neon.md
> > @@ -899,40 +899,6 @@ (define_insn "ashl<mode>3_unsigned"
> >    [(set_attr "type" "neon_shift_reg<q>")]
> >  )
> >
> > -(define_expand "vashr<mode>3"
> > -  [(set (match_operand:VDQIW 0 "s_register_operand")
> > -       (ashiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand")
> > -                       (match_operand:VDQIW 2 "imm_rshift_or_reg_neon")))]
> > -  "TARGET_NEON"
> > -{
> > -  if (s_register_operand (operands[2], <MODE>mode))
> > -    {
> > -      rtx neg = gen_reg_rtx (<MODE>mode);
> > -      emit_insn (gen_neon_neg<mode>2 (neg, operands[2]));
> > -      emit_insn (gen_ashl<mode>3_signed (operands[0], operands[1], neg));
> > -    }
> > -  else
> > -    emit_insn (gen_vashr<mode>3_imm (operands[0], operands[1], operands[2]));
> > -  DONE;
> > -})
> > -
> > -(define_expand "vlshr<mode>3"
> > -  [(set (match_operand:VDQIW 0 "s_register_operand")
> > -       (lshiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand")
> > -                       (match_operand:VDQIW 2 "imm_rshift_or_reg_neon")))]
> > -  "TARGET_NEON"
> > -{
> > -  if (s_register_operand (operands[2], <MODE>mode))
> > -    {
> > -      rtx neg = gen_reg_rtx (<MODE>mode);
> > -      emit_insn (gen_neon_neg<mode>2 (neg, operands[2]));
> > -      emit_insn (gen_ashl<mode>3_unsigned (operands[0], operands[1], neg));
> > -    }
> > -  else
> > -    emit_insn (gen_vlshr<mode>3_imm (operands[0], operands[1], operands[2]));
> > -  DONE;
> > -})
> > -
> >  ;; 64-bit shifts
> >
> >  ;; This pattern loads a 32-bit shift count into a 64-bit NEON register,
> > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> > index 3a282f0..e126557 100644
> > --- a/gcc/config/arm/vec-common.md
> > +++ b/gcc/config/arm/vec-common.md
> > @@ -258,4 +258,40 @@ (define_expand "vashl<mode>3"
> >  {
> >    emit_insn (gen_mve_vshlq_u<mode> (operands[0], operands[1], operands[2]));
> >    DONE;
> > -})
> > \ No newline at end of file
> > +})
> > +
> > +;; When operand 2 is an immediate, use the normal expansion to match
> > +;; gen_vashr<mode>3_imm for Neon and gen_mve_vshrq_n_s<mode>_imm for
> > +;; MVE.
> > +(define_expand "vashr<mode>3"
> > +  [(set (match_operand:VDQIW 0 "s_register_operand")
> > +       (ashiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand")
> > +                       (match_operand:VDQIW 2 "imm_rshift_or_reg_neon")))]
> > +  "ARM_HAVE_<MODE>_ARITH"
> > +{
> > +  if (s_register_operand (operands[2], <MODE>mode))
> > +    {
> > +      rtx neg = gen_reg_rtx (<MODE>mode);
> > +      emit_insn (gen_neg<mode>2 (neg, operands[2]));
> > +      emit_insn (gen_mve_vshlq_s<mode> (operands[0], operands[1], neg));
> > +      DONE;
> > +    }
> > +})
> > +
> > +;; When operand 2 is an immediate, use the normal expansion to match
> > +;; gen_vashr<mode>3_imm for Neon and gen_mve_vshrq_n_u<mode>_imm for
> > +;; MVE.
> > +(define_expand "vlshr<mode>3"
> > +  [(set (match_operand:VDQIW 0 "s_register_operand")
> > +       (lshiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand")
> > +                       (match_operand:VDQIW 2 "imm_rshift_or_reg_neon")))]
> > +  "ARM_HAVE_<MODE>_ARITH"
> > +{
> > +  if (s_register_operand (operands[2], <MODE>mode))
> > +    {
> > +      rtx neg = gen_reg_rtx (<MODE>mode);
> > +      emit_insn (gen_neg<mode>2 (neg, operands[2]));
> > +      emit_insn (gen_mve_vshlq_u<mode> (operands[0], operands[1], neg));
> > +      DONE;
> > +    }
> > +})
> > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vshr.c b/gcc/testsuite/gcc.target/arm/simd/mve-vshr.c
> > new file mode 100644
> > index 0000000..d4e658c
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vshr.c
> > @@ -0,0 +1,59 @@
> > +/* { dg-do assemble } */
> > +/* { dg-require-effective-target arm_v8_1m_mve_ok } */
> > +/* { dg-add-options arm_v8_1m_mve } */
> > +/* { dg-additional-options "-O3" } */
> > +
> > +#include <stdint.h>
> > +
> > +#define FUNC(SIGN, TYPE, BITS, NB, OP, NAME)                           \
> > +  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t * __restrict__ dest, TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \
> > +    int i;                                                             \
> > +    for (i=0; i<NB; i++) {                                             \
> > +      dest[i] = a[i] OP b[i];                                          \
> > +    }                                                                  \
> > +}
> > +
> > +#define FUNC_IMM(SIGN, TYPE, BITS, NB, OP, NAME)                               \
> > +  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t * __restrict__ dest, TYPE##BITS##_t *a) { \
> > +    int i;                                                             \
> > +    for (i=0; i<NB; i++) {                                             \
> > +      dest[i] = a[i] OP 5;                                             \
> > +    }                                                                  \
> > +}
> > +
> > +/* 64-bit vectors.  */
> > +FUNC(s, int, 32, 2, >>, vshr)
> > +FUNC(u, uint, 32, 2, >>, vshr)
> > +FUNC(s, int, 16, 4, >>, vshr)
> > +FUNC(u, uint, 16, 4, >>, vshr)
> > +FUNC(s, int, 8, 8, >>, vshr)
> > +FUNC(u, uint, 8, 8, >>, vshr)
> > +
> > +/* 128-bit vectors.  */
> > +FUNC(s, int, 32, 4, >>, vshr)
> > +FUNC(u, uint, 32, 4, >>, vshr)
> > +FUNC(s, int, 16, 8, >>, vshr)
> > +FUNC(u, uint, 16, 8, >>, vshr)
> > +FUNC(s, int, 8, 16, >>, vshr)
> > +FUNC(u, uint, 8, 16, >>, vshr)
> > +
> > +/* 64-bit vectors.  */
> > +FUNC_IMM(s, int, 32, 2, >>, vshrimm)
> > +FUNC_IMM(u, uint, 32, 2, >>, vshrimm)
> > +FUNC_IMM(s, int, 16, 4, >>, vshrimm)
> > +FUNC_IMM(u, uint, 16, 4, >>, vshrimm)
> > +FUNC_IMM(s, int, 8, 8, >>, vshrimm)
> > +FUNC_IMM(u, uint, 8, 8, >>, vshrimm)
> > +
> > +/* 128-bit vectors.  */
> > +FUNC_IMM(s, int, 32, 4, >>, vshrimm)
> > +FUNC_IMM(u, uint, 32, 4, >>, vshrimm)
> > +FUNC_IMM(s, int, 16, 8, >>, vshrimm)
> > +FUNC_IMM(u, uint, 16, 8, >>, vshrimm)
> > +FUNC_IMM(s, int, 8, 16, >>, vshrimm)
> > +FUNC_IMM(u, uint, 8, 16, >>, vshrimm)
> > +
> > +/* MVE has only 128-bit vectors, so we can vectorize only half of the
> > +   functions above.  */
> > +/* { dg-final { scan-assembler-times {vshr.s[0-9]+\tq[0-9]+, q[0-9]+} 3 } } */
> > +/* { dg-final { scan-assembler-times {vshr.u[0-9]+\tq[0-9]+, q[0-9]+} 3 } } */
> > --
> > 2.7.4
> >

^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH 1/3] arm: Add movmisalign patterns for MVE (PR target/97875)
  2020-12-17 17:48 [PATCH 1/3] arm: Add movmisalign patterns for MVE (PR target/97875) Christophe Lyon
                   ` (2 preceding siblings ...)
  2020-12-30 10:33 ` [PATCH 1/3] arm: Add movmisalign patterns for MVE (PR target/97875) Christophe Lyon
@ 2021-01-08  9:50 ` Kyrylo Tkachov
  2021-01-08 11:04   ` Christophe Lyon
  3 siblings, 1 reply; 16+ messages in thread
From: Kyrylo Tkachov @ 2021-01-08  9:50 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: gcc Patches

Hi Christophe,

> -----Original Message-----
> From: Gcc-patches <gcc-patches-bounces@gcc.gnu.org> On Behalf Of
> Christophe Lyon via Gcc-patches
> Sent: 17 December 2020 17:48
> To: gcc-patches@gcc.gnu.org
> Subject: [PATCH 1/3] arm: Add movmisalign patterns for MVE (PR
> target/97875)
> 
> This patch adds new movmisalign<mode>_mve_load and store patterns for
> MVE to help vectorization. They are very similar to their Neon
> counterparts, but use different iterators and instructions.
> 
> Indeed MVE supports fewer vector modes than Neon, so we use
> the MVE_VLD_ST iterator where Neon uses VQX.
> 
> Since the supported modes are different from the ones valid for
> arithmetic operators, we introduce two new sets of macros:
> 
> ARM_HAVE_NEON_<MODE>_LDST
>   true if Neon has vector load/store instructions for <MODE>
> 
> ARM_HAVE_<MODE>_LDST
>   true if any vector extension has vector load/store instructions for <MODE>
> 

I'm not a big fan of the big number of these macros ☹ but I understand why they're used, so I won't object.

> We move the movmisalign<mode> expander from neon.md to vec-
> common.md, and
> replace the TARGET_NEON enabler with ARM_HAVE_<MODE>_LDST.
> 
> The patch also updates the mve-vneg.c test to scan for the better code
> generation when loading and storing the vectors involved: it checks
> that no 'orr' instruction is generated to cope with misalignment at
> runtime.
> This test was chosen among the other mve tests, but any other should
> be OK. Using a plain vector copy loop (dest[i] = a[i]) is not a good
> test because the compiler chooses to use memcpy.
> 
> For instance we now generate:
> test_vneg_s32x4:
> 	vldrw.32       q3, [r1]
> 	vneg.s32  q3, q3
> 	vstrw.32       q3, [r0]
> 	bx      lr
> 
> instead of:
> test_vneg_s32x4:
> 	orr     r3, r1, r0
> 	lsls    r3, r3, #28
> 	bne     .L15
> 	vldrw.32	q3, [r1]
> 	vneg.s32  q3, q3
> 	vstrw.32	q3, [r0]
> 	bx      lr
> 	.L15:
> 	push    {r4, r5}
> 	ldrd    r2, r3, [r1, #8]
> 	ldrd    r5, r4, [r1]
> 	rsbs    r2, r2, #0
> 	rsbs    r5, r5, #0
> 	rsbs    r4, r4, #0
> 	rsbs    r3, r3, #0
> 	strd    r5, r4, [r0]
> 	pop     {r4, r5}
> 	strd    r2, r3, [r0, #8]
> 	bx      lr
> 
> 2020-12-15  Christophe Lyon  <christophe.lyon@linaro.org>
> 
> 	PR target/97875
> 	gcc/
> 	* config/arm/arm.h (ARM_HAVE_NEON_V8QI_LDST): New macro.
> 	(ARM_HAVE_NEON_V16QI_LDST, ARM_HAVE_NEON_V4HI_LDST):
> Likewise.
> 	(ARM_HAVE_NEON_V8HI_LDST, ARM_HAVE_NEON_V2SI_LDST):
> Likewise.
> 	(ARM_HAVE_NEON_V4SI_LDST, ARM_HAVE_NEON_V4HF_LDST):
> Likewise.
> 	(ARM_HAVE_NEON_V8HF_LDST, ARM_HAVE_NEON_V4BF_LDST):
> Likewise.
> 	(ARM_HAVE_NEON_V8BF_LDST, ARM_HAVE_NEON_V2SF_LDST):
> Likewise.
> 	(ARM_HAVE_NEON_V4SF_LDST, ARM_HAVE_NEON_DI_LDST):
> Likewise.
> 	(ARM_HAVE_NEON_V2DI_LDST): Likewise.
> 	(ARM_HAVE_V8QI_LDST, ARM_HAVE_V16QI_LDST): Likewise.
> 	(ARM_HAVE_V4HI_LDST, ARM_HAVE_V8HI_LDST): Likewise.
> 	(ARM_HAVE_V2SI_LDST, ARM_HAVE_V4SI_LDST,
> ARM_HAVE_V4HF_LDST): Likewise.
> 	(ARM_HAVE_V8HF_LDST, ARM_HAVE_V4BF_LDST,
> ARM_HAVE_V8BF_LDST): Likewise.
> 	(ARM_HAVE_V2SF_LDST, ARM_HAVE_V4SF_LDST,
> ARM_HAVE_DI_LDST): Likewise.
> 	(ARM_HAVE_V2DI_LDST): Likewise.
> 	* config/arm/mve.md (*movmisalign<mode>_mve_store): New
> pattern.
> 	(*movmisalign<mode>_mve_load): New pattern.
> 	* config/arm/neon.md (movmisalign<mode>): Move to ...
> 	* config/arm/vec-common.md: ... here.
> 
> 	PR target/97875
> 	gcc/testsuite/
> 	* gcc.target/arm/simd/mve-vneg.c: Update test.
> ---
>  gcc/config/arm/arm.h                         | 40 ++++++++++++++++++++++++++++
>  gcc/config/arm/mve.md                        | 25 +++++++++++++++++
>  gcc/config/arm/neon.md                       | 25 -----------------
>  gcc/config/arm/vec-common.md                 | 24 +++++++++++++++++
>  gcc/testsuite/gcc.target/arm/simd/mve-vneg.c |  3 +++
>  5 files changed, 92 insertions(+), 25 deletions(-)
> 
> diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
> index 4a63d33..d44e0c6 100644
> --- a/gcc/config/arm/arm.h
> +++ b/gcc/config/arm/arm.h
> @@ -1151,6 +1151,46 @@ extern const int arm_arch_cde_coproc_bits[];
>  #define ARM_HAVE_V8HF_ARITH (ARM_HAVE_NEON_V8HF_ARITH ||
> TARGET_HAVE_MVE_FLOAT)
>  #define ARM_HAVE_V4SF_ARITH (ARM_HAVE_NEON_V4SF_ARITH ||
> TARGET_HAVE_MVE_FLOAT)
> 
> +/* The conditions under which vector modes are supported by load/store
> +   instructions using Neon.  */
> +
> +#define ARM_HAVE_NEON_V8QI_LDST TARGET_NEON
> +#define ARM_HAVE_NEON_V16QI_LDST TARGET_NEON
> +#define ARM_HAVE_NEON_V4HI_LDST TARGET_NEON
> +#define ARM_HAVE_NEON_V8HI_LDST TARGET_NEON
> +#define ARM_HAVE_NEON_V2SI_LDST TARGET_NEON
> +#define ARM_HAVE_NEON_V4SI_LDST TARGET_NEON
> +#define ARM_HAVE_NEON_V4HF_LDST TARGET_NEON_FP16INST
> +#define ARM_HAVE_NEON_V8HF_LDST TARGET_NEON_FP16INST
> +#define ARM_HAVE_NEON_V4BF_LDST TARGET_BF16_SIMD
> +#define ARM_HAVE_NEON_V8BF_LDST TARGET_BF16_SIMD
> +#define ARM_HAVE_NEON_V2SF_LDST TARGET_NEON
> +#define ARM_HAVE_NEON_V4SF_LDST TARGET_NEON
> +#define ARM_HAVE_NEON_DI_LDST TARGET_NEON
> +#define ARM_HAVE_NEON_V2DI_LDST TARGET_NEON
> +
> +/* The conditions under which vector modes are supported by load/store
> +   instructions by any vector extension.  */
> +
> +#define ARM_HAVE_V8QI_LDST (ARM_HAVE_NEON_V8QI_LDST ||
> TARGET_REALLY_IWMMXT)
> +#define ARM_HAVE_V4HI_LDST (ARM_HAVE_NEON_V4HI_LDST ||
> TARGET_REALLY_IWMMXT)
> +#define ARM_HAVE_V2SI_LDST (ARM_HAVE_NEON_V2SI_LDST ||
> TARGET_REALLY_IWMMXT)
> +
> +#define ARM_HAVE_V16QI_LDST (ARM_HAVE_NEON_V16QI_LDST ||
> TARGET_HAVE_MVE)
> +#define ARM_HAVE_V8HI_LDST (ARM_HAVE_NEON_V8HI_LDST ||
> TARGET_HAVE_MVE)
> +#define ARM_HAVE_V4SI_LDST (ARM_HAVE_NEON_V4SI_LDST ||
> TARGET_HAVE_MVE)
> +#define ARM_HAVE_DI_LDST ARM_HAVE_NEON_DI_LDST
> +#define ARM_HAVE_V2DI_LDST ARM_HAVE_NEON_V2DI_LDST
> +
> +#define ARM_HAVE_V4HF_LDST ARM_HAVE_NEON_V4HF_LDST
> +#define ARM_HAVE_V2SF_LDST ARM_HAVE_NEON_V2SF_LDST
> +
> +#define ARM_HAVE_V4BF_LDST ARM_HAVE_NEON_V4BF_LDST
> +#define ARM_HAVE_V8BF_LDST ARM_HAVE_NEON_V8BF_LDST
> +
> +#define ARM_HAVE_V8HF_LDST (ARM_HAVE_NEON_V8HF_LDST ||
> TARGET_HAVE_MVE_FLOAT)
> +#define ARM_HAVE_V4SF_LDST (ARM_HAVE_NEON_V4SF_LDST ||
> TARGET_HAVE_MVE_FLOAT)
> +
>  /* The register numbers in sequence, for passing to arm_gen_load_multiple.
> */
>  extern int arm_regs_in_sequence[];
> 
> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> index b4c5a1e2..673a83c 100644
> --- a/gcc/config/arm/mve.md
> +++ b/gcc/config/arm/mve.md
> @@ -10937,3 +10937,28 @@ (define_insn "arm_vcx3q<a>_p_v16qi"
>    [(set_attr "type" "coproc")
>     (set_attr "length" "8")]
>  )
> +
> +(define_insn "*movmisalign<mode>_mve_store"
> +  [(set (match_operand:MVE_VLD_ST 0 "neon_permissive_struct_operand"
> 	     "=Um")
> +	(unspec:MVE_VLD_ST [(match_operand:MVE_VLD_ST 1
> "s_register_operand" " w")]
> +	 UNSPEC_MISALIGNED_ACCESS))]
> +  "(TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode))
> +   || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE
> (<MODE>mode))
> +   && !BYTES_BIG_ENDIAN && unaligned_access"
> +  "vstr<V_sz_elem1>.<V_sz_elem>\t%q1, %E0"
> +  [(set_attr "type" "mve_store")
> +   (set_attr "length" "4")]

No need to specify the length here and in the other pattern. It's 4 by default.
Ok for master as long as bootstrap and testing on an arm-none-linux-gnueabihf target is clean (to ensure the Neon-related refactoring didn't hurt anything)
Thanks,
Kyrill


> +)
> +
> +
> +(define_insn "*movmisalign<mode>_mve_load"
> +  [(set (match_operand:MVE_VLD_ST 0 "s_register_operand"
> 		 "=w")
> +	(unspec:MVE_VLD_ST [(match_operand:MVE_VLD_ST 1
> "neon_permissive_struct_operand" " Um")]
> +	 UNSPEC_MISALIGNED_ACCESS))]
> +  "(TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode))
> +   || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE
> (<MODE>mode))
> +   && !BYTES_BIG_ENDIAN && unaligned_access"
> +  "vldr<V_sz_elem1>.<V_sz_elem>\t%q0, %E1"
> +  [(set_attr "type" "mve_load")
> +   (set_attr "length" "4")]
> +)
> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index d2e92ba..50220be 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -280,31 +280,6 @@ (define_split
>    neon_disambiguate_copy (operands, dest, src, 4);
>  })
> 
> -(define_expand "movmisalign<mode>"
> -  [(set (match_operand:VDQX 0 "neon_perm_struct_or_reg_operand")
> -	(unspec:VDQX [(match_operand:VDQX 1
> "neon_perm_struct_or_reg_operand")]
> -		     UNSPEC_MISALIGNED_ACCESS))]
> -  "TARGET_NEON && !BYTES_BIG_ENDIAN && unaligned_access"
> -{
> -  rtx adjust_mem;
> -  /* This pattern is not permitted to fail during expansion: if both arguments
> -     are non-registers (e.g. memory := constant, which can be created by the
> -     auto-vectorizer), force operand 1 into a register.  */
> -  if (!s_register_operand (operands[0], <MODE>mode)
> -      && !s_register_operand (operands[1], <MODE>mode))
> -    operands[1] = force_reg (<MODE>mode, operands[1]);
> -
> -  if (s_register_operand (operands[0], <MODE>mode))
> -    adjust_mem = operands[1];
> -  else
> -    adjust_mem = operands[0];
> -
> -  /* Legitimize address.  */
> -  if (!neon_vector_mem_operand (adjust_mem, 2, true))
> -    XEXP (adjust_mem, 0) = force_reg (Pmode, XEXP (adjust_mem, 0));
> -
> -})
> -
>  (define_insn "*movmisalign<mode>_neon_store"
>    [(set (match_operand:VDX 0 "neon_permissive_struct_operand"
> 	"=Um")
>  	(unspec:VDX [(match_operand:VDX 1 "s_register_operand" " w")]
> diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-
> common.md
> index 2d0932b..f6a79e2 100644
> --- a/gcc/config/arm/vec-common.md
> +++ b/gcc/config/arm/vec-common.md
> @@ -205,3 +205,27 @@ (define_expand "neg<mode>2"
>  	(neg:VDQWH (match_operand:VDQWH 1 "s_register_operand" "")))]
>    "ARM_HAVE_<MODE>_ARITH"
>  )
> +
> +(define_expand "movmisalign<mode>"
> +  [(set (match_operand:VDQX 0 "neon_perm_struct_or_reg_operand")
> +	(unspec:VDQX [(match_operand:VDQX 1
> "neon_perm_struct_or_reg_operand")]
> +	 UNSPEC_MISALIGNED_ACCESS))]
> +  "ARM_HAVE_<MODE>_LDST && !BYTES_BIG_ENDIAN &&
> unaligned_access"
> +{
> +  rtx adjust_mem;
> +  /* This pattern is not permitted to fail during expansion: if both arguments
> +     are non-registers (e.g. memory := constant, which can be created by the
> +     auto-vectorizer), force operand 1 into a register.  */
> +  if (!s_register_operand (operands[0], <MODE>mode)
> +      && !s_register_operand (operands[1], <MODE>mode))
> +    operands[1] = force_reg (<MODE>mode, operands[1]);
> +
> +  if (s_register_operand (operands[0], <MODE>mode))
> +    adjust_mem = operands[1];
> +  else
> +    adjust_mem = operands[0];
> +
> +  /* Legitimize address.  */
> +  if (!neon_vector_mem_operand (adjust_mem, 2, true))
> +    XEXP (adjust_mem, 0) = force_reg (Pmode, XEXP (adjust_mem, 0));
> +})
> diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vneg.c
> b/gcc/testsuite/gcc.target/arm/simd/mve-vneg.c
> index afd0d60..7945a06 100644
> --- a/gcc/testsuite/gcc.target/arm/simd/mve-vneg.c
> +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vneg.c
> @@ -47,3 +47,6 @@ FUNC(f, float, 16, 8, -, vneg)
>     functions above.  */
>  /* { dg-final { scan-assembler-times {vneg.s[0-9]+  q[0-9]+, q[0-9]+} 6 } } */
>  /* { dg-final { scan-assembler-times {vneg.f[0-9]+  q[0-9]+, q[0-9]+} 2 } } */
> +/* { dg-final { scan-assembler-times {vldr[bhw].[0-9]+\tq[0-9]+} 8 } } */
> +/* { dg-final { scan-assembler-times {vstr[bhw].[0-9]+\tq[0-9]+} 8 } } */
> +/* { dg-final { scan-assembler-not {orr\tr[0-9]+, r[0-9]+, r[0-9]+} } } */
> --
> 2.7.4


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/3] arm: Add movmisalign patterns for MVE (PR target/97875)
  2021-01-08  9:50 ` Kyrylo Tkachov
@ 2021-01-08 11:04   ` Christophe Lyon
  0 siblings, 0 replies; 16+ messages in thread
From: Christophe Lyon @ 2021-01-08 11:04 UTC (permalink / raw)
  To: Kyrylo Tkachov; +Cc: gcc Patches

On Fri, 8 Jan 2021 at 10:50, Kyrylo Tkachov <Kyrylo.Tkachov@arm.com> wrote:
>
> Hi Christophe,
>
> > -----Original Message-----
> > From: Gcc-patches <gcc-patches-bounces@gcc.gnu.org> On Behalf Of
> > Christophe Lyon via Gcc-patches
> > Sent: 17 December 2020 17:48
> > To: gcc-patches@gcc.gnu.org
> > Subject: [PATCH 1/3] arm: Add movmisalign patterns for MVE (PR
> > target/97875)
> >
> > This patch adds new movmisalign<mode>_mve_load and store patterns for
> > MVE to help vectorization. They are very similar to their Neon
> > counterparts, but use different iterators and instructions.
> >
> > Indeed MVE supports fewer vector modes than Neon, so we use
> > the MVE_VLD_ST iterator where Neon uses VQX.
> >
> > Since the supported modes are different from the ones valid for
> > arithmetic operators, we introduce two new sets of macros:
> >
> > ARM_HAVE_NEON_<MODE>_LDST
> >   true if Neon has vector load/store instructions for <MODE>
> >
> > ARM_HAVE_<MODE>_LDST
> >   true if any vector extension has vector load/store instructions for <MODE>
> >
>
> I'm not a big fan of the big number of these macros ☹ but I understand why they're used, so I won't object.

Indeed, I tried to find other ways, but it seemed better to follow the
new practice of using this new style of macros.


> > We move the movmisalign<mode> expander from neon.md to vec-
> > common.md, and
> > replace the TARGET_NEON enabler with ARM_HAVE_<MODE>_LDST.
> >
> > The patch also updates the mve-vneg.c test to scan for the better code
> > generation when loading and storing the vectors involved: it checks
> > that no 'orr' instruction is generated to cope with misalignment at
> > runtime.
> > This test was chosen among the other mve tests, but any other should
> > be OK. Using a plain vector copy loop (dest[i] = a[i]) is not a good
> > test because the compiler chooses to use memcpy.
> >
> > For instance we now generate:
> > test_vneg_s32x4:
> >       vldrw.32       q3, [r1]
> >       vneg.s32  q3, q3
> >       vstrw.32       q3, [r0]
> >       bx      lr
> >
> > instead of:
> > test_vneg_s32x4:
> >       orr     r3, r1, r0
> >       lsls    r3, r3, #28
> >       bne     .L15
> >       vldrw.32        q3, [r1]
> >       vneg.s32  q3, q3
> >       vstrw.32        q3, [r0]
> >       bx      lr
> >       .L15:
> >       push    {r4, r5}
> >       ldrd    r2, r3, [r1, #8]
> >       ldrd    r5, r4, [r1]
> >       rsbs    r2, r2, #0
> >       rsbs    r5, r5, #0
> >       rsbs    r4, r4, #0
> >       rsbs    r3, r3, #0
> >       strd    r5, r4, [r0]
> >       pop     {r4, r5}
> >       strd    r2, r3, [r0, #8]
> >       bx      lr
> >
> > 2020-12-15  Christophe Lyon  <christophe.lyon@linaro.org>
> >
> >       PR target/97875
> >       gcc/
> >       * config/arm/arm.h (ARM_HAVE_NEON_V8QI_LDST): New macro.
> >       (ARM_HAVE_NEON_V16QI_LDST, ARM_HAVE_NEON_V4HI_LDST):
> > Likewise.
> >       (ARM_HAVE_NEON_V8HI_LDST, ARM_HAVE_NEON_V2SI_LDST):
> > Likewise.
> >       (ARM_HAVE_NEON_V4SI_LDST, ARM_HAVE_NEON_V4HF_LDST):
> > Likewise.
> >       (ARM_HAVE_NEON_V8HF_LDST, ARM_HAVE_NEON_V4BF_LDST):
> > Likewise.
> >       (ARM_HAVE_NEON_V8BF_LDST, ARM_HAVE_NEON_V2SF_LDST):
> > Likewise.
> >       (ARM_HAVE_NEON_V4SF_LDST, ARM_HAVE_NEON_DI_LDST):
> > Likewise.
> >       (ARM_HAVE_NEON_V2DI_LDST): Likewise.
> >       (ARM_HAVE_V8QI_LDST, ARM_HAVE_V16QI_LDST): Likewise.
> >       (ARM_HAVE_V4HI_LDST, ARM_HAVE_V8HI_LDST): Likewise.
> >       (ARM_HAVE_V2SI_LDST, ARM_HAVE_V4SI_LDST,
> > ARM_HAVE_V4HF_LDST): Likewise.
> >       (ARM_HAVE_V8HF_LDST, ARM_HAVE_V4BF_LDST,
> > ARM_HAVE_V8BF_LDST): Likewise.
> >       (ARM_HAVE_V2SF_LDST, ARM_HAVE_V4SF_LDST,
> > ARM_HAVE_DI_LDST): Likewise.
> >       (ARM_HAVE_V2DI_LDST): Likewise.
> >       * config/arm/mve.md (*movmisalign<mode>_mve_store): New
> > pattern.
> >       (*movmisalign<mode>_mve_load): New pattern.
> >       * config/arm/neon.md (movmisalign<mode>): Move to ...
> >       * config/arm/vec-common.md: ... here.
> >
> >       PR target/97875
> >       gcc/testsuite/
> >       * gcc.target/arm/simd/mve-vneg.c: Update test.
> > ---
> >  gcc/config/arm/arm.h                         | 40 ++++++++++++++++++++++++++++
> >  gcc/config/arm/mve.md                        | 25 +++++++++++++++++
> >  gcc/config/arm/neon.md                       | 25 -----------------
> >  gcc/config/arm/vec-common.md                 | 24 +++++++++++++++++
> >  gcc/testsuite/gcc.target/arm/simd/mve-vneg.c |  3 +++
> >  5 files changed, 92 insertions(+), 25 deletions(-)
> >
> > diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
> > index 4a63d33..d44e0c6 100644
> > --- a/gcc/config/arm/arm.h
> > +++ b/gcc/config/arm/arm.h
> > @@ -1151,6 +1151,46 @@ extern const int arm_arch_cde_coproc_bits[];
> >  #define ARM_HAVE_V8HF_ARITH (ARM_HAVE_NEON_V8HF_ARITH ||
> > TARGET_HAVE_MVE_FLOAT)
> >  #define ARM_HAVE_V4SF_ARITH (ARM_HAVE_NEON_V4SF_ARITH ||
> > TARGET_HAVE_MVE_FLOAT)
> >
> > +/* The conditions under which vector modes are supported by load/store
> > +   instructions using Neon.  */
> > +
> > +#define ARM_HAVE_NEON_V8QI_LDST TARGET_NEON
> > +#define ARM_HAVE_NEON_V16QI_LDST TARGET_NEON
> > +#define ARM_HAVE_NEON_V4HI_LDST TARGET_NEON
> > +#define ARM_HAVE_NEON_V8HI_LDST TARGET_NEON
> > +#define ARM_HAVE_NEON_V2SI_LDST TARGET_NEON
> > +#define ARM_HAVE_NEON_V4SI_LDST TARGET_NEON
> > +#define ARM_HAVE_NEON_V4HF_LDST TARGET_NEON_FP16INST
> > +#define ARM_HAVE_NEON_V8HF_LDST TARGET_NEON_FP16INST
> > +#define ARM_HAVE_NEON_V4BF_LDST TARGET_BF16_SIMD
> > +#define ARM_HAVE_NEON_V8BF_LDST TARGET_BF16_SIMD
> > +#define ARM_HAVE_NEON_V2SF_LDST TARGET_NEON
> > +#define ARM_HAVE_NEON_V4SF_LDST TARGET_NEON
> > +#define ARM_HAVE_NEON_DI_LDST TARGET_NEON
> > +#define ARM_HAVE_NEON_V2DI_LDST TARGET_NEON
> > +
> > +/* The conditions under which vector modes are supported by load/store
> > +   instructions by any vector extension.  */
> > +
> > +#define ARM_HAVE_V8QI_LDST (ARM_HAVE_NEON_V8QI_LDST ||
> > TARGET_REALLY_IWMMXT)
> > +#define ARM_HAVE_V4HI_LDST (ARM_HAVE_NEON_V4HI_LDST ||
> > TARGET_REALLY_IWMMXT)
> > +#define ARM_HAVE_V2SI_LDST (ARM_HAVE_NEON_V2SI_LDST ||
> > TARGET_REALLY_IWMMXT)
> > +
> > +#define ARM_HAVE_V16QI_LDST (ARM_HAVE_NEON_V16QI_LDST ||
> > TARGET_HAVE_MVE)
> > +#define ARM_HAVE_V8HI_LDST (ARM_HAVE_NEON_V8HI_LDST ||
> > TARGET_HAVE_MVE)
> > +#define ARM_HAVE_V4SI_LDST (ARM_HAVE_NEON_V4SI_LDST ||
> > TARGET_HAVE_MVE)
> > +#define ARM_HAVE_DI_LDST ARM_HAVE_NEON_DI_LDST
> > +#define ARM_HAVE_V2DI_LDST ARM_HAVE_NEON_V2DI_LDST
> > +
> > +#define ARM_HAVE_V4HF_LDST ARM_HAVE_NEON_V4HF_LDST
> > +#define ARM_HAVE_V2SF_LDST ARM_HAVE_NEON_V2SF_LDST
> > +
> > +#define ARM_HAVE_V4BF_LDST ARM_HAVE_NEON_V4BF_LDST
> > +#define ARM_HAVE_V8BF_LDST ARM_HAVE_NEON_V8BF_LDST
> > +
> > +#define ARM_HAVE_V8HF_LDST (ARM_HAVE_NEON_V8HF_LDST ||
> > TARGET_HAVE_MVE_FLOAT)
> > +#define ARM_HAVE_V4SF_LDST (ARM_HAVE_NEON_V4SF_LDST ||
> > TARGET_HAVE_MVE_FLOAT)
> > +
> >  /* The register numbers in sequence, for passing to arm_gen_load_multiple.
> > */
> >  extern int arm_regs_in_sequence[];
> >
> > diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> > index b4c5a1e2..673a83c 100644
> > --- a/gcc/config/arm/mve.md
> > +++ b/gcc/config/arm/mve.md
> > @@ -10937,3 +10937,28 @@ (define_insn "arm_vcx3q<a>_p_v16qi"
> >    [(set_attr "type" "coproc")
> >     (set_attr "length" "8")]
> >  )
> > +
> > +(define_insn "*movmisalign<mode>_mve_store"
> > +  [(set (match_operand:MVE_VLD_ST 0 "neon_permissive_struct_operand"
> >            "=Um")
> > +     (unspec:MVE_VLD_ST [(match_operand:MVE_VLD_ST 1
> > "s_register_operand" " w")]
> > +      UNSPEC_MISALIGNED_ACCESS))]
> > +  "(TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode))
> > +   || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE
> > (<MODE>mode))
> > +   && !BYTES_BIG_ENDIAN && unaligned_access"
> > +  "vstr<V_sz_elem1>.<V_sz_elem>\t%q1, %E0"
> > +  [(set_attr "type" "mve_store")
> > +   (set_attr "length" "4")]
>
> No need to specify the length here and in the other pattern. It's 4 by default.
OK, I'll remove it.

> Ok for master as long as bootstrap and testing on an arm-none-linux-gnueabihf target is clean (to ensure the Neon-related refactoring didn't hurt anything)
Bootstrap in progress, I'll commit if everything is OK.

Thanks

Christophe

> Thanks,
> Kyrill
>
>
> > +)
> > +
> > +
> > +(define_insn "*movmisalign<mode>_mve_load"
> > +  [(set (match_operand:MVE_VLD_ST 0 "s_register_operand"
> >                "=w")
> > +     (unspec:MVE_VLD_ST [(match_operand:MVE_VLD_ST 1
> > "neon_permissive_struct_operand" " Um")]
> > +      UNSPEC_MISALIGNED_ACCESS))]
> > +  "(TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode))
> > +   || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE
> > (<MODE>mode))
> > +   && !BYTES_BIG_ENDIAN && unaligned_access"
> > +  "vldr<V_sz_elem1>.<V_sz_elem>\t%q0, %E1"
> > +  [(set_attr "type" "mve_load")
> > +   (set_attr "length" "4")]
> > +)
> > diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> > index d2e92ba..50220be 100644
> > --- a/gcc/config/arm/neon.md
> > +++ b/gcc/config/arm/neon.md
> > @@ -280,31 +280,6 @@ (define_split
> >    neon_disambiguate_copy (operands, dest, src, 4);
> >  })
> >
> > -(define_expand "movmisalign<mode>"
> > -  [(set (match_operand:VDQX 0 "neon_perm_struct_or_reg_operand")
> > -     (unspec:VDQX [(match_operand:VDQX 1
> > "neon_perm_struct_or_reg_operand")]
> > -                  UNSPEC_MISALIGNED_ACCESS))]
> > -  "TARGET_NEON && !BYTES_BIG_ENDIAN && unaligned_access"
> > -{
> > -  rtx adjust_mem;
> > -  /* This pattern is not permitted to fail during expansion: if both arguments
> > -     are non-registers (e.g. memory := constant, which can be created by the
> > -     auto-vectorizer), force operand 1 into a register.  */
> > -  if (!s_register_operand (operands[0], <MODE>mode)
> > -      && !s_register_operand (operands[1], <MODE>mode))
> > -    operands[1] = force_reg (<MODE>mode, operands[1]);
> > -
> > -  if (s_register_operand (operands[0], <MODE>mode))
> > -    adjust_mem = operands[1];
> > -  else
> > -    adjust_mem = operands[0];
> > -
> > -  /* Legitimize address.  */
> > -  if (!neon_vector_mem_operand (adjust_mem, 2, true))
> > -    XEXP (adjust_mem, 0) = force_reg (Pmode, XEXP (adjust_mem, 0));
> > -
> > -})
> > -
> >  (define_insn "*movmisalign<mode>_neon_store"
> >    [(set (match_operand:VDX 0 "neon_permissive_struct_operand"
> >       "=Um")
> >       (unspec:VDX [(match_operand:VDX 1 "s_register_operand" " w")]
> > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-
> > common.md
> > index 2d0932b..f6a79e2 100644
> > --- a/gcc/config/arm/vec-common.md
> > +++ b/gcc/config/arm/vec-common.md
> > @@ -205,3 +205,27 @@ (define_expand "neg<mode>2"
> >       (neg:VDQWH (match_operand:VDQWH 1 "s_register_operand" "")))]
> >    "ARM_HAVE_<MODE>_ARITH"
> >  )
> > +
> > +(define_expand "movmisalign<mode>"
> > +  [(set (match_operand:VDQX 0 "neon_perm_struct_or_reg_operand")
> > +     (unspec:VDQX [(match_operand:VDQX 1
> > "neon_perm_struct_or_reg_operand")]
> > +      UNSPEC_MISALIGNED_ACCESS))]
> > +  "ARM_HAVE_<MODE>_LDST && !BYTES_BIG_ENDIAN &&
> > unaligned_access"
> > +{
> > +  rtx adjust_mem;
> > +  /* This pattern is not permitted to fail during expansion: if both arguments
> > +     are non-registers (e.g. memory := constant, which can be created by the
> > +     auto-vectorizer), force operand 1 into a register.  */
> > +  if (!s_register_operand (operands[0], <MODE>mode)
> > +      && !s_register_operand (operands[1], <MODE>mode))
> > +    operands[1] = force_reg (<MODE>mode, operands[1]);
> > +
> > +  if (s_register_operand (operands[0], <MODE>mode))
> > +    adjust_mem = operands[1];
> > +  else
> > +    adjust_mem = operands[0];
> > +
> > +  /* Legitimize address.  */
> > +  if (!neon_vector_mem_operand (adjust_mem, 2, true))
> > +    XEXP (adjust_mem, 0) = force_reg (Pmode, XEXP (adjust_mem, 0));
> > +})
> > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vneg.c
> > b/gcc/testsuite/gcc.target/arm/simd/mve-vneg.c
> > index afd0d60..7945a06 100644
> > --- a/gcc/testsuite/gcc.target/arm/simd/mve-vneg.c
> > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vneg.c
> > @@ -47,3 +47,6 @@ FUNC(f, float, 16, 8, -, vneg)
> >     functions above.  */
> >  /* { dg-final { scan-assembler-times {vneg.s[0-9]+  q[0-9]+, q[0-9]+} 6 } } */
> >  /* { dg-final { scan-assembler-times {vneg.f[0-9]+  q[0-9]+, q[0-9]+} 2 } } */
> > +/* { dg-final { scan-assembler-times {vldr[bhw].[0-9]+\tq[0-9]+} 8 } } */
> > +/* { dg-final { scan-assembler-times {vstr[bhw].[0-9]+\tq[0-9]+} 8 } } */
> > +/* { dg-final { scan-assembler-not {orr\tr[0-9]+, r[0-9]+, r[0-9]+} } } */
> > --
> > 2.7.4
>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/3] arm: Auto-vectorization for MVE: vshl
  2021-01-07 12:20     ` Christophe Lyon
@ 2021-01-15  9:29       ` Christophe Lyon
  0 siblings, 0 replies; 16+ messages in thread
From: Christophe Lyon @ 2021-01-15  9:29 UTC (permalink / raw)
  To: gcc Patches

ping^3?

On Thu, 7 Jan 2021 at 13:20, Christophe Lyon <christophe.lyon@linaro.org> wrote:
>
> ping^2?
>
> On Wed, 30 Dec 2020 at 11:34, Christophe Lyon
> <christophe.lyon@linaro.org> wrote:
> >
> > ping?
> >
> > On Thu, 17 Dec 2020 at 18:48, Christophe Lyon
> > <christophe.lyon@linaro.org> wrote:
> > >
> > > This patch enables MVE vshlq instructions for auto-vectorization.
> > >
> > > The existing mve_vshlq_n_<supf><mode> is kept, as it takes a single
> > > immediate as second operand, and is used by arm_mve.h.
> > >
> > > We move the vashl<mode>3 insn from neon.md to an expander in
> > > vec-common.md, and the mve_vshlq_<supf><mode> insn from mve.md to
> > > vec-common.md, adding the second alternative from neon.md.
> > >
> > > mve_vshlq_<supf><mode> will be used by a later patch enabling
> > > vectorization for vshr, as a unified version of
> > > ashl3<mode3>_[signed|unsigned] from neon.md. Keeping the use of unspec
> > > VSHLQ enables to generate both 's' and 'u' variants.
> > >
> > > It is not clear whether the neon_shift_[reg|imm]<q> attribute is still
> > > suitable, since this insn is also used for MVE.
> > >
> > > I kept the mve_vshlq_<supf><mode> naming instead of renaming it to
> > > ashl3_<supf>_<mode> as discussed because the reference in
> > > arm_mve_builtins.def automatically inserts the "mve_" prefix and I
> > > didn't want to make a special case for this.
> > >
> > > I haven't yet found why the v16qi and v8hi tests are not vectorized.
> > > With dest[i] = a[i] << b[i] and:
> > >   {
> > >     int i;
> > >     unsigned int i.24_1;
> > >     unsigned int _2;
> > >     int16_t * _3;
> > >     short int _4;
> > >     int _5;
> > >     int16_t * _6;
> > >     short int _7;
> > >     int _8;
> > >     int _9;
> > >     int16_t * _10;
> > >     short int _11;
> > >     unsigned int ivtmp_42;
> > >     unsigned int ivtmp_43;
> > >
> > >     <bb 2> [local count: 119292720]:
> > >
> > >     <bb 3> [local count: 954449105]:
> > >     i.24_1 = (unsigned int) i_23;
> > >     _2 = i.24_1 * 2;
> > >     _3 = a_15(D) + _2;
> > >     _4 = *_3;
> > >     _5 = (int) _4;
> > >     _6 = b_16(D) + _2;
> > >     _7 = *_6;
> > >     _8 = (int) _7;
> > >     _9 = _5 << _8;
> > >     _10 = dest_17(D) + _2;
> > >     _11 = (short int) _9;
> > >     *_10 = _11;
> > >     i_19 = i_23 + 1;
> > >     ivtmp_42 = ivtmp_43 - 1;
> > >     if (ivtmp_42 != 0)
> > >       goto <bb 5>; [87.50%]
> > >     else
> > >       goto <bb 4>; [12.50%]
> > >
> > >     <bb 5> [local count: 835156386]:
> > >     goto <bb 3>; [100.00%]
> > >
> > >     <bb 4> [local count: 119292720]:
> > >     return;
> > >
> > >   }
> > > the vectorizer says:
> > > mve-vshl.c:37:96: note:   ==> examining statement: _5 = (int) _4;
> > > mve-vshl.c:37:96: note:   vect_is_simple_use: operand *_3, type of def: internal
> > > mve-vshl.c:37:96: note:   vect_is_simple_use: vectype vector(8) short int
> > > mve-vshl.c:37:96: missed:   conversion not supported by target.
> > > mve-vshl.c:37:96: note:   vect_is_simple_use: operand *_3, type of def: internal
> > > mve-vshl.c:37:96: note:   vect_is_simple_use: vectype vector(8) short int
> > > mve-vshl.c:37:96: note:   vect_is_simple_use: operand *_3, type of def: internal
> > > mve-vshl.c:37:96: note:   vect_is_simple_use: vectype vector(8) short int
> > > mve-vshl.c:37:117: missed:   not vectorized: relevant stmt not supported: _5 = (int) _4;
> > > mve-vshl.c:37:96: missed:  bad operation or unsupported loop bound.
> > > mve-vshl.c:37:96: note:  ***** Analysis failed with vector mode V8HI
> > >
> > > 2020-12-03  Christophe Lyon  <christophe.lyon@linaro.org>
> > >
> > >         gcc/
> > >         * config/arm/mve.md (mve_vshlq_<supf><mode>): Move to
> > >         vec-common.md.
> > >         * config/arm/neon.md (vashl<mode>3): Delete.
> > >         * config/arm/vec-common.md (mve_vshlq_<supf><mode>): New.
> > >         (vasl<mode>3): New expander.
> > >
> > >         gcc/testsuite/
> > >         * gcc.target/arm/simd/mve-vshl.c: Add tests for vshl.
> > > ---
> > >  gcc/config/arm/mve.md                        | 13 +-----
> > >  gcc/config/arm/neon.md                       | 19 ---------
> > >  gcc/config/arm/vec-common.md                 | 30 ++++++++++++++
> > >  gcc/testsuite/gcc.target/arm/simd/mve-vshl.c | 62 ++++++++++++++++++++++++++++
> > >  4 files changed, 93 insertions(+), 31 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
> > >
> > > diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> > > index 673a83c..8bdb451 100644
> > > --- a/gcc/config/arm/mve.md
> > > +++ b/gcc/config/arm/mve.md
> > > @@ -822,18 +822,7 @@ (define_insn "mve_vcmpneq_<supf><mode>"
> > >
> > >  ;;
> > >  ;; [vshlq_s, vshlq_u])
> > > -;;
> > > -(define_insn "mve_vshlq_<supf><mode>"
> > > -  [
> > > -   (set (match_operand:MVE_2 0 "s_register_operand" "=w")
> > > -       (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")
> > > -                      (match_operand:MVE_2 2 "s_register_operand" "w")]
> > > -        VSHLQ))
> > > -  ]
> > > -  "TARGET_HAVE_MVE"
> > > -  "vshl.<supf>%#<V_sz_elem>\t%q0, %q1, %q2"
> > > -  [(set_attr "type" "mve_move")
> > > -])
> > > +;; See vec-common.md
> > >
> > >  ;;
> > >  ;; [vabdq_s, vabdq_u])
> > > diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> > > index 50220be..ac9bf74 100644
> > > --- a/gcc/config/arm/neon.md
> > > +++ b/gcc/config/arm/neon.md
> > > @@ -845,25 +845,6 @@ (define_insn "*smax<mode>3_neon"
> > >  ; generic vectorizer code.  It ends up creating a V2DI constructor with
> > >  ; SImode elements.
> > >
> > > -(define_insn "vashl<mode>3"
> > > -  [(set (match_operand:VDQIW 0 "s_register_operand" "=w,w")
> > > -       (ashift:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w,w")
> > > -                     (match_operand:VDQIW 2 "imm_lshift_or_reg_neon" "w,Dm")))]
> > > -  "TARGET_NEON"
> > > -  {
> > > -    switch (which_alternative)
> > > -      {
> > > -        case 0: return "vshl.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2";
> > > -        case 1: return neon_output_shift_immediate ("vshl", 'i', &operands[2],
> > > -                                                   <MODE>mode,
> > > -                                                   VALID_NEON_QREG_MODE (<MODE>mode),
> > > -                                                   true);
> > > -        default: gcc_unreachable ();
> > > -      }
> > > -  }
> > > -  [(set_attr "type" "neon_shift_reg<q>, neon_shift_imm<q>")]
> > > -)
> > > -
> > >  (define_insn "vashr<mode>3_imm"
> > >    [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
> > >         (ashiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w")
> > > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> > > index f6a79e2..3a282f0 100644
> > > --- a/gcc/config/arm/vec-common.md
> > > +++ b/gcc/config/arm/vec-common.md
> > > @@ -229,3 +229,33 @@ (define_expand "movmisalign<mode>"
> > >    if (!neon_vector_mem_operand (adjust_mem, 2, true))
> > >      XEXP (adjust_mem, 0) = force_reg (Pmode, XEXP (adjust_mem, 0));
> > >  })
> > > +
> > > +(define_insn "mve_vshlq_<supf><mode>"
> > > +  [(set (match_operand:VDQIW 0 "s_register_operand" "=w,w")
> > > +       (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w,w")
> > > +                      (match_operand:VDQIW 2 "imm_lshift_or_reg_neon" "w,Dm")]
> > > +        VSHLQ))]
> > > +  "ARM_HAVE_<MODE>_ARITH"
> > > +{
> > > +  switch (which_alternative)
> > > +    {
> > > +      case 0: return "vshl.<supf>%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2";
> > > +      case 1: return neon_output_shift_immediate ("vshl", 'i', &operands[2],
> > > +                                                 <MODE>mode,
> > > +                                                 VALID_NEON_QREG_MODE (<MODE>mode),
> > > +                                                 true);
> > > +      default: gcc_unreachable ();
> > > +    }
> > > +}
> > > +  [(set_attr "type" "neon_shift_reg<q>, neon_shift_imm<q>")]
> > > +)
> > > +
> > > +(define_expand "vashl<mode>3"
> > > +  [(set (match_operand:VDQIW 0 "s_register_operand" "")
> > > +       (ashift:VDQIW (match_operand:VDQIW 1 "s_register_operand" "")
> > > +                     (match_operand:VDQIW 2 "imm_lshift_or_reg_neon" "")))]
> > > +  "ARM_HAVE_<MODE>_ARITH"
> > > +{
> > > +  emit_insn (gen_mve_vshlq_u<mode> (operands[0], operands[1], operands[2]));
> > > +  DONE;
> > > +})
> > > \ No newline at end of file
> > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c b/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
> > > new file mode 100644
> > > index 0000000..7a06449
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
> > > @@ -0,0 +1,62 @@
> > > +/* { dg-do assemble } */
> > > +/* { dg-require-effective-target arm_v8_1m_mve_ok } */
> > > +/* { dg-add-options arm_v8_1m_mve } */
> > > +/* { dg-additional-options "-O3" } */
> > > +
> > > +#include <stdint.h>
> > > +
> > > +#define FUNC(SIGN, TYPE, BITS, NB, OP, NAME)                           \
> > > +  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t * __restrict__ dest, TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \
> > > +    int i;                                                             \
> > > +    for (i=0; i<NB; i++) {                                             \
> > > +      dest[i] = a[i] OP b[i];                                          \
> > > +    }                                                                  \
> > > +}
> > > +
> > > +#define FUNC_IMM(SIGN, TYPE, BITS, NB, OP, NAME)                               \
> > > +  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t * __restrict__ dest, TYPE##BITS##_t *a) { \
> > > +    int i;                                                             \
> > > +    for (i=0; i<NB; i++) {                                             \
> > > +      dest[i] = a[i] OP 5;                                             \
> > > +    }                                                                  \
> > > +}
> > > +
> > > +/* 64-bit vectors.  */
> > > +FUNC(s, int, 32, 2, <<, vshl)
> > > +FUNC(u, uint, 32, 2, <<, vshl)
> > > +FUNC(s, int, 16, 4, <<, vshl)
> > > +FUNC(u, uint, 16, 4, <<, vshl)
> > > +FUNC(s, int, 8, 8, <<, vshl)
> > > +FUNC(u, uint, 8, 8, <<, vshl)
> > > +
> > > +/* 128-bit vectors.  */
> > > +FUNC(s, int, 32, 4, <<, vshl)
> > > +FUNC(u, uint, 32, 4, <<, vshl)
> > > +FUNC(s, int, 16, 8, <<, vshl)  /* FIXME: not vectorized */
> > > +FUNC(u, uint, 16, 8, <<, vshl) /* FIXME: not vectorized */
> > > +FUNC(s, int, 8, 16, <<, vshl)  /* FIXME: not vectorized */
> > > +FUNC(u, uint, 8, 16, <<, vshl) /* FIXME: not vectorized */
> > > +
> > > +/* 64-bit vectors.  */
> > > +FUNC_IMM(s, int, 32, 2, <<, vshlimm)
> > > +FUNC_IMM(u, uint, 32, 2, <<, vshlimm)
> > > +FUNC_IMM(s, int, 16, 4, <<, vshlimm)
> > > +FUNC_IMM(u, uint, 16, 4, <<, vshlimm)
> > > +FUNC_IMM(s, int, 8, 8, <<, vshlimm)
> > > +FUNC_IMM(u, uint, 8, 8, <<, vshlimm)
> > > +
> > > +/* 128-bit vectors.  */
> > > +FUNC_IMM(s, int, 32, 4, <<, vshlimm)
> > > +FUNC_IMM(u, uint, 32, 4, <<, vshlimm)
> > > +FUNC_IMM(s, int, 16, 8, <<, vshlimm)
> > > +FUNC_IMM(u, uint, 16, 8, <<, vshlimm)
> > > +FUNC_IMM(s, int, 8, 16, <<, vshlimm)
> > > +FUNC_IMM(u, uint, 8, 16, <<, vshlimm)
> > > +
> > > +/* MVE has only 128-bit vectors, so we can vectorize only half of the
> > > +   functions above.  */
> > > +/* We only emit vshl.u, which is equivalent to vshl.s anyway.  */
> > > +/* { dg-final { scan-assembler-times {vshl.u[0-9]+\tq[0-9]+, q[0-9]+} 2 } } */
> > > +
> > > +/* We emit vshl.i when the shift amount is an immediate.  */
> > > +/* { dg-final { scan-assembler-times {vshl.i[0-9]+\tq[0-9]+, q[0-9]+} 6 } } */
> > > --
> > > 2.7.4
> > >

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/3] arm: Auto-vectorization for MVE: vshr
  2021-01-07 12:20     ` Christophe Lyon
@ 2021-01-15  9:29       ` Christophe Lyon
  0 siblings, 0 replies; 16+ messages in thread
From: Christophe Lyon @ 2021-01-15  9:29 UTC (permalink / raw)
  To: gcc Patches

ping^3?

On Thu, 7 Jan 2021 at 13:20, Christophe Lyon <christophe.lyon@linaro.org> wrote:
>
> ping^2?
>
> On Wed, 30 Dec 2020 at 11:34, Christophe Lyon
> <christophe.lyon@linaro.org> wrote:
> >
> > ping?
> >
> > On Thu, 17 Dec 2020 at 18:48, Christophe Lyon
> > <christophe.lyon@linaro.org> wrote:
> > >
> > > This patch enables MVE vshr instructions for auto-vectorization.  New
> > > MVE patterns are introduced that take a vector of constants as second
> > > operand, all constants being equal.
> > >
> > > The existing mve_vshrq_n_<supf><mode> is kept, as it takes a single
> > > immediate as second operand, and is used by arm_mve.h.
> > >
> > > The vashr<mode>3 and vlshr<mode>3 expanders are moved from neon.md to
> > > vec-common.md, updated to rely on the normal expansion scheme to
> > > generate shifts by immediate.
> > >
> > > 2020-12-03  Christophe Lyon  <christophe.lyon@linaro.org>
> > >
> > >         gcc/
> > >         * config/arm/mve.md (mve_vshrq_n_s<mode>_imm): New entry.
> > >         (mve_vshrq_n_u<mode>_imm): Likewise.
> > >         * config/arm/neon.md (vashr<mode>3, vlshr<mode>3): Move to ...
> > >         * config/arm/vec-common.md: ... here.
> > >
> > >         gcc/testsuite/
> > >         * gcc.target/arm/simd/mve-vshr.c: Add tests for vshr.
> > > ---
> > >  gcc/config/arm/mve.md                        | 34 ++++++++++++++++
> > >  gcc/config/arm/neon.md                       | 34 ----------------
> > >  gcc/config/arm/vec-common.md                 | 38 +++++++++++++++++-
> > >  gcc/testsuite/gcc.target/arm/simd/mve-vshr.c | 59 ++++++++++++++++++++++++++++
> > >  4 files changed, 130 insertions(+), 35 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vshr.c
> > >
> > > diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> > > index 8bdb451..eea8b20 100644
> > > --- a/gcc/config/arm/mve.md
> > > +++ b/gcc/config/arm/mve.md
> > > @@ -763,6 +763,7 @@ (define_insn "mve_vcreateq_<supf><mode>"
> > >  ;;
> > >  ;; [vshrq_n_s, vshrq_n_u])
> > >  ;;
> > > +;; Version that takes an immediate as operand 2.
> > >  (define_insn "mve_vshrq_n_<supf><mode>"
> > >    [
> > >     (set (match_operand:MVE_2 0 "s_register_operand" "=w")
> > > @@ -775,6 +776,39 @@ (define_insn "mve_vshrq_n_<supf><mode>"
> > >    [(set_attr "type" "mve_move")
> > >  ])
> > >
> > > +;; Versions that take constant vectors as operand 2 (with all elements
> > > +;; equal).
> > > +(define_insn "mve_vshrq_n_s<mode>_imm"
> > > +  [
> > > +   (set (match_operand:MVE_2 0 "s_register_operand" "=w")
> > > +       (ashiftrt:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w")
> > > +                       (match_operand:MVE_2 2 "imm_for_neon_rshift_operand" "i")))
> > > +  ]
> > > +  "TARGET_HAVE_MVE"
> > > +  {
> > > +    return neon_output_shift_immediate ("vshr", 's', &operands[2],
> > > +                                       <MODE>mode,
> > > +                                       VALID_NEON_QREG_MODE (<MODE>mode),
> > > +                                       true);
> > > +  }
> > > +  [(set_attr "type" "mve_move")
> > > +])
> > > +(define_insn "mve_vshrq_n_u<mode>_imm"
> > > +  [
> > > +   (set (match_operand:MVE_2 0 "s_register_operand" "=w")
> > > +       (lshiftrt:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w")
> > > +                       (match_operand:MVE_2 2 "imm_for_neon_rshift_operand" "i")))
> > > +  ]
> > > +  "TARGET_HAVE_MVE"
> > > +  {
> > > +    return neon_output_shift_immediate ("vshr", 'u', &operands[2],
> > > +                                       <MODE>mode,
> > > +                                       VALID_NEON_QREG_MODE (<MODE>mode),
> > > +                                       true);
> > > +  }
> > > +  [(set_attr "type" "mve_move")
> > > +])
> > > +
> > >  ;;
> > >  ;; [vcvtq_n_from_f_s, vcvtq_n_from_f_u])
> > >  ;;
> > > diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> > > index ac9bf74..a0e8d7a 100644
> > > --- a/gcc/config/arm/neon.md
> > > +++ b/gcc/config/arm/neon.md
> > > @@ -899,40 +899,6 @@ (define_insn "ashl<mode>3_unsigned"
> > >    [(set_attr "type" "neon_shift_reg<q>")]
> > >  )
> > >
> > > -(define_expand "vashr<mode>3"
> > > -  [(set (match_operand:VDQIW 0 "s_register_operand")
> > > -       (ashiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand")
> > > -                       (match_operand:VDQIW 2 "imm_rshift_or_reg_neon")))]
> > > -  "TARGET_NEON"
> > > -{
> > > -  if (s_register_operand (operands[2], <MODE>mode))
> > > -    {
> > > -      rtx neg = gen_reg_rtx (<MODE>mode);
> > > -      emit_insn (gen_neon_neg<mode>2 (neg, operands[2]));
> > > -      emit_insn (gen_ashl<mode>3_signed (operands[0], operands[1], neg));
> > > -    }
> > > -  else
> > > -    emit_insn (gen_vashr<mode>3_imm (operands[0], operands[1], operands[2]));
> > > -  DONE;
> > > -})
> > > -
> > > -(define_expand "vlshr<mode>3"
> > > -  [(set (match_operand:VDQIW 0 "s_register_operand")
> > > -       (lshiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand")
> > > -                       (match_operand:VDQIW 2 "imm_rshift_or_reg_neon")))]
> > > -  "TARGET_NEON"
> > > -{
> > > -  if (s_register_operand (operands[2], <MODE>mode))
> > > -    {
> > > -      rtx neg = gen_reg_rtx (<MODE>mode);
> > > -      emit_insn (gen_neon_neg<mode>2 (neg, operands[2]));
> > > -      emit_insn (gen_ashl<mode>3_unsigned (operands[0], operands[1], neg));
> > > -    }
> > > -  else
> > > -    emit_insn (gen_vlshr<mode>3_imm (operands[0], operands[1], operands[2]));
> > > -  DONE;
> > > -})
> > > -
> > >  ;; 64-bit shifts
> > >
> > >  ;; This pattern loads a 32-bit shift count into a 64-bit NEON register,
> > > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> > > index 3a282f0..e126557 100644
> > > --- a/gcc/config/arm/vec-common.md
> > > +++ b/gcc/config/arm/vec-common.md
> > > @@ -258,4 +258,40 @@ (define_expand "vashl<mode>3"
> > >  {
> > >    emit_insn (gen_mve_vshlq_u<mode> (operands[0], operands[1], operands[2]));
> > >    DONE;
> > > -})
> > > \ No newline at end of file
> > > +})
> > > +
> > > +;; When operand 2 is an immediate, use the normal expansion to match
> > > +;; gen_vashr<mode>3_imm for Neon and gen_mve_vshrq_n_s<mode>_imm for
> > > +;; MVE.
> > > +(define_expand "vashr<mode>3"
> > > +  [(set (match_operand:VDQIW 0 "s_register_operand")
> > > +       (ashiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand")
> > > +                       (match_operand:VDQIW 2 "imm_rshift_or_reg_neon")))]
> > > +  "ARM_HAVE_<MODE>_ARITH"
> > > +{
> > > +  if (s_register_operand (operands[2], <MODE>mode))
> > > +    {
> > > +      rtx neg = gen_reg_rtx (<MODE>mode);
> > > +      emit_insn (gen_neg<mode>2 (neg, operands[2]));
> > > +      emit_insn (gen_mve_vshlq_s<mode> (operands[0], operands[1], neg));
> > > +      DONE;
> > > +    }
> > > +})
> > > +
> > > +;; When operand 2 is an immediate, use the normal expansion to match
> > > +;; gen_vashr<mode>3_imm for Neon and gen_mve_vshrq_n_u<mode>_imm for
> > > +;; MVE.
> > > +(define_expand "vlshr<mode>3"
> > > +  [(set (match_operand:VDQIW 0 "s_register_operand")
> > > +       (lshiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand")
> > > +                       (match_operand:VDQIW 2 "imm_rshift_or_reg_neon")))]
> > > +  "ARM_HAVE_<MODE>_ARITH"
> > > +{
> > > +  if (s_register_operand (operands[2], <MODE>mode))
> > > +    {
> > > +      rtx neg = gen_reg_rtx (<MODE>mode);
> > > +      emit_insn (gen_neg<mode>2 (neg, operands[2]));
> > > +      emit_insn (gen_mve_vshlq_u<mode> (operands[0], operands[1], neg));
> > > +      DONE;
> > > +    }
> > > +})
> > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vshr.c b/gcc/testsuite/gcc.target/arm/simd/mve-vshr.c
> > > new file mode 100644
> > > index 0000000..d4e658c
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vshr.c
> > > @@ -0,0 +1,59 @@
> > > +/* { dg-do assemble } */
> > > +/* { dg-require-effective-target arm_v8_1m_mve_ok } */
> > > +/* { dg-add-options arm_v8_1m_mve } */
> > > +/* { dg-additional-options "-O3" } */
> > > +
> > > +#include <stdint.h>
> > > +
> > > +#define FUNC(SIGN, TYPE, BITS, NB, OP, NAME)                           \
> > > +  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t * __restrict__ dest, TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \
> > > +    int i;                                                             \
> > > +    for (i=0; i<NB; i++) {                                             \
> > > +      dest[i] = a[i] OP b[i];                                          \
> > > +    }                                                                  \
> > > +}
> > > +
> > > +#define FUNC_IMM(SIGN, TYPE, BITS, NB, OP, NAME)                               \
> > > +  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t * __restrict__ dest, TYPE##BITS##_t *a) { \
> > > +    int i;                                                             \
> > > +    for (i=0; i<NB; i++) {                                             \
> > > +      dest[i] = a[i] OP 5;                                             \
> > > +    }                                                                  \
> > > +}
> > > +
> > > +/* 64-bit vectors.  */
> > > +FUNC(s, int, 32, 2, >>, vshr)
> > > +FUNC(u, uint, 32, 2, >>, vshr)
> > > +FUNC(s, int, 16, 4, >>, vshr)
> > > +FUNC(u, uint, 16, 4, >>, vshr)
> > > +FUNC(s, int, 8, 8, >>, vshr)
> > > +FUNC(u, uint, 8, 8, >>, vshr)
> > > +
> > > +/* 128-bit vectors.  */
> > > +FUNC(s, int, 32, 4, >>, vshr)
> > > +FUNC(u, uint, 32, 4, >>, vshr)
> > > +FUNC(s, int, 16, 8, >>, vshr)
> > > +FUNC(u, uint, 16, 8, >>, vshr)
> > > +FUNC(s, int, 8, 16, >>, vshr)
> > > +FUNC(u, uint, 8, 16, >>, vshr)
> > > +
> > > +/* 64-bit vectors.  */
> > > +FUNC_IMM(s, int, 32, 2, >>, vshrimm)
> > > +FUNC_IMM(u, uint, 32, 2, >>, vshrimm)
> > > +FUNC_IMM(s, int, 16, 4, >>, vshrimm)
> > > +FUNC_IMM(u, uint, 16, 4, >>, vshrimm)
> > > +FUNC_IMM(s, int, 8, 8, >>, vshrimm)
> > > +FUNC_IMM(u, uint, 8, 8, >>, vshrimm)
> > > +
> > > +/* 128-bit vectors.  */
> > > +FUNC_IMM(s, int, 32, 4, >>, vshrimm)
> > > +FUNC_IMM(u, uint, 32, 4, >>, vshrimm)
> > > +FUNC_IMM(s, int, 16, 8, >>, vshrimm)
> > > +FUNC_IMM(u, uint, 16, 8, >>, vshrimm)
> > > +FUNC_IMM(s, int, 8, 16, >>, vshrimm)
> > > +FUNC_IMM(u, uint, 8, 16, >>, vshrimm)
> > > +
> > > +/* MVE has only 128-bit vectors, so we can vectorize only half of the
> > > +   functions above.  */
> > > +/* { dg-final { scan-assembler-times {vshr.s[0-9]+\tq[0-9]+, q[0-9]+} 3 } } */
> > > +/* { dg-final { scan-assembler-times {vshr.u[0-9]+\tq[0-9]+, q[0-9]+} 3 } } */
> > > --
> > > 2.7.4
> > >

^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH 2/3] arm: Auto-vectorization for MVE: vshl
  2020-12-17 17:48 ` [PATCH 2/3] arm: Auto-vectorization for MVE: vshl Christophe Lyon
  2020-12-30 10:34   ` Christophe Lyon
@ 2021-01-15  9:42   ` Kyrylo Tkachov
  2021-01-15 10:45     ` Christophe Lyon
  1 sibling, 1 reply; 16+ messages in thread
From: Kyrylo Tkachov @ 2021-01-15  9:42 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: gcc Patches



> -----Original Message-----
> From: Gcc-patches <gcc-patches-bounces@gcc.gnu.org> On Behalf Of
> Christophe Lyon via Gcc-patches
> Sent: 17 December 2020 17:48
> To: gcc-patches@gcc.gnu.org
> Subject: [PATCH 2/3] arm: Auto-vectorization for MVE: vshl
> 
> This patch enables MVE vshlq instructions for auto-vectorization.
> 
> The existing mve_vshlq_n_<supf><mode> is kept, as it takes a single
> immediate as second operand, and is used by arm_mve.h.
> 
> We move the vashl<mode>3 insn from neon.md to an expander in
> vec-common.md, and the mve_vshlq_<supf><mode> insn from mve.md to
> vec-common.md, adding the second alternative from neon.md.
> 
> mve_vshlq_<supf><mode> will be used by a later patch enabling
> vectorization for vshr, as a unified version of
> ashl<mode>3_[signed|unsigned] from neon.md. Keeping the use of unspec
> VSHLQ enables to generate both 's' and 'u' variants.
> 
> It is not clear whether the neon_shift_[reg|imm]<q> attribute is still
> suitable, since this insn is also used for MVE.
> 
> I kept the mve_vshlq_<supf><mode> naming instead of renaming it to
> ashl3_<supf>_<mode> as discussed because the reference in
> arm_mve_builtins.def automatically inserts the "mve_" prefix and I
> didn't want to make a special case for this.
> 
> I haven't yet found why the v16qi and v8hi tests are not vectorized.
> With dest[i] = a[i] << b[i] and:
>   {
>     int i;
>     unsigned int i.24_1;
>     unsigned int _2;
>     int16_t * _3;
>     short int _4;
>     int _5;
>     int16_t * _6;
>     short int _7;
>     int _8;
>     int _9;
>     int16_t * _10;
>     short int _11;
>     unsigned int ivtmp_42;
>     unsigned int ivtmp_43;
> 
>     <bb 2> [local count: 119292720]:
> 
>     <bb 3> [local count: 954449105]:
>     i.24_1 = (unsigned int) i_23;
>     _2 = i.24_1 * 2;
>     _3 = a_15(D) + _2;
>     _4 = *_3;
>     _5 = (int) _4;
>     _6 = b_16(D) + _2;
>     _7 = *_6;
>     _8 = (int) _7;
>     _9 = _5 << _8;
>     _10 = dest_17(D) + _2;
>     _11 = (short int) _9;
>     *_10 = _11;
>     i_19 = i_23 + 1;
>     ivtmp_42 = ivtmp_43 - 1;
>     if (ivtmp_42 != 0)
>       goto <bb 5>; [87.50%]
>     else
>       goto <bb 4>; [12.50%]
> 
>     <bb 5> [local count: 835156386]:
>     goto <bb 3>; [100.00%]
> 
>     <bb 4> [local count: 119292720]:
>     return;
> 
>   }
> the vectorizer says:
> mve-vshl.c:37:96: note:   ==> examining statement: _5 = (int) _4;
> mve-vshl.c:37:96: note:   vect_is_simple_use: operand *_3, type of def:
> internal
> mve-vshl.c:37:96: note:   vect_is_simple_use: vectype vector(8) short int
> mve-vshl.c:37:96: missed:   conversion not supported by target.
> mve-vshl.c:37:96: note:   vect_is_simple_use: operand *_3, type of def:
> internal
> mve-vshl.c:37:96: note:   vect_is_simple_use: vectype vector(8) short int
> mve-vshl.c:37:96: note:   vect_is_simple_use: operand *_3, type of def:
> internal
> mve-vshl.c:37:96: note:   vect_is_simple_use: vectype vector(8) short int
> mve-vshl.c:37:117: missed:   not vectorized: relevant stmt not supported: _5
> = (int) _4;
> mve-vshl.c:37:96: missed:  bad operation or unsupported loop bound.
> mve-vshl.c:37:96: note:  ***** Analysis failed with vector mode V8HI
> 

Can you file a bug report once this is committed so we can revisit in the future please.

> 2020-12-03  Christophe Lyon  <christophe.lyon@linaro.org>
> 
> 	gcc/
> 	* config/arm/mve.md (mve_vshlq_<supf><mode>): Move to
> 	vec-commond.md.
> 	* config/arm/neon.md (vashl<mode>3): Delete.
> 	* config/arm/vec-common.md (mve_vshlq_<supf><mode>): New.
> 	(vashl<mode>3): New expander.
> 
> 	gcc/testsuite/
> 	* gcc.target/arm/simd/mve-vshl.c: Add tests for vshl.
> ---
>  gcc/config/arm/mve.md                        | 13 +-----
>  gcc/config/arm/neon.md                       | 19 ---------
>  gcc/config/arm/vec-common.md                 | 30 ++++++++++++++
>  gcc/testsuite/gcc.target/arm/simd/mve-vshl.c | 62
> ++++++++++++++++++++++++++++
>  4 files changed, 93 insertions(+), 31 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
> 
> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> index 673a83c..8bdb451 100644
> --- a/gcc/config/arm/mve.md
> +++ b/gcc/config/arm/mve.md
> @@ -822,18 +822,7 @@ (define_insn "mve_vcmpneq_<supf><mode>"
> 
>  ;;
>  ;; [vshlq_s, vshlq_u])
> -;;
> -(define_insn "mve_vshlq_<supf><mode>"
> -  [
> -   (set (match_operand:MVE_2 0 "s_register_operand" "=w")
> -	(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")
> -		       (match_operand:MVE_2 2 "s_register_operand" "w")]
> -	 VSHLQ))
> -  ]
> -  "TARGET_HAVE_MVE"
> -  "vshl.<supf>%#<V_sz_elem>\t%q0, %q1, %q2"
> -  [(set_attr "type" "mve_move")
> -])
> +;; See vec-common.md
> 
>  ;;
>  ;; [vabdq_s, vabdq_u])
> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index 50220be..ac9bf74 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -845,25 +845,6 @@ (define_insn "*smax<mode>3_neon"
>  ; generic vectorizer code.  It ends up creating a V2DI constructor with
>  ; SImode elements.
> 
> -(define_insn "vashl<mode>3"
> -  [(set (match_operand:VDQIW 0 "s_register_operand" "=w,w")
> -	(ashift:VDQIW (match_operand:VDQIW 1 "s_register_operand"
> "w,w")
> -		      (match_operand:VDQIW 2 "imm_lshift_or_reg_neon"
> "w,Dm")))]
> -  "TARGET_NEON"
> -  {
> -    switch (which_alternative)
> -      {
> -        case 0: return "vshl.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2";
> -        case 1: return neon_output_shift_immediate ("vshl", 'i', &operands[2],
> -                         			    <MODE>mode,
> -
> VALID_NEON_QREG_MODE (<MODE>mode),
> -						    true);
> -        default: gcc_unreachable ();
> -      }
> -  }
> -  [(set_attr "type" "neon_shift_reg<q>, neon_shift_imm<q>")]
> -)
> -
>  (define_insn "vashr<mode>3_imm"
>    [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
>  	(ashiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w")
> diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-
> common.md
> index f6a79e2..3a282f0 100644
> --- a/gcc/config/arm/vec-common.md
> +++ b/gcc/config/arm/vec-common.md
> @@ -229,3 +229,33 @@ (define_expand "movmisalign<mode>"
>    if (!neon_vector_mem_operand (adjust_mem, 2, true))
>      XEXP (adjust_mem, 0) = force_reg (Pmode, XEXP (adjust_mem, 0));
>  })
> +
> +(define_insn "mve_vshlq_<supf><mode>"
> +  [(set (match_operand:VDQIW 0 "s_register_operand" "=w,w")
> +	(unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand"
> "w,w")
> +		       (match_operand:VDQIW 2 "imm_lshift_or_reg_neon"
> "w,Dm")]
> +	 VSHLQ))]
> +  "ARM_HAVE_<MODE>_ARITH"
> +{
> +  switch (which_alternative)
> +    {
> +      case 0: return
> "vshl.<supf>%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2";
> +      case 1: return neon_output_shift_immediate ("vshl", 'i', &operands[2],
> +						  <MODE>mode,
> +						  VALID_NEON_QREG_MODE
> (<MODE>mode),
> +						  true);
> +      default: gcc_unreachable ();

I know this is copied code, but let's clean it up by removing the switch and using the "*" syntax for the C code in alternative 1.
Ok with those changes.
Thanks,
Kyrill

> +    }
> +}
> +  [(set_attr "type" "neon_shift_reg<q>, neon_shift_imm<q>")]
> +)
> +
> +(define_expand "vashl<mode>3"
> +  [(set (match_operand:VDQIW 0 "s_register_operand" "")
> +	(ashift:VDQIW (match_operand:VDQIW 1 "s_register_operand" "")
> +		      (match_operand:VDQIW 2 "imm_lshift_or_reg_neon"
> "")))]
> +  "ARM_HAVE_<MODE>_ARITH"
> +{
> +  emit_insn (gen_mve_vshlq_u<mode> (operands[0], operands[1],
> operands[2]));
> +  DONE;
> +})
> \ No newline at end of file
> diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
> b/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
> new file mode 100644
> index 0000000..7a06449
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
> @@ -0,0 +1,62 @@
> +/* { dg-do assemble } */
> +/* { dg-require-effective-target arm_v8_1m_mve_ok } */
> +/* { dg-add-options arm_v8_1m_mve } */
> +/* { dg-additional-options "-O3" } */
> +
> +#include <stdint.h>
> +
> +#define FUNC(SIGN, TYPE, BITS, NB, OP, NAME)
> 	\
> +  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t *
> __restrict__ dest, TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \
> +    int i;								\
> +    for (i=0; i<NB; i++) {						\
> +      dest[i] = a[i] OP b[i];						\
> +    }									\
> +}
> +
> +#define FUNC_IMM(SIGN, TYPE, BITS, NB, OP, NAME)
> 	\
> +  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t *
> __restrict__ dest, TYPE##BITS##_t *a) { \
> +    int i;								\
> +    for (i=0; i<NB; i++) {						\
> +      dest[i] = a[i] OP 5;						\
> +    }									\
> +}
> +
> +/* 64-bit vectors.  */
> +FUNC(s, int, 32, 2, <<, vshl)
> +FUNC(u, uint, 32, 2, <<, vshl)
> +FUNC(s, int, 16, 4, <<, vshl)
> +FUNC(u, uint, 16, 4, <<, vshl)
> +FUNC(s, int, 8, 8, <<, vshl)
> +FUNC(u, uint, 8, 8, <<, vshl)
> +
> +/* 128-bit vectors.  */
> +FUNC(s, int, 32, 4, <<, vshl)
> +FUNC(u, uint, 32, 4, <<, vshl)
> +FUNC(s, int, 16, 8, <<, vshl)  /* FIXME: not vectorized */
> +FUNC(u, uint, 16, 8, <<, vshl) /* FIXME: not vectorized */
> +FUNC(s, int, 8, 16, <<, vshl)  /* FIXME: not vectorized */
> +FUNC(u, uint, 8, 16, <<, vshl) /* FIXME: not vectorized */
> +
> +/* 64-bit vectors.  */
> +FUNC_IMM(s, int, 32, 2, <<, vshlimm)
> +FUNC_IMM(u, uint, 32, 2, <<, vshlimm)
> +FUNC_IMM(s, int, 16, 4, <<, vshlimm)
> +FUNC_IMM(u, uint, 16, 4, <<, vshlimm)
> +FUNC_IMM(s, int, 8, 8, <<, vshlimm)
> +FUNC_IMM(u, uint, 8, 8, <<, vshlimm)
> +
> +/* 128-bit vectors.  */
> +FUNC_IMM(s, int, 32, 4, <<, vshlimm)
> +FUNC_IMM(u, uint, 32, 4, <<, vshlimm)
> +FUNC_IMM(s, int, 16, 8, <<, vshlimm)
> +FUNC_IMM(u, uint, 16, 8, <<, vshlimm)
> +FUNC_IMM(s, int, 8, 16, <<, vshlimm)
> +FUNC_IMM(u, uint, 8, 16, <<, vshlimm)
> +
> +/* MVE has only 128-bit vectors, so we can vectorize only half of the
> +   functions above.  */
> +/* We only emit vshl.u, which is equivalent to vshl.s anyway.  */
> +/* { dg-final { scan-assembler-times {vshl.u[0-9]+\tq[0-9]+, q[0-9]+} 2 } } */
> +
> +/* We emit vshl.i when the shift amount is an immediate.  */
> +/* { dg-final { scan-assembler-times {vshl.i[0-9]+\tq[0-9]+, q[0-9]+} 6 } } */
> --
> 2.7.4


^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH 3/3] arm: Auto-vectorization for MVE: vshr
  2020-12-17 17:48 ` [PATCH 3/3] arm: Auto-vectorization for MVE: vshr Christophe Lyon
  2020-12-30 10:34   ` Christophe Lyon
@ 2021-01-15  9:44   ` Kyrylo Tkachov
  1 sibling, 0 replies; 16+ messages in thread
From: Kyrylo Tkachov @ 2021-01-15  9:44 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: gcc Patches



> -----Original Message-----
> From: Gcc-patches <gcc-patches-bounces@gcc.gnu.org> On Behalf Of
> Christophe Lyon via Gcc-patches
> Sent: 17 December 2020 17:48
> To: gcc-patches@gcc.gnu.org
> Subject: [PATCH 3/3] arm: Auto-vectorization for MVE: vshr
> 
> This patch enables MVE vshr instructions for auto-vectorization.  New
> MVE patterns are introduced that take a vector of constants as second
> operand, all constants being equal.
> 
> The existing mve_vshrq_n_<supf><mode> is kept, as it takes a single
> immediate as second operand, and is used by arm_mve.h.
> 
> The vashr<mode>3 and vlshr<mode>3 expanders are moved from neon.md to
> vec-common.md, updated to rely on the normal expansion scheme to
> generate shifts by immediate.

Ok.
Thanks,
Kyrill

> 
> 2020-12-03  Christophe Lyon  <christophe.lyon@linaro.org>
> 
> 	gcc/
> 	* config/arm/mve.md (mve_vshrq_n_s<mode>_imm): New entry.
> 	(mve_vshrq_n_u<mode>_imm): Likewise.
> 	* config/arm/neon.md (vashr<mode>3, vlshr<mode>3): Move to ...
> 	* config/arm/vec-common.md: ... here.
> 
> 	gcc/testsuite/
> 	* gcc.target/arm/simd/mve-vshr.c: Add tests for vshr.
> ---
>  gcc/config/arm/mve.md                        | 34 ++++++++++++++++
>  gcc/config/arm/neon.md                       | 34 ----------------
>  gcc/config/arm/vec-common.md                 | 38 +++++++++++++++++-
>  gcc/testsuite/gcc.target/arm/simd/mve-vshr.c | 59
> ++++++++++++++++++++++++++++
>  4 files changed, 130 insertions(+), 35 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vshr.c
> 
> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> index 8bdb451..eea8b20 100644
> --- a/gcc/config/arm/mve.md
> +++ b/gcc/config/arm/mve.md
> @@ -763,6 +763,7 @@ (define_insn "mve_vcreateq_<supf><mode>"
>  ;;
>  ;; [vshrq_n_s, vshrq_n_u])
>  ;;
> +;; Version that takes an immediate as operand 2.
>  (define_insn "mve_vshrq_n_<supf><mode>"
>    [
>     (set (match_operand:MVE_2 0 "s_register_operand" "=w")
> @@ -775,6 +776,39 @@ (define_insn "mve_vshrq_n_<supf><mode>"
>    [(set_attr "type" "mve_move")
>  ])
> 
> +;; Versions that take constant vectors as operand 2 (with all elements
> +;; equal).
> +(define_insn "mve_vshrq_n_s<mode>_imm"
> +  [
> +   (set (match_operand:MVE_2 0 "s_register_operand" "=w")
> +	(ashiftrt:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w")
> +			(match_operand:MVE_2 2
> "imm_for_neon_rshift_operand" "i")))
> +  ]
> +  "TARGET_HAVE_MVE"
> +  {
> +    return neon_output_shift_immediate ("vshr", 's', &operands[2],
> +					<MODE>mode,
> +					VALID_NEON_QREG_MODE
> (<MODE>mode),
> +					true);
> +  }
> +  [(set_attr "type" "mve_move")
> +])
> +(define_insn "mve_vshrq_n_u<mode>_imm"
> +  [
> +   (set (match_operand:MVE_2 0 "s_register_operand" "=w")
> +	(lshiftrt:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w")
> +			(match_operand:MVE_2 2
> "imm_for_neon_rshift_operand" "i")))
> +  ]
> +  "TARGET_HAVE_MVE"
> +  {
> +    return neon_output_shift_immediate ("vshr", 'u', &operands[2],
> +					<MODE>mode,
> +					VALID_NEON_QREG_MODE
> (<MODE>mode),
> +					true);
> +  }
> +  [(set_attr "type" "mve_move")
> +])
> +
>  ;;
>  ;; [vcvtq_n_from_f_s, vcvtq_n_from_f_u])
>  ;;
> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index ac9bf74..a0e8d7a 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -899,40 +899,6 @@ (define_insn "ashl<mode>3_unsigned"
>    [(set_attr "type" "neon_shift_reg<q>")]
>  )
> 
> -(define_expand "vashr<mode>3"
> -  [(set (match_operand:VDQIW 0 "s_register_operand")
> -	(ashiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand")
> -			(match_operand:VDQIW 2
> "imm_rshift_or_reg_neon")))]
> -  "TARGET_NEON"
> -{
> -  if (s_register_operand (operands[2], <MODE>mode))
> -    {
> -      rtx neg = gen_reg_rtx (<MODE>mode);
> -      emit_insn (gen_neon_neg<mode>2 (neg, operands[2]));
> -      emit_insn (gen_ashl<mode>3_signed (operands[0], operands[1], neg));
> -    }
> -  else
> -    emit_insn (gen_vashr<mode>3_imm (operands[0], operands[1],
> operands[2]));
> -  DONE;
> -})
> -
> -(define_expand "vlshr<mode>3"
> -  [(set (match_operand:VDQIW 0 "s_register_operand")
> -	(lshiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand")
> -			(match_operand:VDQIW 2
> "imm_rshift_or_reg_neon")))]
> -  "TARGET_NEON"
> -{
> -  if (s_register_operand (operands[2], <MODE>mode))
> -    {
> -      rtx neg = gen_reg_rtx (<MODE>mode);
> -      emit_insn (gen_neon_neg<mode>2 (neg, operands[2]));
> -      emit_insn (gen_ashl<mode>3_unsigned (operands[0], operands[1], neg));
> -    }
> -  else
> -    emit_insn (gen_vlshr<mode>3_imm (operands[0], operands[1],
> operands[2]));
> -  DONE;
> -})
> -
>  ;; 64-bit shifts
> 
>  ;; This pattern loads a 32-bit shift count into a 64-bit NEON register,
> diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-
> common.md
> index 3a282f0..e126557 100644
> --- a/gcc/config/arm/vec-common.md
> +++ b/gcc/config/arm/vec-common.md
> @@ -258,4 +258,40 @@ (define_expand "vashl<mode>3"
>  {
>    emit_insn (gen_mve_vshlq_u<mode> (operands[0], operands[1],
> operands[2]));
>    DONE;
> -})
> \ No newline at end of file
> +})
> +
> +;; When operand 2 is an immediate, use the normal expansion to match
> +;; gen_vashr<mode>3_imm for Neon and gen_mve_vshrq_n_s<mode>_imm
> for
> +;; MVE.
> +(define_expand "vashr<mode>3"
> +  [(set (match_operand:VDQIW 0 "s_register_operand")
> +	(ashiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand")
> +			(match_operand:VDQIW 2
> "imm_rshift_or_reg_neon")))]
> +  "ARM_HAVE_<MODE>_ARITH"
> +{
> +  if (s_register_operand (operands[2], <MODE>mode))
> +    {
> +      rtx neg = gen_reg_rtx (<MODE>mode);
> +      emit_insn (gen_neg<mode>2 (neg, operands[2]));
> +      emit_insn (gen_mve_vshlq_s<mode> (operands[0], operands[1], neg));
> +      DONE;
> +    }
> +})
> +
> +;; When operand 2 is an immediate, use the normal expansion to match
> +;; gen_vashr<mode>3_imm for Neon and
> gen_mve_vshrq_n_u<mode>_imm for
> +;; MVE.
> +(define_expand "vlshr<mode>3"
> +  [(set (match_operand:VDQIW 0 "s_register_operand")
> +	(lshiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand")
> +			(match_operand:VDQIW 2
> "imm_rshift_or_reg_neon")))]
> +  "ARM_HAVE_<MODE>_ARITH"
> +{
> +  if (s_register_operand (operands[2], <MODE>mode))
> +    {
> +      rtx neg = gen_reg_rtx (<MODE>mode);
> +      emit_insn (gen_neg<mode>2 (neg, operands[2]));
> +      emit_insn (gen_mve_vshlq_u<mode> (operands[0], operands[1], neg));
> +      DONE;
> +    }
> +})
> diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vshr.c
> b/gcc/testsuite/gcc.target/arm/simd/mve-vshr.c
> new file mode 100644
> index 0000000..d4e658c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vshr.c
> @@ -0,0 +1,59 @@
> +/* { dg-do assemble } */
> +/* { dg-require-effective-target arm_v8_1m_mve_ok } */
> +/* { dg-add-options arm_v8_1m_mve } */
> +/* { dg-additional-options "-O3" } */
> +
> +#include <stdint.h>
> +
> +#define FUNC(SIGN, TYPE, BITS, NB, OP, NAME)
> 	\
> +  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t *
> __restrict__ dest, TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \
> +    int i;								\
> +    for (i=0; i<NB; i++) {						\
> +      dest[i] = a[i] OP b[i];						\
> +    }									\
> +}
> +
> +#define FUNC_IMM(SIGN, TYPE, BITS, NB, OP, NAME)
> 	\
> +  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t *
> __restrict__ dest, TYPE##BITS##_t *a) { \
> +    int i;								\
> +    for (i=0; i<NB; i++) {						\
> +      dest[i] = a[i] OP 5;						\
> +    }									\
> +}
> +
> +/* 64-bit vectors.  */
> +FUNC(s, int, 32, 2, >>, vshr)
> +FUNC(u, uint, 32, 2, >>, vshr)
> +FUNC(s, int, 16, 4, >>, vshr)
> +FUNC(u, uint, 16, 4, >>, vshr)
> +FUNC(s, int, 8, 8, >>, vshr)
> +FUNC(u, uint, 8, 8, >>, vshr)
> +
> +/* 128-bit vectors.  */
> +FUNC(s, int, 32, 4, >>, vshr)
> +FUNC(u, uint, 32, 4, >>, vshr)
> +FUNC(s, int, 16, 8, >>, vshr)
> +FUNC(u, uint, 16, 8, >>, vshr)
> +FUNC(s, int, 8, 16, >>, vshr)
> +FUNC(u, uint, 8, 16, >>, vshr)
> +
> +/* 64-bit vectors.  */
> +FUNC_IMM(s, int, 32, 2, >>, vshrimm)
> +FUNC_IMM(u, uint, 32, 2, >>, vshrimm)
> +FUNC_IMM(s, int, 16, 4, >>, vshrimm)
> +FUNC_IMM(u, uint, 16, 4, >>, vshrimm)
> +FUNC_IMM(s, int, 8, 8, >>, vshrimm)
> +FUNC_IMM(u, uint, 8, 8, >>, vshrimm)
> +
> +/* 128-bit vectors.  */
> +FUNC_IMM(s, int, 32, 4, >>, vshrimm)
> +FUNC_IMM(u, uint, 32, 4, >>, vshrimm)
> +FUNC_IMM(s, int, 16, 8, >>, vshrimm)
> +FUNC_IMM(u, uint, 16, 8, >>, vshrimm)
> +FUNC_IMM(s, int, 8, 16, >>, vshrimm)
> +FUNC_IMM(u, uint, 8, 16, >>, vshrimm)
> +
> +/* MVE has only 128-bit vectors, so we can vectorize only half of the
> +   functions above.  */
> +/* { dg-final { scan-assembler-times {vshr.s[0-9]+\tq[0-9]+, q[0-9]+} 3 } } */
> +/* { dg-final { scan-assembler-times {vshr.u[0-9]+\tq[0-9]+, q[0-9]+} 3 } } */
> --
> 2.7.4


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/3] arm: Auto-vectorization for MVE: vshl
  2021-01-15  9:42   ` Kyrylo Tkachov
@ 2021-01-15 10:45     ` Christophe Lyon
  0 siblings, 0 replies; 16+ messages in thread
From: Christophe Lyon @ 2021-01-15 10:45 UTC (permalink / raw)
  To: Kyrylo Tkachov; +Cc: gcc Patches

On Fri, 15 Jan 2021 at 10:42, Kyrylo Tkachov <Kyrylo.Tkachov@arm.com> wrote:
>
>
>
> > -----Original Message-----
> > From: Gcc-patches <gcc-patches-bounces@gcc.gnu.org> On Behalf Of
> > Christophe Lyon via Gcc-patches
> > Sent: 17 December 2020 17:48
> > To: gcc-patches@gcc.gnu.org
> > Subject: [PATCH 2/3] arm: Auto-vectorization for MVE: vshl
> >
> > This patch enables MVE vshlq instructions for auto-vectorization.
> >
> > The existing mve_vshlq_n_<supf><mode> is kept, as it takes a single
> > immediate as second operand, and is used by arm_mve.h.
> >
> > We move the vashl<mode>3 insn from neon.md to an expander in
> > vec-common.md, and the mve_vshlq_<supf><mode> insn from mve.md to
> > vec-common.md, adding the second alternative from neon.md.
> >
> > mve_vshlq_<supf><mode> will be used by a later patch enabling
> > vectorization for vshr, as a unified version of
> > ashl<mode>3_[signed|unsigned] from neon.md. Keeping the use of unspec
> > VSHLQ enables to generate both 's' and 'u' variants.
> >
> > It is not clear whether the neon_shift_[reg|imm]<q> attribute is still
> > suitable, since this insn is also used for MVE.
> >
> > I kept the mve_vshlq_<supf><mode> naming instead of renaming it to
> > ashl3_<supf>_<mode> as discussed because the reference in
> > arm_mve_builtins.def automatically inserts the "mve_" prefix and I
> > didn't want to make a special case for this.
> >
> > I haven't yet found why the v16qi and v8hi tests are not vectorized.
> > With dest[i] = a[i] << b[i] and:
> >   {
> >     int i;
> >     unsigned int i.24_1;
> >     unsigned int _2;
> >     int16_t * _3;
> >     short int _4;
> >     int _5;
> >     int16_t * _6;
> >     short int _7;
> >     int _8;
> >     int _9;
> >     int16_t * _10;
> >     short int _11;
> >     unsigned int ivtmp_42;
> >     unsigned int ivtmp_43;
> >
> >     <bb 2> [local count: 119292720]:
> >
> >     <bb 3> [local count: 954449105]:
> >     i.24_1 = (unsigned int) i_23;
> >     _2 = i.24_1 * 2;
> >     _3 = a_15(D) + _2;
> >     _4 = *_3;
> >     _5 = (int) _4;
> >     _6 = b_16(D) + _2;
> >     _7 = *_6;
> >     _8 = (int) _7;
> >     _9 = _5 << _8;
> >     _10 = dest_17(D) + _2;
> >     _11 = (short int) _9;
> >     *_10 = _11;
> >     i_19 = i_23 + 1;
> >     ivtmp_42 = ivtmp_43 - 1;
> >     if (ivtmp_42 != 0)
> >       goto <bb 5>; [87.50%]
> >     else
> >       goto <bb 4>; [12.50%]
> >
> >     <bb 5> [local count: 835156386]:
> >     goto <bb 3>; [100.00%]
> >
> >     <bb 4> [local count: 119292720]:
> >     return;
> >
> >   }
> > the vectorizer says:
> > mve-vshl.c:37:96: note:   ==> examining statement: _5 = (int) _4;
> > mve-vshl.c:37:96: note:   vect_is_simple_use: operand *_3, type of def:
> > internal
> > mve-vshl.c:37:96: note:   vect_is_simple_use: vectype vector(8) short int
> > mve-vshl.c:37:96: missed:   conversion not supported by target.
> > mve-vshl.c:37:96: note:   vect_is_simple_use: operand *_3, type of def:
> > internal
> > mve-vshl.c:37:96: note:   vect_is_simple_use: vectype vector(8) short int
> > mve-vshl.c:37:96: note:   vect_is_simple_use: operand *_3, type of def:
> > internal
> > mve-vshl.c:37:96: note:   vect_is_simple_use: vectype vector(8) short int
> > mve-vshl.c:37:117: missed:   not vectorized: relevant stmt not supported: _5
> > = (int) _4;
> > mve-vshl.c:37:96: missed:  bad operation or unsupported loop bound.
> > mve-vshl.c:37:96: note:  ***** Analysis failed with vector mode V8HI
> >
>
> Can you file a bug report once this is committed so we can revisit in the future please.

OK, I filed: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98697

>
> > 2020-12-03  Christophe Lyon  <christophe.lyon@linaro.org>
> >
> >       gcc/
> >       * config/arm/mve.md (mve_vshlq_<supf><mode>): Move to
> >       vec-common.md.
> >       * config/arm/neon.md (vashl<mode>3): Delete.
> >       * config/arm/vec-common.md (mve_vshlq_<supf><mode>): New.
> >       (vashl<mode>3): New expander.
> >
> >       gcc/testsuite/
> >       * gcc.target/arm/simd/mve-vshl.c: Add tests for vshl.
> > ---
> >  gcc/config/arm/mve.md                        | 13 +-----
> >  gcc/config/arm/neon.md                       | 19 ---------
> >  gcc/config/arm/vec-common.md                 | 30 ++++++++++++++
> >  gcc/testsuite/gcc.target/arm/simd/mve-vshl.c | 62
> > ++++++++++++++++++++++++++++
> >  4 files changed, 93 insertions(+), 31 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
> >
> > diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> > index 673a83c..8bdb451 100644
> > --- a/gcc/config/arm/mve.md
> > +++ b/gcc/config/arm/mve.md
> > @@ -822,18 +822,7 @@ (define_insn "mve_vcmpneq_<supf><mode>"
> >
> >  ;;
> >  ;; [vshlq_s, vshlq_u])
> > -;;
> > -(define_insn "mve_vshlq_<supf><mode>"
> > -  [
> > -   (set (match_operand:MVE_2 0 "s_register_operand" "=w")
> > -     (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")
> > -                    (match_operand:MVE_2 2 "s_register_operand" "w")]
> > -      VSHLQ))
> > -  ]
> > -  "TARGET_HAVE_MVE"
> > -  "vshl.<supf>%#<V_sz_elem>\t%q0, %q1, %q2"
> > -  [(set_attr "type" "mve_move")
> > -])
> > +;; See vec-common.md
> >
> >  ;;
> >  ;; [vabdq_s, vabdq_u])
> > diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> > index 50220be..ac9bf74 100644
> > --- a/gcc/config/arm/neon.md
> > +++ b/gcc/config/arm/neon.md
> > @@ -845,25 +845,6 @@ (define_insn "*smax<mode>3_neon"
> >  ; generic vectorizer code.  It ends up creating a V2DI constructor with
> >  ; SImode elements.
> >
> > -(define_insn "vashl<mode>3"
> > -  [(set (match_operand:VDQIW 0 "s_register_operand" "=w,w")
> > -     (ashift:VDQIW (match_operand:VDQIW 1 "s_register_operand"
> > "w,w")
> > -                   (match_operand:VDQIW 2 "imm_lshift_or_reg_neon"
> > "w,Dm")))]
> > -  "TARGET_NEON"
> > -  {
> > -    switch (which_alternative)
> > -      {
> > -        case 0: return "vshl.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2";
> > -        case 1: return neon_output_shift_immediate ("vshl", 'i', &operands[2],
> > -                                                 <MODE>mode,
> > -
> > VALID_NEON_QREG_MODE (<MODE>mode),
> > -                                                 true);
> > -        default: gcc_unreachable ();
> > -      }
> > -  }
> > -  [(set_attr "type" "neon_shift_reg<q>, neon_shift_imm<q>")]
> > -)
> > -
> >  (define_insn "vashr<mode>3_imm"
> >    [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
> >       (ashiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w")
> > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-
> > common.md
> > index f6a79e2..3a282f0 100644
> > --- a/gcc/config/arm/vec-common.md
> > +++ b/gcc/config/arm/vec-common.md
> > @@ -229,3 +229,33 @@ (define_expand "movmisalign<mode>"
> >    if (!neon_vector_mem_operand (adjust_mem, 2, true))
> >      XEXP (adjust_mem, 0) = force_reg (Pmode, XEXP (adjust_mem, 0));
> >  })
> > +
> > +(define_insn "mve_vshlq_<supf><mode>"
> > +  [(set (match_operand:VDQIW 0 "s_register_operand" "=w,w")
> > +     (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand"
> > "w,w")
> > +                    (match_operand:VDQIW 2 "imm_lshift_or_reg_neon"
> > "w,Dm")]
> > +      VSHLQ))]
> > +  "ARM_HAVE_<MODE>_ARITH"
> > +{
> > +  switch (which_alternative)
> > +    {
> > +      case 0: return
> > "vshl.<supf>%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2";
> > +      case 1: return neon_output_shift_immediate ("vshl", 'i', &operands[2],
> > +                                               <MODE>mode,
> > +                                               VALID_NEON_QREG_MODE
> > (<MODE>mode),
> > +                                               true);
> > +      default: gcc_unreachable ();
>
> I know this is copied code, but let's clean it up by removing the switch and using the "*" syntax for the C code in alternative 1.
> Ok with those changes.

Thanks, now pushed as r11-6707

Christophe

> Thanks,
> Kyrill
>
> > +    }
> > +}
> > +  [(set_attr "type" "neon_shift_reg<q>, neon_shift_imm<q>")]
> > +)
> > +
> > +(define_expand "vashl<mode>3"
> > +  [(set (match_operand:VDQIW 0 "s_register_operand" "")
> > +     (ashift:VDQIW (match_operand:VDQIW 1 "s_register_operand" "")
> > +                   (match_operand:VDQIW 2 "imm_lshift_or_reg_neon"
> > "")))]
> > +  "ARM_HAVE_<MODE>_ARITH"
> > +{
> > +  emit_insn (gen_mve_vshlq_u<mode> (operands[0], operands[1],
> > operands[2]));
> > +  DONE;
> > +})
> > \ No newline at end of file
> > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
> > b/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
> > new file mode 100644
> > index 0000000..7a06449
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
> > @@ -0,0 +1,62 @@
> > +/* { dg-do assemble } */
> > +/* { dg-require-effective-target arm_v8_1m_mve_ok } */
> > +/* { dg-add-options arm_v8_1m_mve } */
> > +/* { dg-additional-options "-O3" } */
> > +
> > +#include <stdint.h>
> > +
> > +#define FUNC(SIGN, TYPE, BITS, NB, OP, NAME)
> >       \
> > +  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t *
> > __restrict__ dest, TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \
> > +    int i;                                                           \
> > +    for (i=0; i<NB; i++) {                                           \
> > +      dest[i] = a[i] OP b[i];                                                \
> > +    }                                                                        \
> > +}
> > +
> > +#define FUNC_IMM(SIGN, TYPE, BITS, NB, OP, NAME)
> >       \
> > +  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t *
> > __restrict__ dest, TYPE##BITS##_t *a) { \
> > +    int i;                                                           \
> > +    for (i=0; i<NB; i++) {                                           \
> > +      dest[i] = a[i] OP 5;                                           \
> > +    }                                                                        \
> > +}
> > +
> > +/* 64-bit vectors.  */
> > +FUNC(s, int, 32, 2, <<, vshl)
> > +FUNC(u, uint, 32, 2, <<, vshl)
> > +FUNC(s, int, 16, 4, <<, vshl)
> > +FUNC(u, uint, 16, 4, <<, vshl)
> > +FUNC(s, int, 8, 8, <<, vshl)
> > +FUNC(u, uint, 8, 8, <<, vshl)
> > +
> > +/* 128-bit vectors.  */
> > +FUNC(s, int, 32, 4, <<, vshl)
> > +FUNC(u, uint, 32, 4, <<, vshl)
> > +FUNC(s, int, 16, 8, <<, vshl)  /* FIXME: not vectorized */
> > +FUNC(u, uint, 16, 8, <<, vshl) /* FIXME: not vectorized */
> > +FUNC(s, int, 8, 16, <<, vshl)  /* FIXME: not vectorized */
> > +FUNC(u, uint, 8, 16, <<, vshl) /* FIXME: not vectorized */
> > +
> > +/* 64-bit vectors.  */
> > +FUNC_IMM(s, int, 32, 2, <<, vshlimm)
> > +FUNC_IMM(u, uint, 32, 2, <<, vshlimm)
> > +FUNC_IMM(s, int, 16, 4, <<, vshlimm)
> > +FUNC_IMM(u, uint, 16, 4, <<, vshlimm)
> > +FUNC_IMM(s, int, 8, 8, <<, vshlimm)
> > +FUNC_IMM(u, uint, 8, 8, <<, vshlimm)
> > +
> > +/* 128-bit vectors.  */
> > +FUNC_IMM(s, int, 32, 4, <<, vshlimm)
> > +FUNC_IMM(u, uint, 32, 4, <<, vshlimm)
> > +FUNC_IMM(s, int, 16, 8, <<, vshlimm)
> > +FUNC_IMM(u, uint, 16, 8, <<, vshlimm)
> > +FUNC_IMM(s, int, 8, 16, <<, vshlimm)
> > +FUNC_IMM(u, uint, 8, 16, <<, vshlimm)
> > +
> > +/* MVE has only 128-bit vectors, so we can vectorize only half of the
> > +   functions above.  */
> > +/* We only emit vshl.u, which is equivalent to vshl.s anyway.  */
> > +/* { dg-final { scan-assembler-times {vshl.u[0-9]+\tq[0-9]+, q[0-9]+} 2 } } */
> > +
> > +/* We emit vshl.i when the shift amount is an immediate.  */
> > +/* { dg-final { scan-assembler-times {vshl.i[0-9]+\tq[0-9]+, q[0-9]+} 6 } } */
> > --
> > 2.7.4
>

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2021-01-15 10:46 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-12-17 17:48 [PATCH 1/3] arm: Add movmisalign patterns for MVE (PR target/97875) Christophe Lyon
2020-12-17 17:48 ` [PATCH 2/3] arm: Auto-vectorization for MVE: vshl Christophe Lyon
2020-12-30 10:34   ` Christophe Lyon
2021-01-07 12:20     ` Christophe Lyon
2021-01-15  9:29       ` Christophe Lyon
2021-01-15  9:42   ` Kyrylo Tkachov
2021-01-15 10:45     ` Christophe Lyon
2020-12-17 17:48 ` [PATCH 3/3] arm: Auto-vectorization for MVE: vshr Christophe Lyon
2020-12-30 10:34   ` Christophe Lyon
2021-01-07 12:20     ` Christophe Lyon
2021-01-15  9:29       ` Christophe Lyon
2021-01-15  9:44   ` Kyrylo Tkachov
2020-12-30 10:33 ` [PATCH 1/3] arm: Add movmisalign patterns for MVE (PR target/97875) Christophe Lyon
2021-01-07 12:20   ` Christophe Lyon
2021-01-08  9:50 ` Kyrylo Tkachov
2021-01-08 11:04   ` Christophe Lyon

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).