public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH 1/9] arm: MVE: Convert vcmp[eq|ne]* in arm_mve.h to use only 's' builtin version
@ 2021-04-30 14:09 Christophe Lyon
  2021-04-30 14:09 ` [PATCH 2/9] arm: MVE: Cleanup vcmpne/vcmpeq builtins Christophe Lyon
                   ` (9 more replies)
  0 siblings, 10 replies; 35+ messages in thread
From: Christophe Lyon @ 2021-04-30 14:09 UTC (permalink / raw)
  To: gcc-patches

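There is no need to have both a signed and an unsigned version of these
builtins: equality and inequality comparisons only look at the bit
pattern of each element, so the result does not depend on signedness.
The unsigned intrinsics therefore cast their arguments and call the
signed ('s') builtin, similar to what we do for Neon in arm_neon.h.
This mechanical patch enables later cleanup patches.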

2021-03-01  Christophe Lyon  <christophe.lyon@linaro.org>

	gcc/
	* config/arm/arm_mve.h (__arm_vcmpeq*u*, __arm_vcmpne*u*): Call
	the 's' version of the builtin.
---
 gcc/config/arm/arm_mve.h | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index 3a40c6e..e4dfe91 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -3695,21 +3695,21 @@ __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpneq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  return __builtin_mve_vcmpneq_uv16qi (__a, __b);
+  return __builtin_mve_vcmpneq_sv16qi ((int8x16_t)__a, (int8x16_t)__b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpneq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  return __builtin_mve_vcmpneq_uv8hi (__a, __b);
+  return __builtin_mve_vcmpneq_sv8hi ((int16x8_t)__a, (int16x8_t)__b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpneq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  return __builtin_mve_vcmpneq_uv4si (__a, __b);
+  return __builtin_mve_vcmpneq_sv4si ((int32x4_t)__a, (int32x4_t)__b);
 }
 
 __extension__ extern __inline int8x16_t
@@ -3932,7 +3932,7 @@ __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpneq_n_u8 (uint8x16_t __a, uint8_t __b)
 {
-  return __builtin_mve_vcmpneq_n_uv16qi (__a, __b);
+  return __builtin_mve_vcmpneq_n_sv16qi ((int8x16_t)__a, (int8_t)__b);
 }
 
 __extension__ extern __inline mve_pred16_t
@@ -3953,14 +3953,14 @@ __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpeqq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  return __builtin_mve_vcmpeqq_uv16qi (__a, __b);
+  return __builtin_mve_vcmpeqq_sv16qi ((int8x16_t)__a, (int8x16_t)__b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpeqq_n_u8 (uint8x16_t __a, uint8_t __b)
 {
-  return __builtin_mve_vcmpeqq_n_uv16qi (__a, __b);
+  return __builtin_mve_vcmpeqq_n_sv16qi ((int8x16_t)__a, (int8_t)__b);
 }
 
 __extension__ extern __inline mve_pred16_t
@@ -4774,7 +4774,7 @@ __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpneq_n_u16 (uint16x8_t __a, uint16_t __b)
 {
-  return __builtin_mve_vcmpneq_n_uv8hi (__a, __b);
+  return __builtin_mve_vcmpneq_n_sv8hi ((int16x8_t)__a, (int16_t)__b);
 }
 
 __extension__ extern __inline mve_pred16_t
@@ -4795,14 +4795,14 @@ __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpeqq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  return __builtin_mve_vcmpeqq_uv8hi (__a, __b);
+  return __builtin_mve_vcmpeqq_sv8hi ((int16x8_t)__a, (int16x8_t)__b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpeqq_n_u16 (uint16x8_t __a, uint16_t __b)
 {
-  return __builtin_mve_vcmpeqq_n_uv8hi (__a, __b);
+  return __builtin_mve_vcmpeqq_n_sv8hi ((int16x8_t)__a, (int16_t)__b);
 }
 
 __extension__ extern __inline mve_pred16_t
@@ -5616,7 +5616,7 @@ __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpneq_n_u32 (uint32x4_t __a, uint32_t __b)
 {
-  return __builtin_mve_vcmpneq_n_uv4si (__a, __b);
+  return __builtin_mve_vcmpneq_n_sv4si ((int32x4_t)__a, (int32_t)__b);
 }
 
 __extension__ extern __inline mve_pred16_t
@@ -5637,14 +5637,14 @@ __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpeqq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  return __builtin_mve_vcmpeqq_uv4si (__a, __b);
+  return __builtin_mve_vcmpeqq_sv4si ((int32x4_t)__a, (int32x4_t)__b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpeqq_n_u32 (uint32x4_t __a, uint32_t __b)
 {
-  return __builtin_mve_vcmpeqq_n_uv4si (__a, __b);
+  return __builtin_mve_vcmpeqq_n_sv4si ((int32x4_t)__a, (int32_t)__b);
 }
 
 __extension__ extern __inline mve_pred16_t
-- 
2.7.4


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 2/9] arm: MVE: Cleanup vcmpne/vcmpeq builtins
  2021-04-30 14:09 [PATCH 1/9] arm: MVE: Convert vcmp[eq|ne]* in arm_mve.h to use only 's' builtin version Christophe Lyon
@ 2021-04-30 14:09 ` Christophe Lyon
  2021-05-10 11:57   ` Kyrylo Tkachov
  2021-04-30 14:09 ` [PATCH 3/9] arm: MVE: Remove _s and _u suffixes from vcmp* builtins Christophe Lyon
                   ` (8 subsequent siblings)
  9 siblings, 1 reply; 35+ messages in thread
From: Christophe Lyon @ 2021-04-30 14:09 UTC (permalink / raw)
  To: gcc-patches

After the previous patch, we no longer need to emit the unsigned
variants of vcmpneq/vcmpeqq. This patch removes them as well as the
corresponding iterator entries.

2021-03-01  Christophe Lyon  <christophe.lyon@linaro.org>

	gcc/
	* config/arm/arm_mve_builtins.def (vcmpneq_u): Remove.
	(vcmpneq_n_u): Likewise.
	(vcmpeqq_u): Likewise.
	(vcmpeqq_n_u): Likewise.
	* config/arm/iterators.md (supf): Remove VCMPNEQ_U, VCMPEQQ_U,
	VCMPEQQ_N_U and VCMPNEQ_N_U.
	* config/arm/mve.md (mve_vcmpneq): Remove <supf> iteration.
	(mve_vcmpeqq_n): Likewise.
	(mve_vcmpeqq): Likewise.
	(mve_vcmpneq_n): Likewise.

arm_mve_builtins.def: Remove vcmpneq_u, vcmpneq_n_u, vcmpeqq_u, vcmpeqq_n_u.
iterators.md: Update VCMPNEQ, VCMPEQQ, VCMPEQQ_N, VCMPNEQ_N.
mve.md: Remove vcmpneq_u, vcmpeqq_n_u, vcmpeqq_u, vcmpneq_n_u.
---
 gcc/config/arm/arm_mve_builtins.def |  4 ----
 gcc/config/arm/iterators.md         | 15 +++++++--------
 gcc/config/arm/mve.md               | 16 ++++++++--------
 3 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/gcc/config/arm/arm_mve_builtins.def b/gcc/config/arm/arm_mve_builtins.def
index 460f6ba..ee34fd1 100644
--- a/gcc/config/arm/arm_mve_builtins.def
+++ b/gcc/config/arm/arm_mve_builtins.def
@@ -90,7 +90,6 @@ VAR3 (BINOP_NONE_NONE_IMM, vshrq_n_s, v16qi, v8hi, v4si)
 VAR1 (BINOP_NONE_NONE_UNONE, vaddlvq_p_s, v4si)
 VAR1 (BINOP_UNONE_UNONE_UNONE, vaddlvq_p_u, v4si)
 VAR3 (BINOP_UNONE_NONE_NONE, vcmpneq_s, v16qi, v8hi, v4si)
-VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpneq_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_NONE_NONE_NONE, vshlq_s, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_NONE, vshlq_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_UNONE, vsubq_u, v16qi, v8hi, v4si)
@@ -118,11 +117,8 @@ VAR3 (BINOP_UNONE_UNONE_UNONE, vhsubq_n_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_UNONE, vhaddq_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_UNONE, vhaddq_n_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_UNONE, veorq_u, v16qi, v8hi, v4si)
-VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpneq_n_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_UNONE, vcmphiq_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_UNONE, vcmphiq_n_u, v16qi, v8hi, v4si)
-VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpeqq_u, v16qi, v8hi, v4si)
-VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpeqq_n_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_n_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_UNONE, vbicq_u, v16qi, v8hi, v4si)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 8fb723e..0aba93f 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -1279,13 +1279,12 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U "u") (VREV16Q_S "s")
 		       (VCREATEQ_U "u") (VCREATEQ_S "s") (VSHRQ_N_S "s")
 		       (VSHRQ_N_U "u") (VCVTQ_N_FROM_F_S "s") (VSHLQ_U "u")
 		       (VCVTQ_N_FROM_F_U "u") (VADDLVQ_P_S "s") (VSHLQ_S "s")
-		       (VADDLVQ_P_U "u") (VCMPNEQ_U "u") (VCMPNEQ_S "s")
+		       (VADDLVQ_P_U "u") (VCMPNEQ_S "s")
 		       (VABDQ_M_S "s") (VABDQ_M_U "u") (VABDQ_S "s")
 		       (VABDQ_U "u") (VADDQ_N_S "s") (VADDQ_N_U "u")
 		       (VADDVQ_P_S "s")	(VADDVQ_P_U "u") (VBRSRQ_N_S "s")
-		       (VBRSRQ_N_U "u") (VCMPEQQ_S "s") (VCMPEQQ_U "u")
-		       (VCMPEQQ_N_S "s") (VCMPEQQ_N_U "u") (VCMPNEQ_N_S "s")
-		       (VCMPNEQ_N_U "u")
+		       (VBRSRQ_N_U "u") (VCMPEQQ_S "s")
+		       (VCMPEQQ_N_S "s") (VCMPNEQ_N_S "s")
 		       (VHADDQ_N_S "s") (VHADDQ_N_U "u") (VHADDQ_S "s")
 		       (VHADDQ_U "u") (VHSUBQ_N_S "s")	(VHSUBQ_N_U "u")
 		       (VHSUBQ_S "s") (VMAXQ_S "s") (VMAXQ_U "u") (VHSUBQ_U "u")
@@ -1541,16 +1540,16 @@ (define_int_iterator VCREATEQ [VCREATEQ_U VCREATEQ_S])
 (define_int_iterator VSHRQ_N [VSHRQ_N_S VSHRQ_N_U])
 (define_int_iterator VCVTQ_N_FROM_F [VCVTQ_N_FROM_F_S VCVTQ_N_FROM_F_U])
 (define_int_iterator VADDLVQ_P [VADDLVQ_P_S VADDLVQ_P_U])
-(define_int_iterator VCMPNEQ [VCMPNEQ_U VCMPNEQ_S])
+(define_int_iterator VCMPNEQ [VCMPNEQ_S])
 (define_int_iterator VSHLQ [VSHLQ_S VSHLQ_U])
 (define_int_iterator VABDQ [VABDQ_S VABDQ_U])
 (define_int_iterator VADDQ_N [VADDQ_N_S VADDQ_N_U])
 (define_int_iterator VADDVAQ [VADDVAQ_S VADDVAQ_U])
 (define_int_iterator VADDVQ_P [VADDVQ_P_U VADDVQ_P_S])
 (define_int_iterator VBRSRQ_N [VBRSRQ_N_U VBRSRQ_N_S])
-(define_int_iterator VCMPEQQ [VCMPEQQ_U VCMPEQQ_S])
-(define_int_iterator VCMPEQQ_N [VCMPEQQ_N_S VCMPEQQ_N_U])
-(define_int_iterator VCMPNEQ_N [VCMPNEQ_N_U VCMPNEQ_N_S])
+(define_int_iterator VCMPEQQ [VCMPEQQ_S])
+(define_int_iterator VCMPEQQ_N [VCMPEQQ_N_S])
+(define_int_iterator VCMPNEQ_N [VCMPNEQ_N_S])
 (define_int_iterator VHADDQ [VHADDQ_S VHADDQ_U])
 (define_int_iterator VHADDQ_N [VHADDQ_N_U VHADDQ_N_S])
 (define_int_iterator VHSUBQ [VHSUBQ_S VHSUBQ_U])
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 7467d5f..b04c22b 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -836,9 +836,9 @@ (define_insn "mve_vaddlvq_p_<supf>v4si"
    (set_attr "length""8")])
 
 ;;
-;; [vcmpneq_u, vcmpneq_s])
+;; [vcmpneq_s])
 ;;
-(define_insn "mve_vcmpneq_<supf><mode>"
+(define_insn "mve_vcmpneq_s<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
 	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
@@ -1035,9 +1035,9 @@ (define_insn "mve_vcmpcsq_u<mode>"
 ])
 
 ;;
-;; [vcmpeqq_n_s, vcmpeqq_n_u])
+;; [vcmpeqq_n_s])
 ;;
-(define_insn "mve_vcmpeqq_n_<supf><mode>"
+(define_insn "mve_vcmpeqq_n_s<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
 	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
@@ -1050,9 +1050,9 @@ (define_insn "mve_vcmpeqq_n_<supf><mode>"
 ])
 
 ;;
-;; [vcmpeqq_u, vcmpeqq_s])
+;; [vcmpeqq_s])
 ;;
-(define_insn "mve_vcmpeqq_<supf><mode>"
+(define_insn "mve_vcmpeqq_s<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
 	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
@@ -1215,9 +1215,9 @@ (define_insn "mve_vcmpltq_s<mode>"
 ])
 
 ;;
-;; [vcmpneq_n_u, vcmpneq_n_s])
+;; [vcmpneq_n_s])
 ;;
-(define_insn "mve_vcmpneq_n_<supf><mode>"
+(define_insn "mve_vcmpneq_n_s<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
 	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
-- 
2.7.4


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 3/9] arm: MVE: Remove _s and _u suffixes from vcmp* builtins.
  2021-04-30 14:09 [PATCH 1/9] arm: MVE: Convert vcmp[eq|ne]* in arm_mve.h to use only 's' builtin version Christophe Lyon
  2021-04-30 14:09 ` [PATCH 2/9] arm: MVE: Cleanup vcmpne/vcmpeq builtins Christophe Lyon
@ 2021-04-30 14:09 ` Christophe Lyon
  2021-05-10 11:58   ` Kyrylo Tkachov
  2021-04-30 14:09 ` [PATCH 4/9] arm: MVE: Factorize all vcmp* integer patterns Christophe Lyon
                   ` (7 subsequent siblings)
  9 siblings, 1 reply; 35+ messages in thread
From: Christophe Lyon @ 2021-04-30 14:09 UTC (permalink / raw)
  To: gcc-patches

This patch brings more unification in the vector comparison builtins,
by removing the now-redundant 's' (signed) and 'u' (unsigned) suffixes:
each comparison builtin now exists in a single variant, so the suffix
no longer distinguishes anything.

2021-03-01  Christophe Lyon  <christophe.lyon@linaro.org>

	gcc/
	* config/arm/arm_mve.h (__arm_vcmp*): Remove 's' and 'u' suffixes
	from the builtin names.
	* config/arm/arm_mve_builtins.def (vcmp*): Remove 's' and 'u'
	suffixes.
	* config/arm/mve.md (mve_vcmp*): Remove 's' and 'u' suffixes in
	pattern names.
---
 gcc/config/arm/arm_mve.h            | 120 ++++++++++++++++++------------------
 gcc/config/arm/arm_mve_builtins.def |  32 +++++-----
 gcc/config/arm/mve.md               |  64 +++++++++----------
 3 files changed, 108 insertions(+), 108 deletions(-)

diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index e4dfe91..5d78269 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -3674,42 +3674,42 @@ __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpneq_s8 (int8x16_t __a, int8x16_t __b)
 {
-  return __builtin_mve_vcmpneq_sv16qi (__a, __b);
+  return __builtin_mve_vcmpneq_v16qi (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpneq_s16 (int16x8_t __a, int16x8_t __b)
 {
-  return __builtin_mve_vcmpneq_sv8hi (__a, __b);
+  return __builtin_mve_vcmpneq_v8hi (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpneq_s32 (int32x4_t __a, int32x4_t __b)
 {
-  return __builtin_mve_vcmpneq_sv4si (__a, __b);
+  return __builtin_mve_vcmpneq_v4si (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpneq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  return __builtin_mve_vcmpneq_sv16qi ((int8x16_t)__a, (int8x16_t)__b);
+  return __builtin_mve_vcmpneq_v16qi ((int8x16_t)__a, (int8x16_t)__b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpneq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  return __builtin_mve_vcmpneq_sv8hi ((int16x8_t)__a, (int16x8_t)__b);
+  return __builtin_mve_vcmpneq_v8hi ((int16x8_t)__a, (int16x8_t)__b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpneq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  return __builtin_mve_vcmpneq_sv4si ((int32x4_t)__a, (int32x4_t)__b);
+  return __builtin_mve_vcmpneq_v4si ((int32x4_t)__a, (int32x4_t)__b);
 }
 
 __extension__ extern __inline int8x16_t
@@ -3932,49 +3932,49 @@ __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpneq_n_u8 (uint8x16_t __a, uint8_t __b)
 {
-  return __builtin_mve_vcmpneq_n_sv16qi ((int8x16_t)__a, (int8_t)__b);
+  return __builtin_mve_vcmpneq_n_v16qi ((int8x16_t)__a, (int8_t)__b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmphiq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  return __builtin_mve_vcmphiq_uv16qi (__a, __b);
+  return __builtin_mve_vcmphiq_v16qi (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmphiq_n_u8 (uint8x16_t __a, uint8_t __b)
 {
-  return __builtin_mve_vcmphiq_n_uv16qi (__a, __b);
+  return __builtin_mve_vcmphiq_n_v16qi (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpeqq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  return __builtin_mve_vcmpeqq_sv16qi ((int8x16_t)__a, (int8x16_t)__b);
+  return __builtin_mve_vcmpeqq_v16qi ((int8x16_t)__a, (int8x16_t)__b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpeqq_n_u8 (uint8x16_t __a, uint8_t __b)
 {
-  return __builtin_mve_vcmpeqq_n_sv16qi ((int8x16_t)__a, (int8_t)__b);
+  return __builtin_mve_vcmpeqq_n_v16qi ((int8x16_t)__a, (int8_t)__b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpcsq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  return __builtin_mve_vcmpcsq_uv16qi (__a, __b);
+  return __builtin_mve_vcmpcsq_v16qi (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpcsq_n_u8 (uint8x16_t __a, uint8_t __b)
 {
-  return __builtin_mve_vcmpcsq_n_uv16qi (__a, __b);
+  return __builtin_mve_vcmpcsq_n_v16qi (__a, __b);
 }
 
 __extension__ extern __inline uint8x16_t
@@ -4144,77 +4144,77 @@ __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpneq_n_s8 (int8x16_t __a, int8_t __b)
 {
-  return __builtin_mve_vcmpneq_n_sv16qi (__a, __b);
+  return __builtin_mve_vcmpneq_n_v16qi (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpltq_s8 (int8x16_t __a, int8x16_t __b)
 {
-  return __builtin_mve_vcmpltq_sv16qi (__a, __b);
+  return __builtin_mve_vcmpltq_v16qi (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpltq_n_s8 (int8x16_t __a, int8_t __b)
 {
-  return __builtin_mve_vcmpltq_n_sv16qi (__a, __b);
+  return __builtin_mve_vcmpltq_n_v16qi (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpleq_s8 (int8x16_t __a, int8x16_t __b)
 {
-  return __builtin_mve_vcmpleq_sv16qi (__a, __b);
+  return __builtin_mve_vcmpleq_v16qi (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpleq_n_s8 (int8x16_t __a, int8_t __b)
 {
-  return __builtin_mve_vcmpleq_n_sv16qi (__a, __b);
+  return __builtin_mve_vcmpleq_n_v16qi (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpgtq_s8 (int8x16_t __a, int8x16_t __b)
 {
-  return __builtin_mve_vcmpgtq_sv16qi (__a, __b);
+  return __builtin_mve_vcmpgtq_v16qi (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpgtq_n_s8 (int8x16_t __a, int8_t __b)
 {
-  return __builtin_mve_vcmpgtq_n_sv16qi (__a, __b);
+  return __builtin_mve_vcmpgtq_n_v16qi (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpgeq_s8 (int8x16_t __a, int8x16_t __b)
 {
-  return __builtin_mve_vcmpgeq_sv16qi (__a, __b);
+  return __builtin_mve_vcmpgeq_v16qi (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpgeq_n_s8 (int8x16_t __a, int8_t __b)
 {
-  return __builtin_mve_vcmpgeq_n_sv16qi (__a, __b);
+  return __builtin_mve_vcmpgeq_n_v16qi (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpeqq_s8 (int8x16_t __a, int8x16_t __b)
 {
-  return __builtin_mve_vcmpeqq_sv16qi (__a, __b);
+  return __builtin_mve_vcmpeqq_v16qi (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpeqq_n_s8 (int8x16_t __a, int8_t __b)
 {
-  return __builtin_mve_vcmpeqq_n_sv16qi (__a, __b);
+  return __builtin_mve_vcmpeqq_n_v16qi (__a, __b);
 }
 
 __extension__ extern __inline uint8x16_t
@@ -4774,49 +4774,49 @@ __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpneq_n_u16 (uint16x8_t __a, uint16_t __b)
 {
-  return __builtin_mve_vcmpneq_n_sv8hi ((int16x8_t)__a, (int16_t)__b);
+  return __builtin_mve_vcmpneq_n_v8hi ((int16x8_t)__a, (int16_t)__b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmphiq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  return __builtin_mve_vcmphiq_uv8hi (__a, __b);
+  return __builtin_mve_vcmphiq_v8hi (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmphiq_n_u16 (uint16x8_t __a, uint16_t __b)
 {
-  return __builtin_mve_vcmphiq_n_uv8hi (__a, __b);
+  return __builtin_mve_vcmphiq_n_v8hi (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpeqq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  return __builtin_mve_vcmpeqq_sv8hi ((int16x8_t)__a, (int16x8_t)__b);
+  return __builtin_mve_vcmpeqq_v8hi ((int16x8_t)__a, (int16x8_t)__b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpeqq_n_u16 (uint16x8_t __a, uint16_t __b)
 {
-  return __builtin_mve_vcmpeqq_n_sv8hi ((int16x8_t)__a, (int16_t)__b);
+  return __builtin_mve_vcmpeqq_n_v8hi ((int16x8_t)__a, (int16_t)__b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpcsq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  return __builtin_mve_vcmpcsq_uv8hi (__a, __b);
+  return __builtin_mve_vcmpcsq_v8hi (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpcsq_n_u16 (uint16x8_t __a, uint16_t __b)
 {
-  return __builtin_mve_vcmpcsq_n_uv8hi (__a, __b);
+  return __builtin_mve_vcmpcsq_n_v8hi (__a, __b);
 }
 
 __extension__ extern __inline uint16x8_t
@@ -4986,77 +4986,77 @@ __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpneq_n_s16 (int16x8_t __a, int16_t __b)
 {
-  return __builtin_mve_vcmpneq_n_sv8hi (__a, __b);
+  return __builtin_mve_vcmpneq_n_v8hi (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpltq_s16 (int16x8_t __a, int16x8_t __b)
 {
-  return __builtin_mve_vcmpltq_sv8hi (__a, __b);
+  return __builtin_mve_vcmpltq_v8hi (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpltq_n_s16 (int16x8_t __a, int16_t __b)
 {
-  return __builtin_mve_vcmpltq_n_sv8hi (__a, __b);
+  return __builtin_mve_vcmpltq_n_v8hi (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpleq_s16 (int16x8_t __a, int16x8_t __b)
 {
-  return __builtin_mve_vcmpleq_sv8hi (__a, __b);
+  return __builtin_mve_vcmpleq_v8hi (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpleq_n_s16 (int16x8_t __a, int16_t __b)
 {
-  return __builtin_mve_vcmpleq_n_sv8hi (__a, __b);
+  return __builtin_mve_vcmpleq_n_v8hi (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpgtq_s16 (int16x8_t __a, int16x8_t __b)
 {
-  return __builtin_mve_vcmpgtq_sv8hi (__a, __b);
+  return __builtin_mve_vcmpgtq_v8hi (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpgtq_n_s16 (int16x8_t __a, int16_t __b)
 {
-  return __builtin_mve_vcmpgtq_n_sv8hi (__a, __b);
+  return __builtin_mve_vcmpgtq_n_v8hi (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpgeq_s16 (int16x8_t __a, int16x8_t __b)
 {
-  return __builtin_mve_vcmpgeq_sv8hi (__a, __b);
+  return __builtin_mve_vcmpgeq_v8hi (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpgeq_n_s16 (int16x8_t __a, int16_t __b)
 {
-  return __builtin_mve_vcmpgeq_n_sv8hi (__a, __b);
+  return __builtin_mve_vcmpgeq_n_v8hi (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpeqq_s16 (int16x8_t __a, int16x8_t __b)
 {
-  return __builtin_mve_vcmpeqq_sv8hi (__a, __b);
+  return __builtin_mve_vcmpeqq_v8hi (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpeqq_n_s16 (int16x8_t __a, int16_t __b)
 {
-  return __builtin_mve_vcmpeqq_n_sv8hi (__a, __b);
+  return __builtin_mve_vcmpeqq_n_v8hi (__a, __b);
 }
 
 __extension__ extern __inline uint16x8_t
@@ -5616,49 +5616,49 @@ __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpneq_n_u32 (uint32x4_t __a, uint32_t __b)
 {
-  return __builtin_mve_vcmpneq_n_sv4si ((int32x4_t)__a, (int32_t)__b);
+  return __builtin_mve_vcmpneq_n_v4si ((int32x4_t)__a, (int32_t)__b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmphiq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  return __builtin_mve_vcmphiq_uv4si (__a, __b);
+  return __builtin_mve_vcmphiq_v4si (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmphiq_n_u32 (uint32x4_t __a, uint32_t __b)
 {
-  return __builtin_mve_vcmphiq_n_uv4si (__a, __b);
+  return __builtin_mve_vcmphiq_n_v4si (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpeqq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  return __builtin_mve_vcmpeqq_sv4si ((int32x4_t)__a, (int32x4_t)__b);
+  return __builtin_mve_vcmpeqq_v4si ((int32x4_t)__a, (int32x4_t)__b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpeqq_n_u32 (uint32x4_t __a, uint32_t __b)
 {
-  return __builtin_mve_vcmpeqq_n_sv4si ((int32x4_t)__a, (int32_t)__b);
+  return __builtin_mve_vcmpeqq_n_v4si ((int32x4_t)__a, (int32_t)__b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpcsq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  return __builtin_mve_vcmpcsq_uv4si (__a, __b);
+  return __builtin_mve_vcmpcsq_v4si (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpcsq_n_u32 (uint32x4_t __a, uint32_t __b)
 {
-  return __builtin_mve_vcmpcsq_n_uv4si (__a, __b);
+  return __builtin_mve_vcmpcsq_n_v4si (__a, __b);
 }
 
 __extension__ extern __inline uint32x4_t
@@ -5828,77 +5828,77 @@ __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpneq_n_s32 (int32x4_t __a, int32_t __b)
 {
-  return __builtin_mve_vcmpneq_n_sv4si (__a, __b);
+  return __builtin_mve_vcmpneq_n_v4si (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpltq_s32 (int32x4_t __a, int32x4_t __b)
 {
-  return __builtin_mve_vcmpltq_sv4si (__a, __b);
+  return __builtin_mve_vcmpltq_v4si (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpltq_n_s32 (int32x4_t __a, int32_t __b)
 {
-  return __builtin_mve_vcmpltq_n_sv4si (__a, __b);
+  return __builtin_mve_vcmpltq_n_v4si (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpleq_s32 (int32x4_t __a, int32x4_t __b)
 {
-  return __builtin_mve_vcmpleq_sv4si (__a, __b);
+  return __builtin_mve_vcmpleq_v4si (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpleq_n_s32 (int32x4_t __a, int32_t __b)
 {
-  return __builtin_mve_vcmpleq_n_sv4si (__a, __b);
+  return __builtin_mve_vcmpleq_n_v4si (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpgtq_s32 (int32x4_t __a, int32x4_t __b)
 {
-  return __builtin_mve_vcmpgtq_sv4si (__a, __b);
+  return __builtin_mve_vcmpgtq_v4si (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpgtq_n_s32 (int32x4_t __a, int32_t __b)
 {
-  return __builtin_mve_vcmpgtq_n_sv4si (__a, __b);
+  return __builtin_mve_vcmpgtq_n_v4si (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpgeq_s32 (int32x4_t __a, int32x4_t __b)
 {
-  return __builtin_mve_vcmpgeq_sv4si (__a, __b);
+  return __builtin_mve_vcmpgeq_v4si (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpgeq_n_s32 (int32x4_t __a, int32_t __b)
 {
-  return __builtin_mve_vcmpgeq_n_sv4si (__a, __b);
+  return __builtin_mve_vcmpgeq_n_v4si (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpeqq_s32 (int32x4_t __a, int32x4_t __b)
 {
-  return __builtin_mve_vcmpeqq_sv4si (__a, __b);
+  return __builtin_mve_vcmpeqq_v4si (__a, __b);
 }
 
 __extension__ extern __inline mve_pred16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmpeqq_n_s32 (int32x4_t __a, int32_t __b)
 {
-  return __builtin_mve_vcmpeqq_n_sv4si (__a, __b);
+  return __builtin_mve_vcmpeqq_n_v4si (__a, __b);
 }
 
 __extension__ extern __inline uint32x4_t
diff --git a/gcc/config/arm/arm_mve_builtins.def b/gcc/config/arm/arm_mve_builtins.def
index ee34fd1..e9b5b28 100644
--- a/gcc/config/arm/arm_mve_builtins.def
+++ b/gcc/config/arm/arm_mve_builtins.def
@@ -89,7 +89,7 @@ VAR3 (BINOP_UNONE_UNONE_IMM, vshrq_n_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_NONE_NONE_IMM, vshrq_n_s, v16qi, v8hi, v4si)
 VAR1 (BINOP_NONE_NONE_UNONE, vaddlvq_p_s, v4si)
 VAR1 (BINOP_UNONE_UNONE_UNONE, vaddlvq_p_u, v4si)
-VAR3 (BINOP_UNONE_NONE_NONE, vcmpneq_s, v16qi, v8hi, v4si)
+VAR3 (BINOP_UNONE_NONE_NONE, vcmpneq_, v16qi, v8hi, v4si)
 VAR3 (BINOP_NONE_NONE_NONE, vshlq_s, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_NONE, vshlq_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_UNONE, vsubq_u, v16qi, v8hi, v4si)
@@ -117,10 +117,10 @@ VAR3 (BINOP_UNONE_UNONE_UNONE, vhsubq_n_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_UNONE, vhaddq_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_UNONE, vhaddq_n_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_UNONE, veorq_u, v16qi, v8hi, v4si)
-VAR3 (BINOP_UNONE_UNONE_UNONE, vcmphiq_u, v16qi, v8hi, v4si)
-VAR3 (BINOP_UNONE_UNONE_UNONE, vcmphiq_n_u, v16qi, v8hi, v4si)
-VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_u, v16qi, v8hi, v4si)
-VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_n_u, v16qi, v8hi, v4si)
+VAR3 (BINOP_UNONE_UNONE_UNONE, vcmphiq_, v16qi, v8hi, v4si)
+VAR3 (BINOP_UNONE_UNONE_UNONE, vcmphiq_n_, v16qi, v8hi, v4si)
+VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_, v16qi, v8hi, v4si)
+VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_n_, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_UNONE, vbicq_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_UNONE, vandq_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_UNONE, vaddvq_p_u, v16qi, v8hi, v4si)
@@ -142,17 +142,17 @@ VAR3 (BINOP_UNONE_UNONE_NONE, vbrsrq_n_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_IMM, vshlq_n_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_IMM, vrshrq_n_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_IMM, vqshlq_n_u, v16qi, v8hi, v4si)
-VAR3 (BINOP_UNONE_NONE_NONE, vcmpneq_n_s, v16qi, v8hi, v4si)
-VAR3 (BINOP_UNONE_NONE_NONE, vcmpltq_s, v16qi, v8hi, v4si)
-VAR3 (BINOP_UNONE_NONE_NONE, vcmpltq_n_s, v16qi, v8hi, v4si)
-VAR3 (BINOP_UNONE_NONE_NONE, vcmpleq_s, v16qi, v8hi, v4si)
-VAR3 (BINOP_UNONE_NONE_NONE, vcmpleq_n_s, v16qi, v8hi, v4si)
-VAR3 (BINOP_UNONE_NONE_NONE, vcmpgtq_s, v16qi, v8hi, v4si)
-VAR3 (BINOP_UNONE_NONE_NONE, vcmpgtq_n_s, v16qi, v8hi, v4si)
-VAR3 (BINOP_UNONE_NONE_NONE, vcmpgeq_s, v16qi, v8hi, v4si)
-VAR3 (BINOP_UNONE_NONE_NONE, vcmpgeq_n_s, v16qi, v8hi, v4si)
-VAR3 (BINOP_UNONE_NONE_NONE, vcmpeqq_s, v16qi, v8hi, v4si)
-VAR3 (BINOP_UNONE_NONE_NONE, vcmpeqq_n_s, v16qi, v8hi, v4si)
+VAR3 (BINOP_UNONE_NONE_NONE, vcmpneq_n_, v16qi, v8hi, v4si)
+VAR3 (BINOP_UNONE_NONE_NONE, vcmpltq_, v16qi, v8hi, v4si)
+VAR3 (BINOP_UNONE_NONE_NONE, vcmpltq_n_, v16qi, v8hi, v4si)
+VAR3 (BINOP_UNONE_NONE_NONE, vcmpleq_, v16qi, v8hi, v4si)
+VAR3 (BINOP_UNONE_NONE_NONE, vcmpleq_n_, v16qi, v8hi, v4si)
+VAR3 (BINOP_UNONE_NONE_NONE, vcmpgtq_, v16qi, v8hi, v4si)
+VAR3 (BINOP_UNONE_NONE_NONE, vcmpgtq_n_, v16qi, v8hi, v4si)
+VAR3 (BINOP_UNONE_NONE_NONE, vcmpgeq_, v16qi, v8hi, v4si)
+VAR3 (BINOP_UNONE_NONE_NONE, vcmpgeq_n_, v16qi, v8hi, v4si)
+VAR3 (BINOP_UNONE_NONE_NONE, vcmpeqq_, v16qi, v8hi, v4si)
+VAR3 (BINOP_UNONE_NONE_NONE, vcmpeqq_n_, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_NONE_IMM, vqshluq_n_s, v16qi, v8hi, v4si)
 VAR3 (BINOP_NONE_NONE_UNONE, vaddvq_p_s, v16qi, v8hi, v4si)
 VAR3 (BINOP_NONE_NONE_NONE, vsubq_s, v16qi, v8hi, v4si)
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index b04c22b..e9f095d 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -836,9 +836,9 @@ (define_insn "mve_vaddlvq_p_<supf>v4si"
    (set_attr "length""8")])
 
 ;;
-;; [vcmpneq_s])
+;; [vcmpneq_])
 ;;
-(define_insn "mve_vcmpneq_s<mode>"
+(define_insn "mve_vcmpneq_<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
 	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
@@ -1005,9 +1005,9 @@ (define_expand "cadd<rot><mode>3"
 )
 
 ;;
-;; [vcmpcsq_n_u])
+;; [vcmpcsq_n_])
 ;;
-(define_insn "mve_vcmpcsq_n_u<mode>"
+(define_insn "mve_vcmpcsq_n_<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
 	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
@@ -1020,9 +1020,9 @@ (define_insn "mve_vcmpcsq_n_u<mode>"
 ])
 
 ;;
-;; [vcmpcsq_u])
+;; [vcmpcsq_])
 ;;
-(define_insn "mve_vcmpcsq_u<mode>"
+(define_insn "mve_vcmpcsq_<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
 	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
@@ -1035,9 +1035,9 @@ (define_insn "mve_vcmpcsq_u<mode>"
 ])
 
 ;;
-;; [vcmpeqq_n_s])
+;; [vcmpeqq_n_])
 ;;
-(define_insn "mve_vcmpeqq_n_s<mode>"
+(define_insn "mve_vcmpeqq_n_<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
 	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
@@ -1050,9 +1050,9 @@ (define_insn "mve_vcmpeqq_n_s<mode>"
 ])
 
 ;;
-;; [vcmpeqq_s])
+;; [vcmpeqq_])
 ;;
-(define_insn "mve_vcmpeqq_s<mode>"
+(define_insn "mve_vcmpeqq_<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
 	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
@@ -1065,9 +1065,9 @@ (define_insn "mve_vcmpeqq_s<mode>"
 ])
 
 ;;
-;; [vcmpgeq_n_s])
+;; [vcmpgeq_n_])
 ;;
-(define_insn "mve_vcmpgeq_n_s<mode>"
+(define_insn "mve_vcmpgeq_n_<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
 	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
@@ -1080,9 +1080,9 @@ (define_insn "mve_vcmpgeq_n_s<mode>"
 ])
 
 ;;
-;; [vcmpgeq_s])
+;; [vcmpgeq_])
 ;;
-(define_insn "mve_vcmpgeq_s<mode>"
+(define_insn "mve_vcmpgeq_<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
 	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
@@ -1095,9 +1095,9 @@ (define_insn "mve_vcmpgeq_s<mode>"
 ])
 
 ;;
-;; [vcmpgtq_n_s])
+;; [vcmpgtq_n_])
 ;;
-(define_insn "mve_vcmpgtq_n_s<mode>"
+(define_insn "mve_vcmpgtq_n_<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
 	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
@@ -1110,9 +1110,9 @@ (define_insn "mve_vcmpgtq_n_s<mode>"
 ])
 
 ;;
-;; [vcmpgtq_s])
+;; [vcmpgtq_])
 ;;
-(define_insn "mve_vcmpgtq_s<mode>"
+(define_insn "mve_vcmpgtq_<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
 	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
@@ -1125,9 +1125,9 @@ (define_insn "mve_vcmpgtq_s<mode>"
 ])
 
 ;;
-;; [vcmphiq_n_u])
+;; [vcmphiq_n_])
 ;;
-(define_insn "mve_vcmphiq_n_u<mode>"
+(define_insn "mve_vcmphiq_n_<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
 	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
@@ -1140,9 +1140,9 @@ (define_insn "mve_vcmphiq_n_u<mode>"
 ])
 
 ;;
-;; [vcmphiq_u])
+;; [vcmphiq_])
 ;;
-(define_insn "mve_vcmphiq_u<mode>"
+(define_insn "mve_vcmphiq_<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
 	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
@@ -1155,9 +1155,9 @@ (define_insn "mve_vcmphiq_u<mode>"
 ])
 
 ;;
-;; [vcmpleq_n_s])
+;; [vcmpleq_n_])
 ;;
-(define_insn "mve_vcmpleq_n_s<mode>"
+(define_insn "mve_vcmpleq_n_<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
 	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
@@ -1170,9 +1170,9 @@ (define_insn "mve_vcmpleq_n_s<mode>"
 ])
 
 ;;
-;; [vcmpleq_s])
+;; [vcmpleq_])
 ;;
-(define_insn "mve_vcmpleq_s<mode>"
+(define_insn "mve_vcmpleq_<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
 	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
@@ -1185,9 +1185,9 @@ (define_insn "mve_vcmpleq_s<mode>"
 ])
 
 ;;
-;; [vcmpltq_n_s])
+;; [vcmpltq_n_])
 ;;
-(define_insn "mve_vcmpltq_n_s<mode>"
+(define_insn "mve_vcmpltq_n_<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
 	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
@@ -1200,9 +1200,9 @@ (define_insn "mve_vcmpltq_n_s<mode>"
 ])
 
 ;;
-;; [vcmpltq_s])
+;; [vcmpltq_])
 ;;
-(define_insn "mve_vcmpltq_s<mode>"
+(define_insn "mve_vcmpltq_<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
 	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
@@ -1215,9 +1215,9 @@ (define_insn "mve_vcmpltq_s<mode>"
 ])
 
 ;;
-;; [vcmpneq_n_s])
+;; [vcmpneq_n_])
 ;;
-(define_insn "mve_vcmpneq_n_s<mode>"
+(define_insn "mve_vcmpneq_n_<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
 	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
-- 
2.7.4


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 4/9] arm: MVE: Factorize all vcmp* integer patterns
  2021-04-30 14:09 [PATCH 1/9] arm: MVE: Convert vcmp[eq|ne]* in arm_mve.h to use only 's' builtin version Christophe Lyon
  2021-04-30 14:09 ` [PATCH 2/9] arm: MVE: Cleanup vcmpne/vcmpeq builtins Christophe Lyon
  2021-04-30 14:09 ` [PATCH 3/9] arm: MVE: Remove _s and _u suffixes from vcmp* builtins Christophe Lyon
@ 2021-04-30 14:09 ` Christophe Lyon
  2021-05-10 11:59   ` Kyrylo Tkachov
  2021-04-30 14:09 ` [PATCH 5/9] arm: MVE: Factorize vcmp_*f* Christophe Lyon
                   ` (6 subsequent siblings)
  9 siblings, 1 reply; 35+ messages in thread
From: Christophe Lyon @ 2021-04-30 14:09 UTC (permalink / raw)
  To: gcc-patches

After removing the signed and unsigned suffixes in the previous
patches, we can now factorize the vcmp* patterns: there is no longer
an asymmetry where operators do not have the same set of signed and
unsigned variants.

This will make maintenance easier.

MVE has a different set of vector comparison operators than Neon,
so we have to introduce dedicated iterators.

2021-03-01  Christophe Lyon  <christophe.lyon@linaro.org>

	gcc/
	* config/arm/iterators.md (MVE_COMPARISONS): New.
	(mve_cmp_op): New.
	(mve_cmp_type): New.
	* config/arm/mve.md (mve_vcmp<mve_cmp_op>q_<mode>): New, merge all
	mve_vcmp patterns.
	(mve_vcmpneq_<mode>, mve_vcmpcsq_n_<mode>, mve_vcmpcsq_<mode>)
	(mve_vcmpeqq_n_<mode>, mve_vcmpeqq_<mode>, mve_vcmpgeq_n_<mode>)
	(mve_vcmpgeq_<mode>, mve_vcmpgtq_n_<mode>, mve_vcmpgtq_<mode>)
	(mve_vcmphiq_n_<mode>, mve_vcmphiq_<mode>, mve_vcmpleq_n_<mode>)
	(mve_vcmpleq_<mode>, mve_vcmpltq_n_<mode>, mve_vcmpltq_<mode>)
	(mve_vcmpneq_n_<mode>): Remove.
---
 gcc/config/arm/iterators.md |   8 ++
 gcc/config/arm/mve.md       | 250 ++++----------------------------------------
 2 files changed, 27 insertions(+), 231 deletions(-)

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 0aba93f..29347f7 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -285,6 +285,8 @@ (define_code_iterator GTUGEU [gtu geu])
 
 ;; Comparisons for vc<cmp>
 (define_code_iterator COMPARISONS [eq gt ge le lt])
+;; Comparisons for MVE
+(define_code_iterator MVE_COMPARISONS [eq ge geu gt gtu le lt ne])
 
 ;; A list of ...
 (define_code_iterator IOR_XOR [ior xor])
@@ -336,8 +338,14 @@ (define_code_attr arith_shift_insn
 (define_code_attr cmp_op [(eq "eq") (gt "gt") (ge "ge") (lt "lt") (le "le")
                           (gtu "gt") (geu "ge")])
 
+(define_code_attr mve_cmp_op [(eq "eq") (gt "gt") (ge "ge") (lt "lt") (le "le")
+                              (gtu "hi") (geu "cs") (ne "ne")])
+
 (define_code_attr cmp_type [(eq "i") (gt "s") (ge "s") (lt "s") (le "s")])
 
+(define_code_attr mve_cmp_type [(eq "i") (gt "s") (ge "s") (lt "s") (le "s")
+                                (gtu "u") (geu "u") (ne "i")])
+
 (define_code_attr vfml_op [(plus "a") (minus "s")])
 
 (define_code_attr ss_op [(ss_plus "qadd") (ss_minus "qsub")])
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index e9f095d..40baff7 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -836,17 +836,30 @@ (define_insn "mve_vaddlvq_p_<supf>v4si"
    (set_attr "length""8")])
 
 ;;
-;; [vcmpneq_])
+;; [vcmpneq_, vcmpcsq_, vcmpeqq_, vcmpgeq_, vcmpgtq_, vcmphiq_, vcmpleq_, vcmpltq_])
 ;;
-(define_insn "mve_vcmpneq_<mode>"
+(define_insn "mve_vcmp<mve_cmp_op>q_<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
-	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
-		    (match_operand:MVE_2 2 "s_register_operand" "w")]
-	 VCMPNEQ))
+	(MVE_COMPARISONS:HI (match_operand:MVE_2 1 "s_register_operand" "w")
+		    (match_operand:MVE_2 2 "s_register_operand" "w")))
+  ]
+  "TARGET_HAVE_MVE"
+  "vcmp.<mve_cmp_type>%#<V_sz_elem>  <mve_cmp_op>, %q1, %q2"
+  [(set_attr "type" "mve_move")
+])
+
+;;
+;; [vcmpcsq_n_, vcmpeqq_n_, vcmpgeq_n_, vcmpgtq_n_, vcmphiq_n_, vcmpleq_n_, vcmpltq_n_, vcmpneq_n_])
+;;
+(define_insn "mve_vcmp<mve_cmp_op>q_n_<mode>"
+  [
+   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
+	(MVE_COMPARISONS:HI (match_operand:MVE_2 1 "s_register_operand" "w")
+		    (match_operand:<V_elem> 2 "s_register_operand" "r")))
   ]
   "TARGET_HAVE_MVE"
-  "vcmp.i%#<V_sz_elem>  ne, %q1, %q2"
+  "vcmp.<mve_cmp_type>%#<V_sz_elem>	<mve_cmp_op>, %q1, %2"
   [(set_attr "type" "mve_move")
 ])
 
@@ -1005,231 +1018,6 @@ (define_expand "cadd<rot><mode>3"
 )
 
 ;;
-;; [vcmpcsq_n_])
-;;
-(define_insn "mve_vcmpcsq_n_<mode>"
-  [
-   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
-	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
-		    (match_operand:<V_elem> 2 "s_register_operand" "r")]
-	 VCMPCSQ_N_U))
-  ]
-  "TARGET_HAVE_MVE"
-  "vcmp.u%#<V_sz_elem>	cs, %q1, %2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmpcsq_])
-;;
-(define_insn "mve_vcmpcsq_<mode>"
-  [
-   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
-	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
-		    (match_operand:MVE_2 2 "s_register_operand" "w")]
-	 VCMPCSQ_U))
-  ]
-  "TARGET_HAVE_MVE"
-  "vcmp.u%#<V_sz_elem>	cs, %q1, %q2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmpeqq_n_])
-;;
-(define_insn "mve_vcmpeqq_n_<mode>"
-  [
-   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
-	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
-		    (match_operand:<V_elem> 2 "s_register_operand" "r")]
-	 VCMPEQQ_N))
-  ]
-  "TARGET_HAVE_MVE"
-  "vcmp.i%#<V_sz_elem>	eq, %q1, %2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmpeqq_])
-;;
-(define_insn "mve_vcmpeqq_<mode>"
-  [
-   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
-	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
-		    (match_operand:MVE_2 2 "s_register_operand" "w")]
-	 VCMPEQQ))
-  ]
-  "TARGET_HAVE_MVE"
-  "vcmp.i%#<V_sz_elem>	eq, %q1, %q2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmpgeq_n_])
-;;
-(define_insn "mve_vcmpgeq_n_<mode>"
-  [
-   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
-	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
-		    (match_operand:<V_elem> 2 "s_register_operand" "r")]
-	 VCMPGEQ_N_S))
-  ]
-  "TARGET_HAVE_MVE"
-  "vcmp.s%#<V_sz_elem>	ge, %q1, %2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmpgeq_])
-;;
-(define_insn "mve_vcmpgeq_<mode>"
-  [
-   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
-	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
-		    (match_operand:MVE_2 2 "s_register_operand" "w")]
-	 VCMPGEQ_S))
-  ]
-  "TARGET_HAVE_MVE"
-  "vcmp.s%#<V_sz_elem>	ge, %q1, %q2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmpgtq_n_])
-;;
-(define_insn "mve_vcmpgtq_n_<mode>"
-  [
-   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
-	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
-		    (match_operand:<V_elem> 2 "s_register_operand" "r")]
-	 VCMPGTQ_N_S))
-  ]
-  "TARGET_HAVE_MVE"
-  "vcmp.s%#<V_sz_elem>	gt, %q1, %2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmpgtq_])
-;;
-(define_insn "mve_vcmpgtq_<mode>"
-  [
-   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
-	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
-		    (match_operand:MVE_2 2 "s_register_operand" "w")]
-	 VCMPGTQ_S))
-  ]
-  "TARGET_HAVE_MVE"
-  "vcmp.s%#<V_sz_elem>	gt, %q1, %q2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmphiq_n_])
-;;
-(define_insn "mve_vcmphiq_n_<mode>"
-  [
-   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
-	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
-		    (match_operand:<V_elem> 2 "s_register_operand" "r")]
-	 VCMPHIQ_N_U))
-  ]
-  "TARGET_HAVE_MVE"
-  "vcmp.u%#<V_sz_elem>	hi, %q1, %2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmphiq_])
-;;
-(define_insn "mve_vcmphiq_<mode>"
-  [
-   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
-	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
-		    (match_operand:MVE_2 2 "s_register_operand" "w")]
-	 VCMPHIQ_U))
-  ]
-  "TARGET_HAVE_MVE"
-  "vcmp.u%#<V_sz_elem>	hi, %q1, %q2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmpleq_n_])
-;;
-(define_insn "mve_vcmpleq_n_<mode>"
-  [
-   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
-	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
-		    (match_operand:<V_elem> 2 "s_register_operand" "r")]
-	 VCMPLEQ_N_S))
-  ]
-  "TARGET_HAVE_MVE"
-  "vcmp.s%#<V_sz_elem>	le, %q1, %2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmpleq_])
-;;
-(define_insn "mve_vcmpleq_<mode>"
-  [
-   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
-	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
-		    (match_operand:MVE_2 2 "s_register_operand" "w")]
-	 VCMPLEQ_S))
-  ]
-  "TARGET_HAVE_MVE"
-  "vcmp.s%#<V_sz_elem>	le, %q1, %q2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmpltq_n_])
-;;
-(define_insn "mve_vcmpltq_n_<mode>"
-  [
-   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
-	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
-		    (match_operand:<V_elem> 2 "s_register_operand" "r")]
-	 VCMPLTQ_N_S))
-  ]
-  "TARGET_HAVE_MVE"
-  "vcmp.s%#<V_sz_elem>	lt, %q1, %2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmpltq_])
-;;
-(define_insn "mve_vcmpltq_<mode>"
-  [
-   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
-	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
-		    (match_operand:MVE_2 2 "s_register_operand" "w")]
-	 VCMPLTQ_S))
-  ]
-  "TARGET_HAVE_MVE"
-  "vcmp.s%#<V_sz_elem>	lt, %q1, %q2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmpneq_n_])
-;;
-(define_insn "mve_vcmpneq_n_<mode>"
-  [
-   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
-	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
-		    (match_operand:<V_elem> 2 "s_register_operand" "r")]
-	 VCMPNEQ_N))
-  ]
-  "TARGET_HAVE_MVE"
-  "vcmp.i%#<V_sz_elem>	ne, %q1, %2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
 ;; [veorq_u, veorq_s])
 ;;
 (define_insn "mve_veorq_u<mode>"
-- 
2.7.4


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 5/9] arm: MVE: Factorize vcmp_*f*
  2021-04-30 14:09 [PATCH 1/9] arm: MVE: Convert vcmp[eq|ne]* in arm_mve.h to use only 's' builtin version Christophe Lyon
                   ` (2 preceding siblings ...)
  2021-04-30 14:09 ` [PATCH 4/9] arm: MVE: Factorize all vcmp* integer patterns Christophe Lyon
@ 2021-04-30 14:09 ` Christophe Lyon
  2021-05-10 11:59   ` Kyrylo Tkachov
  2021-04-30 14:09 ` [PATCH 6/9] arm: Auto-vectorization for MVE: vcmp Christophe Lyon
                   ` (5 subsequent siblings)
  9 siblings, 1 reply; 35+ messages in thread
From: Christophe Lyon @ 2021-04-30 14:09 UTC (permalink / raw)
  To: gcc-patches

As in the previous patch, we factorize the vcmp_*f* patterns to make
maintenance easier.

2021-03-12  Christophe Lyon  <christophe.lyon@linaro.org>

	gcc/
	* config/arm/iterators.md (MVE_FP_COMPARISONS): New.
	* config/arm/mve.md (mve_vcmp<mve_cmp_op>q_f<mode>)
	(mve_vcmp<mve_cmp_op>q_n_f<mode>): New, merge all vcmp_*f*
	patterns.
	(mve_vcmpeqq_f<mode>, mve_vcmpeqq_n_f<mode>, mve_vcmpgeq_f<mode>)
	(mve_vcmpgeq_n_f<mode>, mve_vcmpgtq_f<mode>)
	(mve_vcmpgtq_n_f<mode>, mve_vcmpleq_f<mode>)
	(mve_vcmpleq_n_f<mode>, mve_vcmpltq_f<mode>)
	(mve_vcmpltq_n_f<mode>, mve_vcmpneq_f<mode>)
	(mve_vcmpneq_n_f<mode>): Remove.
	* config/arm/unspecs.md (VCMPEQQ_F, VCMPEQQ_N_F, VCMPGEQ_F)
	(VCMPGEQ_N_F, VCMPGTQ_F, VCMPGTQ_N_F, VCMPLEQ_F, VCMPLEQ_N_F)
	(VCMPLTQ_F, VCMPLTQ_N_F, VCMPNEQ_F, VCMPNEQ_N_F): Remove.
---
 gcc/config/arm/iterators.md |   1 +
 gcc/config/arm/mve.md       | 172 +++-----------------------------------------
 gcc/config/arm/unspecs.md   |  12 ----
 3 files changed, 11 insertions(+), 174 deletions(-)

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 29347f7..95df8bd 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -287,6 +287,7 @@ (define_code_iterator GTUGEU [gtu geu])
 (define_code_iterator COMPARISONS [eq gt ge le lt])
 ;; Comparisons for MVE
 (define_code_iterator MVE_COMPARISONS [eq ge geu gt gtu le lt ne])
+(define_code_iterator MVE_FP_COMPARISONS [eq ge gt le lt ne])
 
 ;; A list of ...
 (define_code_iterator IOR_XOR [ior xor])
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 40baff7..7c846a4 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -1926,182 +1926,30 @@ (define_insn "mve_vcaddq<mve_rot><mode>"
 ])
 
 ;;
-;; [vcmpeqq_f])
+;; [vcmpeqq_f, vcmpgeq_f, vcmpgtq_f, vcmpleq_f, vcmpltq_f, vcmpneq_f])
 ;;
-(define_insn "mve_vcmpeqq_f<mode>"
+(define_insn "mve_vcmp<mve_cmp_op>q_f<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
-	(unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w")
-		    (match_operand:MVE_0 2 "s_register_operand" "w")]
-	 VCMPEQQ_F))
-  ]
-  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcmp.f%#<V_sz_elem>	eq, %q1, %q2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmpeqq_n_f])
-;;
-(define_insn "mve_vcmpeqq_n_f<mode>"
-  [
-   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
-	(unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w")
-		    (match_operand:<V_elem> 2 "s_register_operand" "r")]
-	 VCMPEQQ_N_F))
-  ]
-  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcmp.f%#<V_sz_elem>	eq, %q1, %2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmpgeq_f])
-;;
-(define_insn "mve_vcmpgeq_f<mode>"
-  [
-   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
-	(unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w")
-		    (match_operand:MVE_0 2 "s_register_operand" "w")]
-	 VCMPGEQ_F))
-  ]
-  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcmp.f%#<V_sz_elem>	ge, %q1, %q2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmpgeq_n_f])
-;;
-(define_insn "mve_vcmpgeq_n_f<mode>"
-  [
-   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
-	(unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w")
-		    (match_operand:<V_elem> 2 "s_register_operand" "r")]
-	 VCMPGEQ_N_F))
-  ]
-  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcmp.f%#<V_sz_elem>	ge, %q1, %2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmpgtq_f])
-;;
-(define_insn "mve_vcmpgtq_f<mode>"
-  [
-   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
-	(unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w")
-		    (match_operand:MVE_0 2 "s_register_operand" "w")]
-	 VCMPGTQ_F))
-  ]
-  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcmp.f%#<V_sz_elem>	gt, %q1, %q2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmpgtq_n_f])
-;;
-(define_insn "mve_vcmpgtq_n_f<mode>"
-  [
-   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
-	(unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w")
-		    (match_operand:<V_elem> 2 "s_register_operand" "r")]
-	 VCMPGTQ_N_F))
+	(MVE_FP_COMPARISONS:HI (match_operand:MVE_0 1 "s_register_operand" "w")
+			       (match_operand:MVE_0 2 "s_register_operand" "w")))
   ]
   "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcmp.f%#<V_sz_elem>	gt, %q1, %2"
+  "vcmp.f%#<V_sz_elem>	<mve_cmp_op>, %q1, %q2"
   [(set_attr "type" "mve_move")
 ])
 
 ;;
-;; [vcmpleq_f])
+;; [vcmpeqq_n_f, vcmpgeq_n_f, vcmpgtq_n_f, vcmpleq_n_f, vcmpltq_n_f, vcmpneq_n_f])
 ;;
-(define_insn "mve_vcmpleq_f<mode>"
+(define_insn "mve_vcmp<mve_cmp_op>q_n_f<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
-	(unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w")
-		    (match_operand:MVE_0 2 "s_register_operand" "w")]
-	 VCMPLEQ_F))
-  ]
-  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcmp.f%#<V_sz_elem>	le, %q1, %q2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmpleq_n_f])
-;;
-(define_insn "mve_vcmpleq_n_f<mode>"
-  [
-   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
-	(unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w")
-		    (match_operand:<V_elem> 2 "s_register_operand" "r")]
-	 VCMPLEQ_N_F))
-  ]
-  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcmp.f%#<V_sz_elem>	le, %q1, %2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmpltq_f])
-;;
-(define_insn "mve_vcmpltq_f<mode>"
-  [
-   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
-	(unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w")
-		    (match_operand:MVE_0 2 "s_register_operand" "w")]
-	 VCMPLTQ_F))
-  ]
-  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcmp.f%#<V_sz_elem>	lt, %q1, %q2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmpltq_n_f])
-;;
-(define_insn "mve_vcmpltq_n_f<mode>"
-  [
-   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
-	(unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w")
-		    (match_operand:<V_elem> 2 "s_register_operand" "r")]
-	 VCMPLTQ_N_F))
-  ]
-  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcmp.f%#<V_sz_elem>	lt, %q1, %2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmpneq_f])
-;;
-(define_insn "mve_vcmpneq_f<mode>"
-  [
-   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
-	(unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w")
-		    (match_operand:MVE_0 2 "s_register_operand" "w")]
-	 VCMPNEQ_F))
-  ]
-  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcmp.f%#<V_sz_elem>	ne, %q1, %q2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmpneq_n_f])
-;;
-(define_insn "mve_vcmpneq_n_f<mode>"
-  [
-   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
-	(unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w")
-		    (match_operand:<V_elem> 2 "s_register_operand" "r")]
-	 VCMPNEQ_N_F))
+	(MVE_FP_COMPARISONS:HI (match_operand:MVE_0 1 "s_register_operand" "w")
+			       (match_operand:<V_elem> 2 "s_register_operand" "r")))
   ]
   "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcmp.f%#<V_sz_elem>	ne, %q1, %2"
+  "vcmp.f%#<V_sz_elem>	<mve_cmp_op>, %q1, %2"
   [(set_attr "type" "mve_move")
 ])
 
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index 4d47ab7..07ca53b 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -710,18 +710,6 @@ (define_c_enum "unspec" [
   VABDQ_M_U
   VABDQ_F
   VADDQ_N_F
-  VCMPEQQ_F
-  VCMPEQQ_N_F
-  VCMPGEQ_F
-  VCMPGEQ_N_F
-  VCMPGTQ_F
-  VCMPGTQ_N_F
-  VCMPLEQ_F
-  VCMPLEQ_N_F
-  VCMPLTQ_F
-  VCMPLTQ_N_F
-  VCMPNEQ_F
-  VCMPNEQ_N_F
   VMAXNMAQ_F
   VMAXNMAVQ_F
   VMAXNMQ_F
-- 
2.7.4


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 6/9] arm: Auto-vectorization for MVE: vcmp
  2021-04-30 14:09 [PATCH 1/9] arm: MVE: Convert vcmp[eq|ne]* in arm_mve.h to use only 's' builtin version Christophe Lyon
                   ` (3 preceding siblings ...)
  2021-04-30 14:09 ` [PATCH 5/9] arm: MVE: Factorize vcmp_*f* Christophe Lyon
@ 2021-04-30 14:09 ` Christophe Lyon
  2021-05-04 11:29   ` Andre Vieira (lists)
  2021-04-30 14:09 ` [PATCH 7/9] arm: Auto-vectorization for MVE: add __fp16 support to VCMP Christophe Lyon
                   ` (4 subsequent siblings)
  9 siblings, 1 reply; 35+ messages in thread
From: Christophe Lyon @ 2021-04-30 14:09 UTC (permalink / raw)
  To: gcc-patches

Since MVE has a different set of vector comparison operators from
Neon, we have to update the expansion to take the new ones into
account: for instance, MVE supports 'NE' directly and does not need to
use 'EQ' with the inverted condition.

Conversely, Neon supports comparisons with #0, MVE does not.

For:
typedef long int vs32 __attribute__((vector_size(16)));
vs32 cmp_eq_vs32_reg (vs32 a, vs32 b) { return a == b; }

we now generate:
cmp_eq_vs32_reg:
	vldr.64 d4, .L123       @ 8     [c=8 l=4]  *mve_movv4si/8
	vldr.64 d5, .L123+8
	vldr.64 d6, .L123+16    @ 9     [c=8 l=4]  *mve_movv4si/8
	vldr.64 d7, .L123+24
	vcmp.i32  eq, q0, q1    @ 7     [c=16 l=4]  mve_vcmpeqq_v4si
	vpsel q0, q3, q2        @ 15    [c=8 l=4]  mve_vpselq_sv4si
	bx      lr      @ 26    [c=8 l=4]  *thumb2_return
.L124:
	.align  3
.L123:
	.word   0
	.word   0
	.word   0
	.word   0
	.word   1
	.word   1
	.word   1
	.word   1

For some reason emit_move_insn (zero, CONST0_RTX (cmp_mode)) produces
a pair of vldr instead of vmov.i32 qX, #0.

2021-03-01  Christophe Lyon  <christophe.lyon@linaro.org>

	gcc/
	* config/arm/arm-protos.h (arm_expand_vector_compare): Update
	prototype.
	* config/arm/arm.c (arm_expand_vector_compare): Add support for
	MVE.
	(arm_expand_vcond): Likewise.
	* config/arm/iterators.md (supf): Remove VCMPNEQ_S, VCMPEQQ_S,
	VCMPEQQ_N_S, VCMPNEQ_N_S.
	(VCMPNEQ, VCMPEQQ, VCMPEQQ_N, VCMPNEQ_N): Remove.
	* config/arm/mve.md (@mve_vcmp<mve_cmp_op>q_<mode>): Add '@' prefix.
	(@mve_vcmp<mve_cmp_op>q_f<mode>): Likewise.
	(@mve_vcmp<mve_cmp_op>q_n_f<mode>): Likewise.
	(@mve_vpselq_<supf><mode>): Likewise.
	(@mve_vpselq_f<mode>): Likewise.
	* config/arm/neon.md (vec_cmp<mode><v_cmp_result>): Enable for MVE
	and move to vec-common.md.
	(vec_cmpu<mode><mode>): Likewise.
	(vcond<mode><mode>): Likewise.
	(vcond<V_cvtto><mode>): Likewise.
	(vcondu<mode><v_cmp_result>): Likewise.
	(vcond_mask_<mode><v_cmp_result>): Likewise.
	* config/arm/unspecs.md (VCMPNEQ_U, VCMPNEQ_S, VCMPEQQ_S)
	(VCMPEQQ_N_S, VCMPNEQ_N_S, VCMPEQQ_U, VCMPEQQ_N_U, VCMPNEQ_N_U)
	(VCMPGEQ_N_S, VCMPGEQ_S, VCMPGTQ_N_S, VCMPGTQ_S, VCMPLEQ_N_S)
	(VCMPLEQ_S, VCMPLTQ_N_S, VCMPLTQ_S, VCMPCSQ_N_U, VCMPCSQ_U)
	(VCMPHIQ_N_U, VCMPHIQ_U): Remove.
	* config/arm/vec-common.md (vec_cmp<mode><v_cmp_result>): Moved
	from neon.md.
	(vec_cmpu<mode><mode>): Likewise.
	(vcond<mode><mode>): Likewise.
	(vcond<V_cvtto><mode>): Likewise.
	(vcondu<mode><v_cmp_result>): Likewise.
	(vcond_mask_<mode><v_cmp_result>): Likewise.

	gcc/testsuite
	* gcc.target/arm/simd/mve-compare-1.c: New test with GCC vectors.
	* gcc.target/arm/simd/mve-compare-2.c: New test with GCC vectors.
	* gcc.target/arm/simd/mve-compare-scalar-1.c: New test with GCC
	vectors.
	* gcc.target/arm/simd/mve-vcmp-f32.c: New test for
	auto-vectorization.
	* gcc.target/arm/simd/mve-vcmp.c: New test for auto-vectorization.

add gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c
---
 gcc/config/arm/arm-protos.h                        |   2 +-
 gcc/config/arm/arm.c                               | 211 ++++++++++++++++-----
 gcc/config/arm/iterators.md                        |   9 +-
 gcc/config/arm/mve.md                              |  10 +-
 gcc/config/arm/neon.md                             |  87 ---------
 gcc/config/arm/unspecs.md                          |  20 --
 gcc/config/arm/vec-common.md                       | 107 +++++++++++
 gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c  |  80 ++++++++
 gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c  |  38 ++++
 .../gcc.target/arm/simd/mve-compare-scalar-1.c     |  69 +++++++
 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c   |  30 +++
 gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c       |  50 +++++
 12 files changed, 547 insertions(+), 166 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 2521541..ffccaa7 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -373,7 +373,7 @@ extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
 extern bool arm_fusion_enabled_p (tune_params::fuse_ops);
 extern bool arm_valid_symbolic_address_p (rtx);
 extern bool arm_validize_comparison (rtx *, rtx *, rtx *);
-extern bool arm_expand_vector_compare (rtx, rtx_code, rtx, rtx, bool);
+extern bool arm_expand_vector_compare (rtx, rtx_code, rtx, rtx, bool, bool);
 #endif /* RTX_CODE */
 
 extern bool arm_gen_setmem (rtx *);
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 0371d98..80e28ef 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -30933,66 +30933,114 @@ arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
    and return true if TARGET contains the inverse.  If !CAN_INVERT,
    always store the result in TARGET, never its inverse.
 
+   If VCOND_MVE, do not emit the vpsel instruction here, let arm_expand_vcond do
+   it with the right destination type to avoid emitting two vpsel, one here and
+   one in arm_expand_vcond.
+
    Note that the handling of floating-point comparisons is not
    IEEE compliant.  */
 
 bool
 arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
-			   bool can_invert)
+			   bool can_invert, bool vcond_mve)
 {
   machine_mode cmp_result_mode = GET_MODE (target);
   machine_mode cmp_mode = GET_MODE (op0);
 
   bool inverted;
-  switch (code)
-    {
-    /* For these we need to compute the inverse of the requested
-       comparison.  */
-    case UNORDERED:
-    case UNLT:
-    case UNLE:
-    case UNGT:
-    case UNGE:
-    case UNEQ:
-    case NE:
-      code = reverse_condition_maybe_unordered (code);
-      if (!can_invert)
-	{
-	  /* Recursively emit the inverted comparison into a temporary
-	     and then store its inverse in TARGET.  This avoids reusing
-	     TARGET (which for integer NE could be one of the inputs).  */
-	  rtx tmp = gen_reg_rtx (cmp_result_mode);
-	  if (arm_expand_vector_compare (tmp, code, op0, op1, true))
-	    gcc_unreachable ();
-	  emit_insn (gen_rtx_SET (target, gen_rtx_NOT (cmp_result_mode, tmp)));
-	  return false;
-	}
-      inverted = true;
-      break;
 
-    default:
+  /* MVE supports more comparisons than Neon.  */
+  if (TARGET_HAVE_MVE)
       inverted = false;
-      break;
-    }
+  else
+    switch (code)
+      {
+	/* For these we need to compute the inverse of the requested
+	   comparison.  */
+      case UNORDERED:
+      case UNLT:
+      case UNLE:
+      case UNGT:
+      case UNGE:
+      case UNEQ:
+      case NE:
+	code = reverse_condition_maybe_unordered (code);
+	if (!can_invert)
+	  {
+	    /* Recursively emit the inverted comparison into a temporary
+	       and then store its inverse in TARGET.  This avoids reusing
+	       TARGET (which for integer NE could be one of the inputs).  */
+	    rtx tmp = gen_reg_rtx (cmp_result_mode);
+	    if (arm_expand_vector_compare (tmp, code, op0, op1, true, vcond_mve))
+	      gcc_unreachable ();
+	    emit_insn (gen_rtx_SET (target, gen_rtx_NOT (cmp_result_mode, tmp)));
+	    return false;
+	  }
+	inverted = true;
+	break;
+
+      default:
+	inverted = false;
+	break;
+      }
 
   switch (code)
     {
-    /* These are natively supported for zero comparisons, but otherwise
-       require the operands to be swapped.  */
+    /* These are natively supported by Neon for zero comparisons, but otherwise
+       require the operands to be swapped.  For MVE, we can only compare
+       registers.  */
     case LE:
     case LT:
-      if (op1 != CONST0_RTX (cmp_mode))
-	{
-	  code = swap_condition (code);
-	  std::swap (op0, op1);
-	}
+      if (!TARGET_HAVE_MVE)
+	if (op1 != CONST0_RTX (cmp_mode))
+	  {
+	    code = swap_condition (code);
+	    std::swap (op0, op1);
+	  }
       /* Fall through.  */
 
-    /* These are natively supported for both register and zero operands.  */
+    /* These are natively supported by Neon for both register and zero
+       operands.  MVE supports registers only.  */
     case EQ:
     case GE:
     case GT:
-      emit_insn (gen_neon_vc (code, cmp_mode, target, op0, op1));
+    case NE:
+      if (TARGET_HAVE_MVE) {
+	rtx vpr_p0;
+	if (vcond_mve)
+	  vpr_p0 = target;
+	else
+	  vpr_p0 = gen_reg_rtx (HImode);
+
+	switch (cmp_mode)
+	  {
+	  case E_V16QImode:
+	  case E_V8HImode:
+	  case E_V4SImode:
+	    emit_insn (gen_mve_vcmpq (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1)));
+	    break;
+	  case E_V8HFmode:
+	  case E_V4SFmode:
+	    if (TARGET_HAVE_MVE_FLOAT)
+	      emit_insn (gen_mve_vcmpq_f (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1)));
+	    else
+	      gcc_unreachable ();
+	    break;
+	  default:
+	    gcc_unreachable ();
+	  }
+
+	/* If we are not expanding a vcond, build the result here.  */
+	if (!vcond_mve) {
+	  rtx zero = gen_reg_rtx (cmp_result_mode);
+	  rtx one = gen_reg_rtx (cmp_result_mode);
+	  emit_move_insn (zero, CONST0_RTX (cmp_result_mode));
+	  emit_move_insn (one, CONST1_RTX (cmp_result_mode));
+	  emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0));
+	}
+      }
+      else
+	emit_insn (gen_neon_vc (code, cmp_mode, target, op0, op1));
       return inverted;
 
     /* These are natively supported for register operands only.
@@ -31000,16 +31048,50 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
        or canonicalized by target-independent code.  */
     case GEU:
     case GTU:
-      emit_insn (gen_neon_vc (code, cmp_mode, target,
-			      op0, force_reg (cmp_mode, op1)));
+      if (TARGET_HAVE_MVE) {
+	rtx vpr_p0;
+	if (vcond_mve)
+	  vpr_p0 = target;
+	else
+	  vpr_p0 = gen_reg_rtx (HImode);
+
+	emit_insn (gen_mve_vcmpq (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1)));
+	if (!vcond_mve) {
+	  rtx zero = gen_reg_rtx (cmp_result_mode);
+	  rtx one = gen_reg_rtx (cmp_result_mode);
+	  emit_move_insn (zero, CONST0_RTX (cmp_result_mode));
+	  emit_move_insn (one, CONST1_RTX (cmp_result_mode));
+	  emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0));
+	}
+      }
+       else
+	emit_insn (gen_neon_vc (code, cmp_mode, target,
+				op0, force_reg (cmp_mode, op1)));
       return inverted;
 
     /* These require the operands to be swapped and likewise do not
        support comparisons with zero.  */
     case LEU:
     case LTU:
-      emit_insn (gen_neon_vc (swap_condition (code), cmp_mode,
-			      target, force_reg (cmp_mode, op1), op0));
+      if (TARGET_HAVE_MVE) {
+	rtx vpr_p0;
+	if (vcond_mve)
+	  vpr_p0 = target;
+	else
+	  vpr_p0 = gen_reg_rtx (HImode);
+
+	emit_insn (gen_mve_vcmpq (swap_condition (code), cmp_mode, vpr_p0, force_reg (cmp_mode, op1), op0));
+	if (!vcond_mve) {
+	  rtx zero = gen_reg_rtx (cmp_result_mode);
+	  rtx one = gen_reg_rtx (cmp_result_mode);
+	  emit_move_insn (zero, CONST0_RTX (cmp_result_mode));
+	  emit_move_insn (one, CONST1_RTX (cmp_result_mode));
+	  emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0));
+	}
+      }
+      else
+	emit_insn (gen_neon_vc (swap_condition (code), cmp_mode,
+				target, force_reg (cmp_mode, op1), op0));
       return inverted;
 
     /* These need a combination of two comparisons.  */
@@ -31021,8 +31103,8 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
 	rtx gt_res = gen_reg_rtx (cmp_result_mode);
 	rtx alt_res = gen_reg_rtx (cmp_result_mode);
 	rtx_code alt_code = (code == LTGT ? LT : LE);
-	if (arm_expand_vector_compare (gt_res, GT, op0, op1, true)
-	    || arm_expand_vector_compare (alt_res, alt_code, op0, op1, true))
+	if (arm_expand_vector_compare (gt_res, GT, op0, op1, true, vcond_mve)
+	    || arm_expand_vector_compare (alt_res, alt_code, op0, op1, true, vcond_mve))
 	  gcc_unreachable ();
 	emit_insn (gen_rtx_SET (target, gen_rtx_IOR (cmp_result_mode,
 						     gt_res, alt_res)));
@@ -31040,13 +31122,50 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
 void
 arm_expand_vcond (rtx *operands, machine_mode cmp_result_mode)
 {
-  rtx mask = gen_reg_rtx (cmp_result_mode);
+  /* When expanding for MVE, we do not want to emit a (useless) vpsel in
+     arm_expand_vector_compare, and another one here.  */
+  bool vcond_mve=false;
+  rtx mask;
+
+  if (TARGET_HAVE_MVE)
+    {
+      vcond_mve=true;
+      mask = gen_reg_rtx (HImode);
+    }
+  else
+    mask = gen_reg_rtx (cmp_result_mode);
+
   bool inverted = arm_expand_vector_compare (mask, GET_CODE (operands[3]),
-					     operands[4], operands[5], true);
+					     operands[4], operands[5], true, vcond_mve);
   if (inverted)
     std::swap (operands[1], operands[2]);
+  if (TARGET_NEON)
   emit_insn (gen_neon_vbsl (GET_MODE (operands[0]), operands[0],
 			    mask, operands[1], operands[2]));
+  else
+    {
+      machine_mode cmp_mode = GET_MODE (operands[4]);
+      rtx vpr_p0 = mask;
+      rtx zero = gen_reg_rtx (cmp_mode);
+      rtx one = gen_reg_rtx (cmp_mode);
+      emit_move_insn (zero, CONST0_RTX (cmp_mode));
+      emit_move_insn (one, CONST1_RTX (cmp_mode));
+      switch (cmp_mode)
+	{
+	case E_V16QImode:
+	case E_V8HImode:
+	case E_V4SImode:
+	  emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, operands[0], one, zero, vpr_p0));
+	  break;
+	case E_V8HFmode:
+	case E_V4SFmode:
+	  if (TARGET_HAVE_MVE_FLOAT)
+	    emit_insn (gen_mve_vpselq_f (cmp_mode, operands[0], one, zero, vpr_p0));
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+    }
 }
 \f
 #define MAX_VECT_LEN 16
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 95df8bd..a128465 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -1288,12 +1288,11 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U "u") (VREV16Q_S "s")
 		       (VCREATEQ_U "u") (VCREATEQ_S "s") (VSHRQ_N_S "s")
 		       (VSHRQ_N_U "u") (VCVTQ_N_FROM_F_S "s") (VSHLQ_U "u")
 		       (VCVTQ_N_FROM_F_U "u") (VADDLVQ_P_S "s") (VSHLQ_S "s")
-		       (VADDLVQ_P_U "u") (VCMPNEQ_S "s")
+		       (VADDLVQ_P_U "u")
 		       (VABDQ_M_S "s") (VABDQ_M_U "u") (VABDQ_S "s")
 		       (VABDQ_U "u") (VADDQ_N_S "s") (VADDQ_N_U "u")
 		       (VADDVQ_P_S "s")	(VADDVQ_P_U "u") (VBRSRQ_N_S "s")
-		       (VBRSRQ_N_U "u") (VCMPEQQ_S "s")
-		       (VCMPEQQ_N_S "s") (VCMPNEQ_N_S "s")
+		       (VBRSRQ_N_U "u")
 		       (VHADDQ_N_S "s") (VHADDQ_N_U "u") (VHADDQ_S "s")
 		       (VHADDQ_U "u") (VHSUBQ_N_S "s")	(VHSUBQ_N_U "u")
 		       (VHSUBQ_S "s") (VMAXQ_S "s") (VMAXQ_U "u") (VHSUBQ_U "u")
@@ -1549,16 +1548,12 @@ (define_int_iterator VCREATEQ [VCREATEQ_U VCREATEQ_S])
 (define_int_iterator VSHRQ_N [VSHRQ_N_S VSHRQ_N_U])
 (define_int_iterator VCVTQ_N_FROM_F [VCVTQ_N_FROM_F_S VCVTQ_N_FROM_F_U])
 (define_int_iterator VADDLVQ_P [VADDLVQ_P_S VADDLVQ_P_U])
-(define_int_iterator VCMPNEQ [VCMPNEQ_S])
 (define_int_iterator VSHLQ [VSHLQ_S VSHLQ_U])
 (define_int_iterator VABDQ [VABDQ_S VABDQ_U])
 (define_int_iterator VADDQ_N [VADDQ_N_S VADDQ_N_U])
 (define_int_iterator VADDVAQ [VADDVAQ_S VADDVAQ_U])
 (define_int_iterator VADDVQ_P [VADDVQ_P_U VADDVQ_P_S])
 (define_int_iterator VBRSRQ_N [VBRSRQ_N_U VBRSRQ_N_S])
-(define_int_iterator VCMPEQQ [VCMPEQQ_S])
-(define_int_iterator VCMPEQQ_N [VCMPEQQ_N_S])
-(define_int_iterator VCMPNEQ_N [VCMPNEQ_N_S])
 (define_int_iterator VHADDQ [VHADDQ_S VHADDQ_U])
 (define_int_iterator VHADDQ_N [VHADDQ_N_U VHADDQ_N_S])
 (define_int_iterator VHSUBQ [VHSUBQ_S VHSUBQ_U])
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 7c846a4..97f0a87 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -838,7 +838,7 @@ (define_insn "mve_vaddlvq_p_<supf>v4si"
 ;;
 ;; [vcmpneq_, vcmpcsq_, vcmpeqq_, vcmpgeq_, vcmpgtq_, vcmphiq_, vcmpleq_, vcmpltq_])
 ;;
-(define_insn "mve_vcmp<mve_cmp_op>q_<mode>"
+(define_insn "@mve_vcmp<mve_cmp_op>q_<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
 	(MVE_COMPARISONS:HI (match_operand:MVE_2 1 "s_register_operand" "w")
@@ -1928,7 +1928,7 @@ (define_insn "mve_vcaddq<mve_rot><mode>"
 ;;
 ;; [vcmpeqq_f, vcmpgeq_f, vcmpgtq_f, vcmpleq_f, vcmpltq_f, vcmpneq_f])
 ;;
-(define_insn "mve_vcmp<mve_cmp_op>q_f<mode>"
+(define_insn "@mve_vcmp<mve_cmp_op>q_f<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
 	(MVE_FP_COMPARISONS:HI (match_operand:MVE_0 1 "s_register_operand" "w")
@@ -1942,7 +1942,7 @@ (define_insn "mve_vcmp<mve_cmp_op>q_f<mode>"
 ;;
 ;; [vcmpeqq_n_f, vcmpgeq_n_f, vcmpgtq_n_f, vcmpleq_n_f, vcmpltq_n_f, vcmpneq_n_f])
 ;;
-(define_insn "mve_vcmp<mve_cmp_op>q_n_f<mode>"
+(define_insn "@mve_vcmp<mve_cmp_op>q_n_f<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
 	(MVE_FP_COMPARISONS:HI (match_operand:MVE_0 1 "s_register_operand" "w")
@@ -3307,7 +3307,7 @@ (define_insn "mve_vnegq_m_s<mode>"
 ;;
 ;; [vpselq_u, vpselq_s])
 ;;
-(define_insn "mve_vpselq_<supf><mode>"
+(define_insn "@mve_vpselq_<supf><mode>"
   [
    (set (match_operand:MVE_1 0 "s_register_operand" "=w")
 	(unspec:MVE_1 [(match_operand:MVE_1 1 "s_register_operand" "w")
@@ -4402,7 +4402,7 @@ (define_insn "mve_vorrq_m_n_<supf><mode>"
 ;;
 ;; [vpselq_f])
 ;;
-(define_insn "mve_vpselq_f<mode>"
+(define_insn "@mve_vpselq_f<mode>"
   [
    (set (match_operand:MVE_0 0 "s_register_operand" "=w")
 	(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w")
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index fec2cc9..6660846 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -1416,93 +1416,6 @@ (define_insn "*us_sub<mode>_neon"
   [(set_attr "type" "neon_qsub<q>")]
 )
 
-(define_expand "vec_cmp<mode><v_cmp_result>"
-  [(set (match_operand:<V_cmp_result> 0 "s_register_operand")
-	(match_operator:<V_cmp_result> 1 "comparison_operator"
-	  [(match_operand:VDQW 2 "s_register_operand")
-	   (match_operand:VDQW 3 "reg_or_zero_operand")]))]
-  "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
-{
-  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
-			     operands[2], operands[3], false);
-  DONE;
-})
-
-(define_expand "vec_cmpu<mode><mode>"
-  [(set (match_operand:VDQIW 0 "s_register_operand")
-	(match_operator:VDQIW 1 "comparison_operator"
-	  [(match_operand:VDQIW 2 "s_register_operand")
-	   (match_operand:VDQIW 3 "reg_or_zero_operand")]))]
-  "TARGET_NEON"
-{
-  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
-			     operands[2], operands[3], false);
-  DONE;
-})
-
-;; Conditional instructions.  These are comparisons with conditional moves for
-;; vectors.  They perform the assignment:
-;;   
-;;     Vop0 = (Vop4 <op3> Vop5) ? Vop1 : Vop2;
-;;
-;; where op3 is <, <=, ==, !=, >= or >.  Operations are performed
-;; element-wise.
-
-(define_expand "vcond<mode><mode>"
-  [(set (match_operand:VDQW 0 "s_register_operand")
-	(if_then_else:VDQW
-	  (match_operator 3 "comparison_operator"
-	    [(match_operand:VDQW 4 "s_register_operand")
-	     (match_operand:VDQW 5 "reg_or_zero_operand")])
-	  (match_operand:VDQW 1 "s_register_operand")
-	  (match_operand:VDQW 2 "s_register_operand")))]
-  "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
-{
-  arm_expand_vcond (operands, <V_cmp_result>mode);
-  DONE;
-})
-
-(define_expand "vcond<V_cvtto><mode>"
-  [(set (match_operand:<V_CVTTO> 0 "s_register_operand")
-	(if_then_else:<V_CVTTO>
-	  (match_operator 3 "comparison_operator"
-	    [(match_operand:V32 4 "s_register_operand")
-	     (match_operand:V32 5 "reg_or_zero_operand")])
-	  (match_operand:<V_CVTTO> 1 "s_register_operand")
-	  (match_operand:<V_CVTTO> 2 "s_register_operand")))]
-  "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
-{
-  arm_expand_vcond (operands, <V_cmp_result>mode);
-  DONE;
-})
-
-(define_expand "vcondu<mode><v_cmp_result>"
-  [(set (match_operand:VDQW 0 "s_register_operand")
-	(if_then_else:VDQW
-	  (match_operator 3 "arm_comparison_operator"
-	    [(match_operand:<V_cmp_result> 4 "s_register_operand")
-	     (match_operand:<V_cmp_result> 5 "reg_or_zero_operand")])
-	  (match_operand:VDQW 1 "s_register_operand")
-	  (match_operand:VDQW 2 "s_register_operand")))]
-  "TARGET_NEON"
-{
-  arm_expand_vcond (operands, <V_cmp_result>mode);
-  DONE;
-})
-
-(define_expand "vcond_mask_<mode><v_cmp_result>"
-  [(set (match_operand:VDQW 0 "s_register_operand")
-	(if_then_else:VDQW
-	  (match_operand:<V_cmp_result> 3 "s_register_operand")
-	  (match_operand:VDQW 1 "s_register_operand")
-	  (match_operand:VDQW 2 "s_register_operand")))]
-  "TARGET_NEON"
-{
-  emit_insn (gen_neon_vbsl<mode> (operands[0], operands[3], operands[1],
-				  operands[2]));
-  DONE;
-})
-
 ;; Patterns for builtins.
 
 ; good for plain vadd, vaddq.
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index 07ca53b..0778db1 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -596,8 +596,6 @@ (define_c_enum "unspec" [
   VCVTQ_N_FROM_F_U
   VADDLVQ_P_S
   VADDLVQ_P_U
-  VCMPNEQ_U
-  VCMPNEQ_S
   VSHLQ_S
   VSHLQ_U
   VABDQ_S
@@ -605,9 +603,6 @@ (define_c_enum "unspec" [
   VADDVAQ_S
   VADDVQ_P_S
   VBRSRQ_N_S
-  VCMPEQQ_S
-  VCMPEQQ_N_S
-  VCMPNEQ_N_S
   VHADDQ_S
   VHADDQ_N_S
   VHSUBQ_S
@@ -645,9 +640,6 @@ (define_c_enum "unspec" [
   VADDVAQ_U
   VADDVQ_P_U
   VBRSRQ_N_U
-  VCMPEQQ_U
-  VCMPEQQ_N_U
-  VCMPNEQ_N_U
   VHADDQ_U
   VHADDQ_N_U
   VHSUBQ_U
@@ -680,14 +672,6 @@ (define_c_enum "unspec" [
   VSHLQ_R_U
   VSUBQ_U
   VSUBQ_N_U
-  VCMPGEQ_N_S
-  VCMPGEQ_S
-  VCMPGTQ_N_S
-  VCMPGTQ_S
-  VCMPLEQ_N_S
-  VCMPLEQ_S
-  VCMPLTQ_N_S
-  VCMPLTQ_S
   VHCADDQ_ROT270_S
   VHCADDQ_ROT90_S
   VMAXAQ_S
@@ -702,10 +686,6 @@ (define_c_enum "unspec" [
   VQRDMULHQ_N_S
   VQRDMULHQ_S
   VQSHLUQ_N_S
-  VCMPCSQ_N_U
-  VCMPCSQ_U
-  VCMPHIQ_N_U
-  VCMPHIQ_U
   VABDQ_M_S
   VABDQ_M_U
   VABDQ_F
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index 0b2b3b1..034b48b 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -362,3 +362,110 @@ (define_expand "vlshr<mode>3"
       DONE;
     }
 })
+
+(define_expand "vec_cmp<mode><v_cmp_result>"
+  [(set (match_operand:<V_cmp_result> 0 "s_register_operand")
+	(match_operator:<V_cmp_result> 1 "comparison_operator"
+	  [(match_operand:VDQW 2 "s_register_operand")
+	   (match_operand:VDQW 3 "reg_or_zero_operand")]))]
+  "ARM_HAVE_<MODE>_ARITH
+   && !TARGET_REALLY_IWMMXT
+   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
+{
+  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
+			     operands[2], operands[3], false, false);
+  DONE;
+})
+
+(define_expand "vec_cmpu<mode><mode>"
+  [(set (match_operand:VDQIW 0 "s_register_operand")
+	(match_operator:VDQIW 1 "comparison_operator"
+	  [(match_operand:VDQIW 2 "s_register_operand")
+	   (match_operand:VDQIW 3 "reg_or_zero_operand")]))]
+  "ARM_HAVE_<MODE>_ARITH
+   && !TARGET_REALLY_IWMMXT"
+{
+  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
+			     operands[2], operands[3], false, false);
+  DONE;
+})
+
+;; Conditional instructions.  These are comparisons with conditional moves for
+;; vectors.  They perform the assignment:
+;;
+;;     Vop0 = (Vop4 <op3> Vop5) ? Vop1 : Vop2;
+;;
+;; where op3 is <, <=, ==, !=, >= or >.  Operations are performed
+;; element-wise.
+
+(define_expand "vcond<mode><mode>"
+  [(set (match_operand:VDQW 0 "s_register_operand")
+	(if_then_else:VDQW
+	  (match_operator 3 "comparison_operator"
+	    [(match_operand:VDQW 4 "s_register_operand")
+	     (match_operand:VDQW 5 "reg_or_zero_operand")])
+	  (match_operand:VDQW 1 "s_register_operand")
+	  (match_operand:VDQW 2 "s_register_operand")))]
+  "ARM_HAVE_<MODE>_ARITH
+   && !TARGET_REALLY_IWMMXT
+   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
+{
+  arm_expand_vcond (operands, <V_cmp_result>mode);
+  DONE;
+})
+
+(define_expand "vcond<V_cvtto><mode>"
+  [(set (match_operand:<V_CVTTO> 0 "s_register_operand")
+	(if_then_else:<V_CVTTO>
+	  (match_operator 3 "comparison_operator"
+	    [(match_operand:V32 4 "s_register_operand")
+	     (match_operand:V32 5 "reg_or_zero_operand")])
+	  (match_operand:<V_CVTTO> 1 "s_register_operand")
+	  (match_operand:<V_CVTTO> 2 "s_register_operand")))]
+  "ARM_HAVE_<MODE>_ARITH
+   && !TARGET_REALLY_IWMMXT
+   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
+{
+  arm_expand_vcond (operands, <V_cmp_result>mode);
+  DONE;
+})
+
+(define_expand "vcondu<mode><v_cmp_result>"
+  [(set (match_operand:VDQW 0 "s_register_operand")
+	(if_then_else:VDQW
+	  (match_operator 3 "arm_comparison_operator"
+	    [(match_operand:<V_cmp_result> 4 "s_register_operand")
+	     (match_operand:<V_cmp_result> 5 "reg_or_zero_operand")])
+	  (match_operand:VDQW 1 "s_register_operand")
+	  (match_operand:VDQW 2 "s_register_operand")))]
+  "ARM_HAVE_<MODE>_ARITH
+   && !TARGET_REALLY_IWMMXT"
+{
+  arm_expand_vcond (operands, <V_cmp_result>mode);
+  DONE;
+})
+
+(define_expand "vcond_mask_<mode><v_cmp_result>"
+  [(set (match_operand:VDQW 0 "s_register_operand")
+        (if_then_else:VDQW
+          (match_operand:<V_cmp_result> 3 "s_register_operand")
+          (match_operand:VDQW 1 "s_register_operand")
+          (match_operand:VDQW 2 "s_register_operand")))]
+  "ARM_HAVE_<MODE>_ARITH
+   && !TARGET_REALLY_IWMMXT"
+{
+  if (TARGET_NEON)
+    {
+      emit_insn (gen_neon_vbsl (<MODE>mode, operands[0], operands[3],
+                                operands[1], operands[2]));
+    }
+  else if (TARGET_HAVE_MVE)
+    {
+      emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0],
+                                 operands[1], operands[2], operands[3]));
+    }
+  else
+    gcc_unreachable ();
+
+  DONE;
+})
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c
new file mode 100644
index 0000000..029c931
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c
@@ -0,0 +1,80 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O3" } */
+
+/* Integer tests.  */
+
+#define COMPARE_REG(NAME, OP, TYPE) \
+  TYPE \
+  cmp_##NAME##_##TYPE##_reg (TYPE a, TYPE b) \
+  { \
+    return a OP b; \
+  }
+
+#define COMPARE_REG_AND_ZERO(NAME, OP, TYPE) \
+  COMPARE_REG (NAME, OP, TYPE) \
+  \
+  TYPE \
+  cmp_##NAME##_##TYPE##_zero (TYPE a) \
+  { \
+    return a OP (TYPE) {}; \
+  }
+
+#define COMPARE_TYPE(TYPE, COMPARE_ORDERED) \
+  COMPARE_REG_AND_ZERO (eq, ==, TYPE) \
+  COMPARE_REG_AND_ZERO (ne, !=, TYPE) \
+  COMPARE_ORDERED (lt, <, TYPE) \
+  COMPARE_ORDERED (le, <=, TYPE) \
+  COMPARE_ORDERED (gt, >, TYPE) \
+  COMPARE_ORDERED (ge, >=, TYPE)
+
+#define TEST_TYPE(NAME, ELEM, COMPARE_ORDERED, SIZE)  \
+  typedef ELEM NAME##SIZE __attribute__((vector_size(SIZE))); \
+  COMPARE_TYPE (NAME##SIZE, COMPARE_ORDERED)
+
+/* 64-bit vectors, not vectorized.  */
+TEST_TYPE (vs8, __INT8_TYPE__, COMPARE_REG_AND_ZERO, 8)
+TEST_TYPE (vu8, __UINT8_TYPE__, COMPARE_REG, 8)
+TEST_TYPE (vs16, __INT16_TYPE__, COMPARE_REG_AND_ZERO, 8)
+TEST_TYPE (vu16, __UINT16_TYPE__, COMPARE_REG, 8)
+TEST_TYPE (vs32, __INT32_TYPE__, COMPARE_REG_AND_ZERO, 8)
+TEST_TYPE (vu32, __UINT32_TYPE__, COMPARE_REG, 8)
+
+/* 128-bit vectors.  */
+TEST_TYPE (vs8, __INT8_TYPE__, COMPARE_REG_AND_ZERO, 16)
+TEST_TYPE (vu8, __UINT8_TYPE__, COMPARE_REG, 16)
+TEST_TYPE (vs16, __INT16_TYPE__, COMPARE_REG_AND_ZERO, 16)
+TEST_TYPE (vu16, __UINT16_TYPE__, COMPARE_REG, 16)
+TEST_TYPE (vs32, __INT32_TYPE__, COMPARE_REG_AND_ZERO, 16)
+TEST_TYPE (vu32, __UINT32_TYPE__, COMPARE_REG, 16)
+
+/* { 8 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
+/* { dg-final { scan-assembler-times {\tvcmp.i8  eq, q[0-9]+, q[0-9]+\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.i8  ne, q[0-9]+, q[0-9]+\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s8  lt, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s8  le, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s8  gt, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s8  ge, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.u8  hi, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.u8  cs, q[0-9]+, q[0-9]+\n} 2 } } */
+
+/* { 16 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
+/* { dg-final { scan-assembler-times {\tvcmp.i16  eq, q[0-9]+, q[0-9]+\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.i16  ne, q[0-9]+, q[0-9]+\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s16  lt, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s16  le, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s16  gt, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s16  ge, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.u16  hi, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.u16  cs, q[0-9]+, q[0-9]+\n} 2 } } */
+
+/* { 32 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
+/* { dg-final { scan-assembler-times {\tvcmp.i32  eq, q[0-9]+, q[0-9]+\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.i32  ne, q[0-9]+, q[0-9]+\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s32  lt, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s32  le, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s32  gt, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s32  ge, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.u32  hi, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.u32  cs, q[0-9]+, q[0-9]+\n} 2 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c
new file mode 100644
index 0000000..8515195
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c
@@ -0,0 +1,38 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
+
+/* float 32 tests.  */
+
+#ifndef ELEM_TYPE
+#define ELEM_TYPE float
+#endif
+#ifndef INT_ELEM_TYPE
+#define INT_ELEM_TYPE __INT32_TYPE__
+#endif
+
+#define COMPARE(NAME, OP)			\
+  int_vec					\
+  cmp_##NAME##_reg (vec a, vec b)		\
+  {						\
+    return a OP b;				\
+  }
+
+typedef INT_ELEM_TYPE int_vec __attribute__((vector_size(16)));
+typedef ELEM_TYPE vec __attribute__((vector_size(16)));
+
+COMPARE (eq, ==)
+COMPARE (ne, !=)
+COMPARE (lt, <)
+COMPARE (le, <=)
+COMPARE (gt, >)
+COMPARE (ge, >=)
+
+/* eq, ne, lt, le, gt, ge.  */
+/* { dg-final { scan-assembler-times {\tvcmp.f32\teq, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f32\tne, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f32\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f32\tle, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f32\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f32\tge, q[0-9]+, q[0-9]+\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c
new file mode 100644
index 0000000..7774972
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c
@@ -0,0 +1,69 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O3" } */
+
+#define COMPARE_REG(NAME, OP, TYPE, SCALAR)	  \
+  TYPE						  \
+  cmp_##NAME##_##TYPE##_scalar (TYPE a, SCALAR b) \
+  {						  \
+    return a OP b;				  \
+  }
+
+#define COMPARE_TYPE(SCALAR, TYPE)				\
+  COMPARE_REG (eq, ==, TYPE, SCALAR)				\
+  COMPARE_REG (ne, !=, TYPE, SCALAR)				\
+  COMPARE_REG (lt, <, TYPE, SCALAR)				\
+  COMPARE_REG (le, <=, TYPE, SCALAR)				\
+  COMPARE_REG (gt, >, TYPE, SCALAR)				\
+  COMPARE_REG (ge, >=, TYPE, SCALAR)
+
+#define TEST_TYPE(NAME, ELEM, SIZE)			      \
+  typedef ELEM NAME##SIZE __attribute__((vector_size(SIZE))); \
+  COMPARE_TYPE (ELEM, NAME##SIZE)
+
+/* 64-bit vectors, not vectorized.  */
+TEST_TYPE (vs8, __INT8_TYPE__, 8)
+TEST_TYPE (vu8, __UINT8_TYPE__, 8)
+TEST_TYPE (vs16, __INT16_TYPE__, 8)
+TEST_TYPE (vu16, __UINT16_TYPE__, 8)
+TEST_TYPE (vs32, __INT32_TYPE__, 8)
+TEST_TYPE (vu32, __UINT32_TYPE__, 8)
+
+/* 128-bit vectors.  */
+TEST_TYPE (vs8, __INT8_TYPE__, 16)
+TEST_TYPE (vu8, __UINT8_TYPE__, 16)
+TEST_TYPE (vs16, __INT16_TYPE__, 16)
+TEST_TYPE (vu16, __UINT16_TYPE__, 16)
+TEST_TYPE (vs32, __INT32_TYPE__, 16)
+TEST_TYPE (vu32, __UINT32_TYPE__, 16)
+
+/* { 8 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
+/* { dg-final { scan-assembler-times {\tvcmp.i8  eq, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.i8  ne, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s8  lt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s8  le, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s8  gt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s8  ge, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.u8  hi, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.u8  cs, q[0-9]+, q[0-9]+\n} 2 } } */
+
+/* { 16 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
+/* { dg-final { scan-assembler-times {\tvcmp.i16  eq, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.i16  ne, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s16  lt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s16  le, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s16  gt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s16  ge, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.u16  hi, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.u16  cs, q[0-9]+, q[0-9]+\n} 2 } } */
+
+/* { 32 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
+/* { dg-final { scan-assembler-times {\tvcmp.i32  eq, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.i32  ne, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s32  lt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s32  le, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s32  gt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s32  ge, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.u32  hi, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.u32  cs, q[0-9]+, q[0-9]+\n} 2 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c
new file mode 100644
index 0000000..4ed449e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c
@@ -0,0 +1,30 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
+
+#include <stdint.h>
+
+#define NB 4
+
+#define FUNC(OP, NAME)							\
+  void test_ ## NAME ##_f (float * __restrict__ dest, float *a, float *b) { \
+    int i;								\
+    for (i=0; i<NB; i++) {						\
+      dest[i] = a[i] OP b[i];						\
+    }									\
+  }
+
+FUNC(==, vcmpeq)
+FUNC(!=, vcmpne)
+FUNC(<, vcmplt)
+FUNC(<=, vcmple)
+FUNC(>, vcmpgt)
+FUNC(>=, vcmpge)
+
+/* { dg-final { scan-assembler-times {\tvcmp.f32\teq, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f32\tne, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f32\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f32\tle, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f32\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f32\tge, q[0-9]+, q[0-9]+\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c
new file mode 100644
index 0000000..8da15e7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c
@@ -0,0 +1,50 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O3" } */
+
+#include <stdint.h>
+
+#define FUNC(SIGN, TYPE, BITS, NB, OP, NAME)				\
+  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t * __restrict__ dest, TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \
+    int i;								\
+    for (i=0; i<NB; i++) {						\
+      dest[i] = a[i] OP b[i];						\
+    }									\
+}
+
+#define ALL_FUNCS(OP, NAME) \
+  FUNC(s, int, 32, 2, OP, NAME)			\
+  FUNC(u, uint, 32, 2, OP, NAME)		\
+  FUNC(s, int, 16, 4, OP, NAME)			\
+  FUNC(u, uint, 16, 4, OP, NAME)		\
+  FUNC(s, int, 8, 8, OP, NAME)			\
+  FUNC(u, uint, 8, 8, OP, NAME)			\
+  FUNC(s, int, 32, 4, OP, NAME)			\
+  FUNC(u, uint, 32, 4, OP, NAME)		\
+  FUNC(s, int, 16, 8, OP, NAME)			\
+  FUNC(u, uint, 16, 8, OP, NAME)		\
+  FUNC(s, int, 8, 16, OP, NAME)			\
+  FUNC(u, uint, 8, 16, OP, NAME)
+
+ALL_FUNCS(==, vcmpeq)
+ALL_FUNCS(!=, vcmpne)
+ALL_FUNCS(<, vcmplt)
+ALL_FUNCS(<=, vcmple)
+ALL_FUNCS(>, vcmpgt)
+ALL_FUNCS(>=, vcmpge)
+
+/* MVE has only 128-bit vectors, so we can vectorize only half of the
+   functions above.  */
+/* { dg-final { scan-assembler-times {\tvcmp.i[0-9]+  eq, q[0-9]+, q[0-9]+\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.i[0-9]+  ne, q[0-9]+, q[0-9]+\n} 6 } } */
+
+/* lt, le, gt, ge apply to signed types, cs and hi to unsigned types.  */
+/* lt and le with unsigned types are replaced with the opposite condition, hence
+   twice as many matches for cs and hi.  */
+/* { dg-final { scan-assembler-times {\tvcmp.s[0-9]+  lt, q[0-9]+, q[0-9]+\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s[0-9]+  le, q[0-9]+, q[0-9]+\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s[0-9]+  gt, q[0-9]+, q[0-9]+\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s[0-9]+  ge, q[0-9]+, q[0-9]+\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.u[0-9]+  cs, q[0-9]+, q[0-9]+\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.u[0-9]+  hi, q[0-9]+, q[0-9]+\n} 6 } } */
-- 
2.7.4


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 7/9] arm: Auto-vectorization for MVE: add __fp16 support to VCMP
  2021-04-30 14:09 [PATCH 1/9] arm: MVE: Convert vcmp[eq|ne]* in arm_mve.h to use only 's' builtin version Christophe Lyon
                   ` (4 preceding siblings ...)
  2021-04-30 14:09 ` [PATCH 6/9] arm: Auto-vectorization for MVE: vcmp Christophe Lyon
@ 2021-04-30 14:09 ` Christophe Lyon
  2021-05-04 11:48   ` Andre Vieira (lists)
  2021-04-30 14:09 ` [PATCH 8/9] arm: Auto-vectorization for MVE: vld2/vst2 Christophe Lyon
                   ` (3 subsequent siblings)
  9 siblings, 1 reply; 35+ messages in thread
From: Christophe Lyon @ 2021-04-30 14:09 UTC (permalink / raw)
  To: gcc-patches

This patch adds __fp16 support to the previous patch that added vcmp
support with MVE. For this we update existing expanders to use VDQWH
iterator, and add a new expander vcond<VH_cvtto><mode>.  In the
process we need to create suitable iterators, and update v_cmp_result
as needed.

2021-04-26  Christophe Lyon  <christophe.lyon@linaro.org>

	gcc/
	* config/arm/iterators.md (V16): New iterator.
	(VH_cvtto): New attribute.
	(v_cmp_result): Add V4HF and V8HF support.
	* config/arm/vec-common.md (vec_cmp<mode><v_cmp_result>): Use VDQWH.
	(vcond<mode><mode>): Likewise.
	(vcond_mask_<mode><v_cmp_result>): Likewise.
	(vcond<VH_cvtto><mode>): New expander.

	gcc/testsuite/
	* gcc.target/arm/simd/mve-compare-3.c: New test with GCC vectors.
	* gcc.target/arm/simd/mve-vcmp-f16.c: New test for
	auto-vectorization.
---
 gcc/config/arm/iterators.md                       |  6 ++++
 gcc/config/arm/vec-common.md                      | 40 ++++++++++++++++-------
 gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c | 38 +++++++++++++++++++++
 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c  | 30 +++++++++++++++++
 4 files changed, 102 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index a128465..3042baf 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -231,6 +231,9 @@ (define_mode_iterator VU [V16QI V8HI V4SI])
 ;; Vector modes for 16-bit floating-point support.
 (define_mode_iterator VH [V8HF V4HF])
 
+;; Modes with 16-bit elements only.
+(define_mode_iterator V16 [V4HI V4HF V8HI V8HF])
+
 ;; 16-bit floating-point vector modes suitable for moving (includes BFmode).
 (define_mode_iterator VHFBF [V8HF V4HF V4BF V8BF])
 
@@ -571,6 +574,8 @@ (define_mode_attr V_cvtto [(V2SI "v2sf") (V2SF "v2si")
 ;; (Opposite) mode to convert to/from for vector-half mode conversions.
 (define_mode_attr VH_CVTTO [(V4HI "V4HF") (V4HF "V4HI")
 			    (V8HI "V8HF") (V8HF "V8HI")])
+(define_mode_attr VH_cvtto [(V4HI "v4hf") (V4HF "v4hi")
+			    (V8HI "v8hf") (V8HF "v8hi")])
 
 ;; Define element mode for each vector mode.
 (define_mode_attr V_elem [(V8QI "QI") (V16QI "QI")
@@ -720,6 +725,7 @@ (define_mode_attr V_cmp_result [(V8QI "V8QI") (V16QI "V16QI")
 (define_mode_attr v_cmp_result [(V8QI "v8qi") (V16QI "v16qi")
 				(V4HI "v4hi") (V8HI  "v8hi")
 				(V2SI "v2si") (V4SI  "v4si")
+				(V4HF "v4hi") (V8HF  "v8hi")
 				(DI   "di")   (V2DI  "v2di")
 				(V2SF "v2si") (V4SF  "v4si")])
 
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index 034b48b..3fd341c 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -366,8 +366,8 @@ (define_expand "vlshr<mode>3"
 (define_expand "vec_cmp<mode><v_cmp_result>"
   [(set (match_operand:<V_cmp_result> 0 "s_register_operand")
 	(match_operator:<V_cmp_result> 1 "comparison_operator"
-	  [(match_operand:VDQW 2 "s_register_operand")
-	   (match_operand:VDQW 3 "reg_or_zero_operand")]))]
+	  [(match_operand:VDQWH 2 "s_register_operand")
+	   (match_operand:VDQWH 3 "reg_or_zero_operand")]))]
   "ARM_HAVE_<MODE>_ARITH
    && !TARGET_REALLY_IWMMXT
    && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
@@ -399,13 +399,13 @@ (define_expand "vec_cmpu<mode><mode>"
 ;; element-wise.
 
 (define_expand "vcond<mode><mode>"
-  [(set (match_operand:VDQW 0 "s_register_operand")
-	(if_then_else:VDQW
+  [(set (match_operand:VDQWH 0 "s_register_operand")
+	(if_then_else:VDQWH
 	  (match_operator 3 "comparison_operator"
-	    [(match_operand:VDQW 4 "s_register_operand")
-	     (match_operand:VDQW 5 "reg_or_zero_operand")])
-	  (match_operand:VDQW 1 "s_register_operand")
-	  (match_operand:VDQW 2 "s_register_operand")))]
+	    [(match_operand:VDQWH 4 "s_register_operand")
+	     (match_operand:VDQWH 5 "reg_or_zero_operand")])
+	  (match_operand:VDQWH 1 "s_register_operand")
+	  (match_operand:VDQWH 2 "s_register_operand")))]
   "ARM_HAVE_<MODE>_ARITH
    && !TARGET_REALLY_IWMMXT
    && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
@@ -430,6 +430,22 @@ (define_expand "vcond<V_cvtto><mode>"
   DONE;
 })
 
+(define_expand "vcond<VH_cvtto><mode>"
+  [(set (match_operand:<VH_CVTTO> 0 "s_register_operand")
+	(if_then_else:<VH_CVTTO>
+	  (match_operator 3 "comparison_operator"
+	    [(match_operand:V16 4 "s_register_operand")
+	     (match_operand:V16 5 "reg_or_zero_operand")])
+	  (match_operand:<VH_CVTTO> 1 "s_register_operand")
+	  (match_operand:<VH_CVTTO> 2 "s_register_operand")))]
+  "ARM_HAVE_<MODE>_ARITH
+   && !TARGET_REALLY_IWMMXT
+   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
+{
+  arm_expand_vcond (operands, <V_cmp_result>mode);
+  DONE;
+})
+
 (define_expand "vcondu<mode><v_cmp_result>"
   [(set (match_operand:VDQW 0 "s_register_operand")
 	(if_then_else:VDQW
@@ -446,11 +462,11 @@ (define_expand "vcondu<mode><v_cmp_result>"
 })
 
 (define_expand "vcond_mask_<mode><v_cmp_result>"
-  [(set (match_operand:VDQW 0 "s_register_operand")
-        (if_then_else:VDQW
+  [(set (match_operand:VDQWH 0 "s_register_operand")
+        (if_then_else:VDQWH
           (match_operand:<V_cmp_result> 3 "s_register_operand")
-          (match_operand:VDQW 1 "s_register_operand")
-          (match_operand:VDQW 2 "s_register_operand")))]
+          (match_operand:VDQWH 1 "s_register_operand")
+          (match_operand:VDQWH 2 "s_register_operand")))]
   "ARM_HAVE_<MODE>_ARITH
    && !TARGET_REALLY_IWMMXT"
 {
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
new file mode 100644
index 0000000..76f81e8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
@@ -0,0 +1,38 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
+
+/* float 16 tests.  */
+
+#ifndef ELEM_TYPE
+#define ELEM_TYPE __fp16
+#endif
+#ifndef INT_ELEM_TYPE
+#define INT_ELEM_TYPE __INT16_TYPE__
+#endif
+
+#define COMPARE(NAME, OP)			\
+  int_vec					\
+  cmp_##NAME##_reg (vec a, vec b)		\
+  {						\
+    return a OP b;				\
+  }
+
+typedef INT_ELEM_TYPE int_vec __attribute__((vector_size(16)));
+typedef ELEM_TYPE vec __attribute__((vector_size(16)));
+
+COMPARE (eq, ==)
+COMPARE (ne, !=)
+COMPARE (lt, <)
+COMPARE (le, <=)
+COMPARE (gt, >)
+COMPARE (ge, >=)
+
+/* eq, ne, lt, le, gt, ge.  */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-9]+\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c
new file mode 100644
index 0000000..dbae2d1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c
@@ -0,0 +1,30 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
+
+#include <stdint.h>
+
+#define NB 8
+
+#define FUNC(OP, NAME)							\
+  void test_ ## NAME ##_f (__fp16 * __restrict__ dest, __fp16 *a, __fp16 *b) { \
+    int i;								\
+    for (i=0; i<NB; i++) {						\
+      dest[i] = a[i] OP b[i];						\
+    }									\
+  }
+
+FUNC(==, vcmpeq)
+FUNC(!=, vcmpne)
+FUNC(<, vcmplt)
+FUNC(<=, vcmple)
+FUNC(>, vcmpgt)
+FUNC(>=, vcmpge)
+
+/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-9]+\n} 1 } } */
-- 
2.7.4


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 8/9] arm: Auto-vectorization for MVE: vld2/vst2
  2021-04-30 14:09 [PATCH 1/9] arm: MVE: Convert vcmp[eq|ne]* in arm_mve.h to use only 's' builtin version Christophe Lyon
                   ` (5 preceding siblings ...)
  2021-04-30 14:09 ` [PATCH 7/9] arm: Auto-vectorization for MVE: add __fp16 support to VCMP Christophe Lyon
@ 2021-04-30 14:09 ` Christophe Lyon
  2021-05-17  9:55   ` Christophe Lyon
  2021-05-24 12:15   ` Kyrylo Tkachov
  2021-04-30 14:09 ` [PATCH 9/9] arm: Auto-vectorization for MVE: vld4/vst4 Christophe Lyon
                   ` (2 subsequent siblings)
  9 siblings, 2 replies; 35+ messages in thread
From: Christophe Lyon @ 2021-04-30 14:09 UTC (permalink / raw)
  To: gcc-patches

This patch enables MVE vld2/vst2 instructions for auto-vectorization.
We move the existing expanders from neon.md and enable them for MVE,
calling the respective emitter.

2021-03-12  Christophe Lyon  <christophe.lyon@linaro.org>

	gcc/
	* config/arm/neon.md (vec_load_lanesoi<mode>)
	(vec_store_lanesoi<mode>): Move ...
	* config/arm/vec-common.md: here.

	gcc/testsuite/
	* gcc.target/arm/simd/mve-vld2.c: New test, derived from
	slp-perm-2.c
---
 gcc/config/arm/neon.md                       | 14 ----
 gcc/config/arm/vec-common.md                 | 27 ++++++++
 gcc/testsuite/gcc.target/arm/simd/mve-vld2.c | 96 ++++++++++++++++++++++++++++
 3 files changed, 123 insertions(+), 14 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vld2.c

diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 6660846..bc8775c 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -5063,13 +5063,6 @@ (define_insn "neon_vld2<mode>"
                     (const_string "neon_load2_2reg<q>")))]
 )
 
-(define_expand "vec_load_lanesoi<mode>"
-  [(set (match_operand:OI 0 "s_register_operand")
-        (unspec:OI [(match_operand:OI 1 "neon_struct_operand")
-                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
-		   UNSPEC_VLD2))]
-  "TARGET_NEON")
-
 (define_insn "neon_vld2<mode>"
   [(set (match_operand:OI 0 "s_register_operand" "=w")
         (unspec:OI [(match_operand:OI 1 "neon_struct_operand" "Um")
@@ -5197,13 +5190,6 @@ (define_insn "neon_vst2<mode>"
                     (const_string "neon_store2_one_lane<q>")))]
 )
 
-(define_expand "vec_store_lanesoi<mode>"
-  [(set (match_operand:OI 0 "neon_struct_operand")
-	(unspec:OI [(match_operand:OI 1 "s_register_operand")
-                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
-                   UNSPEC_VST2))]
-  "TARGET_NEON")
-
 (define_insn "neon_vst2<mode>"
   [(set (match_operand:OI 0 "neon_struct_operand" "=Um")
 	(unspec:OI [(match_operand:OI 1 "s_register_operand" "w")
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index 3fd341c..7abefea 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -482,6 +482,33 @@ (define_expand "vcond_mask_<mode><v_cmp_result>"
     }
   else
     gcc_unreachable ();
+  DONE;
+})
 
+(define_expand "vec_load_lanesoi<mode>"
+  [(set (match_operand:OI 0 "s_register_operand")
+        (unspec:OI [(match_operand:OI 1 "neon_struct_operand")
+                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+		   UNSPEC_VLD2))]
+  "TARGET_NEON || TARGET_HAVE_MVE"
+{
+  if (TARGET_NEON)
+    emit_insn (gen_neon_vld2<mode> (operands[0], operands[1]));
+  else
+    emit_insn (gen_mve_vld2q<mode> (operands[0], operands[1]));
+  DONE;
+})
+
+(define_expand "vec_store_lanesoi<mode>"
+  [(set (match_operand:OI 0 "neon_struct_operand")
+	(unspec:OI [(match_operand:OI 1 "s_register_operand")
+                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                   UNSPEC_VST2))]
+  "TARGET_NEON || TARGET_HAVE_MVE"
+{
+  if (TARGET_NEON)
+    emit_insn (gen_neon_vst2<mode> (operands[0], operands[1]));
+  else
+    emit_insn (gen_mve_vst2q<mode> (operands[0], operands[1]));
   DONE;
 })
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vld2.c b/gcc/testsuite/gcc.target/arm/simd/mve-vld2.c
new file mode 100644
index 0000000..9c7c3f5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-vld2.c
@@ -0,0 +1,96 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O3" } */
+
+#include <stdint.h>
+
+#define M00 100
+#define M10 216
+#define M01 1322
+#define M11 13
+
+#define N 128
+
+
+/* Integer tests.  */
+#define FUNC(SIGN, TYPE, BITS)						\
+  void foo_##SIGN##BITS##x (TYPE##BITS##_t *__restrict__ pInput,	\
+			    TYPE##BITS##_t *__restrict__ pOutput)	\
+  {									\
+    unsigned int i;							\
+    TYPE##BITS##_t  a, b;						\
+    									\
+    for (i = 0; i < N / BITS; i++)					\
+      {									\
+	a = *pInput++;							\
+	b = *pInput++;							\
+									\
+	*pOutput++ = M00 * a + M01 * b;					\
+	*pOutput++ = M10 * a + M11 * b;					\
+      }									\
+  }
+
+FUNC(s, int, 8)
+FUNC(u, uint, 8)
+FUNC(s, int, 16)
+FUNC(u, uint, 16)
+FUNC(s, int, 32)
+FUNC(u, uint, 32)
+
+/* float test, keep the macro because it's similar to the above, but does not
+   need the ##BITS##_t.  */
+#define FUNC_FLOAT(SIGN, TYPE, BITS)					\
+  void foo_##SIGN##BITS##x (TYPE *__restrict__ pInput,			\
+			    TYPE *__restrict__ pOutput)			\
+  {									\
+    unsigned int i;							\
+    TYPE a, b;								\
+    									\
+    for (i = 0; i < N / BITS; i++)					\
+      {									\
+	a = *pInput++;							\
+	b = *pInput++;							\
+									\
+	*pOutput++ = M00 * a + M01 * b;					\
+	*pOutput++ = M10 * a + M11 * b;					\
+      }									\
+  }
+
+FUNC_FLOAT(f, float, 32)
+
+/* __fp16 test, needs explicit casts to avoid conversions to floating-point and
+   failure to vectorize.  */
+__fp16 M00_fp16 = 100.0f16;
+__fp16 M10_fp16 = 216.0f16;
+__fp16 M01_fp16 = 1322.0f16;
+__fp16 M11_fp16 = 13.0f16;
+
+#define FUNC_FLOAT_FP16(SIGN, TYPE, BITS)				\
+  void foo_##SIGN##BITS##x (TYPE *__restrict__ pInput,			\
+			    TYPE *__restrict__ pOutput)			\
+  {									\
+    unsigned int i;							\
+    TYPE a, b;								\
+    									\
+    for (i = 0; i < N / BITS; i++)					\
+      {									\
+	a = *pInput++;							\
+	b = *pInput++;							\
+									\
+	*pOutput++ = (__fp16)(M00_fp16 * a) + (__fp16)(M01_fp16 * b);	\
+	*pOutput++ = (__fp16)(M10_fp16 * a) + (__fp16)(M11_fp16 * b);	\
+      }									\
+  }
+
+FUNC_FLOAT_FP16(f, __fp16, 16)
+
+/* vld2X.8 is used for signed and unsigned chars: 2 pairs.  */
+/* vld2X.16 is used for signed and unsigned shorts and __fp16: 3 pairs.  */
+/* vld2X.32 is used for signed and unsigned ints and float: 3 pairs.  */
+/* { dg-final { scan-assembler-times {vld2[01].8\t.q[0-9]+, q[0-9]+., } 4 } } */
+/* { dg-final { scan-assembler-times {vld2[01].16\t.q[0-9]+, q[0-9]+., } 6 } } */
+/* { dg-final { scan-assembler-times {vld2[01].32\t.q[0-9]+, q[0-9]+., } 6 } } */
+/* { dg-final { scan-assembler-times {vst2[01].8\t.q[0-9]+, q[0-9]+., } 4 } } */
+/* { dg-final { scan-assembler-times {vst2[01].16\t.q[0-9]+, q[0-9]+., } 6 } } */
+/* { dg-final { scan-assembler-times {vst2[01].32\t.q[0-9]+, q[0-9]+., } 6 } } */
-- 
2.7.4


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 9/9] arm: Auto-vectorization for MVE: vld4/vst4
  2021-04-30 14:09 [PATCH 1/9] arm: MVE: Convert vcmp[eq|ne]* in arm_mve.h to use only 's' builtin version Christophe Lyon
                   ` (6 preceding siblings ...)
  2021-04-30 14:09 ` [PATCH 8/9] arm: Auto-vectorization for MVE: vld2/vst2 Christophe Lyon
@ 2021-04-30 14:09 ` Christophe Lyon
  2021-05-04 12:03   ` Andre Vieira (lists)
  2021-05-24 12:15   ` Kyrylo Tkachov
  2021-05-10 11:21 ` [PATCH 1/9] arm: MVE: Convert vcmp[eq|ne]* in arm_mve.h to use only 's' builtin version Christophe Lyon
  2021-05-10 11:54 ` Kyrylo Tkachov
  9 siblings, 2 replies; 35+ messages in thread
From: Christophe Lyon @ 2021-04-30 14:09 UTC (permalink / raw)
  To: gcc-patches

This patch enables MVE vld4/vst4 instructions for auto-vectorization.
We move the existing expanders from neon.md and enable them for MVE,
calling the respective emitter.

2021-03-12  Christophe Lyon  <christophe.lyon@linaro.org>

	gcc/
	* config/arm/neon.md (vec_load_lanesxi<mode>)
	(vec_store_lanesxi<mode>): Move ...
	* config/arm/vec-common.md: here.

	gcc/testsuite/
	* gcc.target/arm/simd/mve-vld4.c: New test, derived from
	slp-perm-3.c
---
 gcc/config/arm/neon.md                       |  20 ----
 gcc/config/arm/vec-common.md                 |  26 +++++
 gcc/testsuite/gcc.target/arm/simd/mve-vld4.c | 140 +++++++++++++++++++++++++++
 3 files changed, 166 insertions(+), 20 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vld4.c

diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index bc8775c..fb58baf 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -5617,16 +5617,6 @@ (define_insn "neon_vld4<mode>"
                     (const_string "neon_load4_4reg<q>")))]
 )
 
-(define_expand "vec_load_lanesxi<mode>"
-  [(match_operand:XI 0 "s_register_operand")
-   (match_operand:XI 1 "neon_struct_operand")
-   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
-  "TARGET_NEON"
-{
-  emit_insn (gen_neon_vld4<mode> (operands[0], operands[1]));
-  DONE;
-})
-
 (define_expand "neon_vld4<mode>"
   [(match_operand:XI 0 "s_register_operand")
    (match_operand:XI 1 "neon_struct_operand")
@@ -5818,16 +5808,6 @@ (define_insn "neon_vst4<mode>"
                     (const_string "neon_store4_4reg<q>")))]
 )
 
-(define_expand "vec_store_lanesxi<mode>"
-  [(match_operand:XI 0 "neon_struct_operand")
-   (match_operand:XI 1 "s_register_operand")
-   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
-  "TARGET_NEON"
-{
-  emit_insn (gen_neon_vst4<mode> (operands[0], operands[1]));
-  DONE;
-})
-
 (define_expand "neon_vst4<mode>"
   [(match_operand:XI 0 "neon_struct_operand")
    (match_operand:XI 1 "s_register_operand")
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index 7abefea..d46b78d 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -512,3 +512,29 @@ (define_expand "vec_store_lanesoi<mode>"
     emit_insn (gen_mve_vst2q<mode> (operands[0], operands[1]));
   DONE;
 })
+
+(define_expand "vec_load_lanesxi<mode>"
+  [(match_operand:XI 0 "s_register_operand")
+   (match_operand:XI 1 "neon_struct_operand")
+   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_NEON || TARGET_HAVE_MVE"
+{
+  if (TARGET_NEON)
+    emit_insn (gen_neon_vld4<mode> (operands[0], operands[1]));
+  else
+    emit_insn (gen_mve_vld4q<mode> (operands[0], operands[1]));
+  DONE;
+})
+
+(define_expand "vec_store_lanesxi<mode>"
+  [(match_operand:XI 0 "neon_struct_operand")
+   (match_operand:XI 1 "s_register_operand")
+   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_NEON || TARGET_HAVE_MVE"
+{
+  if (TARGET_NEON)
+    emit_insn (gen_neon_vst4<mode> (operands[0], operands[1]));
+  else
+    emit_insn (gen_mve_vst4q<mode> (operands[0], operands[1]));
+  DONE;
+})
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vld4.c b/gcc/testsuite/gcc.target/arm/simd/mve-vld4.c
new file mode 100644
index 0000000..ce3e755
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-vld4.c
@@ -0,0 +1,140 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O3" } */
+
+#include <stdint.h>
+
+#define M00 100
+#define M10 216
+#define M20 23
+#define M30 237
+#define M01 1322
+#define M11 13
+#define M21 27271
+#define M31 2280
+#define M02 74
+#define M12 191
+#define M22 500
+#define M32 111
+#define M03 134
+#define M13 117
+#define M23 11
+#define M33 771
+
+#define N 128
+
+/* Integer tests.  */
+#define FUNC(SIGN, TYPE, BITS)						\
+  void foo_##SIGN##BITS##x (TYPE##BITS##_t *__restrict__ pInput,	\
+			    TYPE##BITS##_t *__restrict__ pOutput)	\
+  {									\
+    unsigned int i;							\
+    TYPE##BITS##_t  a, b, c, d;						\
+    									\
+    for (i = 0; i < N / BITS; i++)					\
+      {									\
+	a = *pInput++;							\
+	b = *pInput++;							\
+	c = *pInput++;							\
+	d = *pInput++;							\
+									\
+	*pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d;		\
+	*pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d;		\
+	*pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d;		\
+	*pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d;		\
+      }									\
+  }
+
+FUNC(s, int, 8)
+FUNC(u, uint, 8)
+FUNC(s, int, 16)
+FUNC(u, uint, 16)
+FUNC(s, int, 32)
+FUNC(u, uint, 32)
+
+/* float test, keep the macro because it's similar to the above, but does not
+   need the ##BITS##_t.  */
+#define FUNC_FLOAT(SIGN, TYPE, BITS)						\
+  void foo_##SIGN##BITS##x (TYPE *__restrict__ pInput,			\
+			    TYPE *__restrict__ pOutput)			\
+  {									\
+    unsigned int i;							\
+    TYPE a, b, c, d;							\
+    									\
+    for (i = 0; i < N / BITS; i++)					\
+      {									\
+	a = *pInput++;							\
+	b = *pInput++;							\
+	c = *pInput++;							\
+	d = *pInput++;							\
+									\
+	*pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d;		\
+	*pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d;		\
+	*pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d;		\
+	*pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d;		\
+      }									\
+  }
+
+FUNC_FLOAT(f, float, 32)
+
+/* __fp16 test, needs explicit casts to avoid conversions to floating-point and
+   failure to vectorize.  */
+__fp16 M00_fp16 = 100.0f16;
+__fp16 M10_fp16 = 216.0f16;
+__fp16 M20_fp16 = 23.0f16;
+__fp16 M30_fp16 = 237.0f16;
+__fp16 M01_fp16 = 1322.0f16;
+__fp16 M11_fp16 = 13.0f16;
+__fp16 M21_fp16 = 27271.0f16;
+__fp16 M31_fp16 = 2280.0f16;
+__fp16 M02_fp16 = 74.0f16;
+__fp16 M12_fp16 = 191.0f16;
+__fp16 M22_fp16 = 500.0f16;
+__fp16 M32_fp16 = 111.0f16;
+__fp16 M03_fp16 = 134.0f16;
+__fp16 M13_fp16 = 117.0f16;
+__fp16 M23_fp16 = 11.0f16;
+__fp16 M33_fp16 = 771.0f16;
+
+#define FUNC_FLOAT_FP16(SIGN, TYPE, BITS)				\
+  void foo_##SIGN##BITS##x (TYPE *__restrict__ pInput,			\
+			    TYPE *__restrict__ pOutput)			\
+  {									\
+    unsigned int i;							\
+    TYPE a, b, c, d;							\
+    									\
+    for (i = 0; i < N / BITS; i++)					\
+      {									\
+	a = *pInput++;							\
+	b = *pInput++;							\
+	c = *pInput++;							\
+	d = *pInput++;							\
+									\
+	TYPE ab, cd;							\
+	ab = (__fp16)(M00_fp16 * a) + (__fp16)(M01_fp16 * b);		\
+	cd = (__fp16)(M02_fp16 * c) + (__fp16)(M03_fp16 * d);		\
+	*pOutput++ = ab + cd;						\
+	ab = (__fp16)(M10_fp16 * a) + (__fp16)(M11_fp16 * b);		\
+	cd = (__fp16)(M12_fp16 * c) + (__fp16)(M13_fp16 * d);		\
+	*pOutput++ = ab + cd;						\
+	ab = (__fp16)(M20_fp16 * a) + (__fp16)(M21_fp16 * b);		\
+	cd = (__fp16)(M22_fp16 * c) + (__fp16)(M23_fp16 * d);		\
+	*pOutput++ = ab + cd;						\
+	ab = (__fp16)(M30_fp16 * a) + (__fp16)(M31_fp16 * b);		\
+	cd = (__fp16)(M32_fp16 * c) + (__fp16)(M33_fp16 * d);		\
+	*pOutput++ = ab + cd;						\
+      }									\
+  }
+
+FUNC_FLOAT_FP16(f, __fp16, 16)
+
+/* vld4X.8 is used for signed and unsigned chars: 2 * 4.  */
+/* vld4X.16 is used for signed and unsigned shorts and __fp16: 3 * 4.  */
+/* vld4X.32 is used for signed and unsigned ints and float: 3 * 4.  */
+/* { dg-final { scan-assembler-times {vld4[0123].8\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 8 } } */
+/* { dg-final { scan-assembler-times {vld4[0123].16\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 12 } } */
+/* { dg-final { scan-assembler-times {vld4[0123].32\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 12 } } */
+/* { dg-final { scan-assembler-times {vst4[0123].8\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 8 } } */
+/* { dg-final { scan-assembler-times {vst4[0123].16\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 12 } } */
+/* { dg-final { scan-assembler-times {vst4[0123].32\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 12 } } */
-- 
2.7.4


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 6/9] arm: Auto-vectorization for MVE: vcmp
  2021-04-30 14:09 ` [PATCH 6/9] arm: Auto-vectorization for MVE: vcmp Christophe Lyon
@ 2021-05-04 11:29   ` Andre Vieira (lists)
  2021-05-04 13:41     ` Christophe Lyon
  0 siblings, 1 reply; 35+ messages in thread
From: Andre Vieira (lists) @ 2021-05-04 11:29 UTC (permalink / raw)
  To: gcc-patches, Christophe Lyon

Hi Christophe,

On 30/04/2021 15:09, Christophe Lyon via Gcc-patches wrote:
> Since MVE has a different set of vector comparison operators from
> Neon, we have to update the expansion to take the new ones into
> account: for instance, MVE supports 'NE' directly, so it does not
> need to be expanded as 'EQ' with the result inverted.
>
> Conversely, Neon supports comparisons with #0, MVE does not.
>
> For:
> typedef long int vs32 __attribute__((vector_size(16)));
> vs32 cmp_eq_vs32_reg (vs32 a, vs32 b) { return a == b; }
>
> we now generate:
> cmp_eq_vs32_reg:
> 	vldr.64 d4, .L123       @ 8     [c=8 l=4]  *mve_movv4si/8
> 	vldr.64 d5, .L123+8
> 	vldr.64 d6, .L123+16    @ 9     [c=8 l=4]  *mve_movv4si/8
> 	vldr.64 d7, .L123+24
> 	vcmp.i32  eq, q0, q1    @ 7     [c=16 l=4]  mve_vcmpeqq_v4si
> 	vpsel q0, q3, q2        @ 15    [c=8 l=4]  mve_vpselq_sv4si
> 	bx      lr      @ 26    [c=8 l=4]  *thumb2_return
> .L124:
> 	.align  3
> .L123:
> 	.word   0
> 	.word   0
> 	.word   0
> 	.word   0
> 	.word   1
> 	.word   1
> 	.word   1
> 	.word   1
>
> For some reason emit_move_insn (zero, CONST0_RTX (cmp_mode)) produces
> a pair of vldr instead of vmov.i32 qX, #0.
I think ideally we would even want:
vpte  eq, q0, q1
vmovt.i32 q0, #0
vmove.i32 q0, #1

But we don't have a way to generate VPT blocks with multiple 
instructions yet unfortunately so I guess VPSEL will have to do for now.

>
> 2021-03-01  Christophe Lyon  <christophe.lyon@linaro.org>
>
> 	gcc/
> 	* config/arm/arm-protos.h (arm_expand_vector_compare): Update
> 	prototype.
> 	* config/arm/arm.c (arm_expand_vector_compare): Add support for
> 	MVE.
> 	(arm_expand_vcond): Likewise.
> 	* config/arm/iterators.md (supf): Remove VCMPNEQ_S, VCMPEQQ_S,
> 	VCMPEQQ_N_S, VCMPNEQ_N_S.
> 	(VCMPNEQ, VCMPEQQ, VCMPEQQ_N, VCMPNEQ_N): Remove.
> 	* config/arm/mve.md (@mve_vcmp<mve_cmp_op>q_<mode>): Add '@' prefix.
> 	(@mve_vcmp<mve_cmp_op>q_f<mode>): Likewise.
> 	(@mve_vcmp<mve_cmp_op>q_n_f<mode>): Likewise.
> 	(@mve_vpselq_<supf><mode>): Likewise.
> 	(@mve_vpselq_f<mode>): Likewise.
> 	* config/arm/neon.md (vec_cmp<mode><v_cmp_result>): Enable for MVE
> 	and move to vec-common.md.
> 	(vec_cmpu<mode><mode>): Likewise.
> 	(vcond<mode><mode>): Likewise.
> 	(vcond<V_cvtto><mode>): Likewise.
> 	(vcondu<mode><v_cmp_result>): Likewise.
> 	(vcond_mask_<mode><v_cmp_result>): Likewise.
> 	* config/arm/unspecs.md (VCMPNEQ_U, VCMPNEQ_S, VCMPEQQ_S)
> 	(VCMPEQQ_N_S, VCMPNEQ_N_S, VCMPEQQ_U, VCMPEQQ_N_U, VCMPNEQ_N_U)
> 	(VCMPGEQ_N_S, VCMPGEQ_S, VCMPGTQ_N_S, VCMPGTQ_S, VCMPLEQ_N_S)
> 	(VCMPLEQ_S, VCMPLTQ_N_S, VCMPLTQ_S, VCMPCSQ_N_U, VCMPCSQ_U)
> 	(VCMPHIQ_N_U, VCMPHIQ_U): Remove.
> 	* config/arm/vec-common.md (vec_cmp<mode><v_cmp_result>): Moved
> 	from neon.md.
> 	(vec_cmpu<mode><mode>): Likewise.
> 	(vcond<mode><mode>): Likewise.
> 	(vcond<V_cvtto><mode>): Likewise.
> 	(vcondu<mode><v_cmp_result>): Likewise.
> 	(vcond_mask_<mode><v_cmp_result>): Likewise.
>
> 	gcc/testsuite
> 	* gcc.target/arm/simd/mve-compare-1.c: New test with GCC vectors.
> 	* gcc.target/arm/simd/mve-compare-2.c: New test with GCC vectors.
> 	* gcc.target/arm/simd/mve-compare-scalar-1.c: New test with GCC
> 	vectors.
> 	* gcc.target/arm/simd/mve-vcmp-f32.c: New test for
> 	auto-vectorization.
> 	* gcc.target/arm/simd/mve-vcmp.c: New test for auto-vectorization.
>
> add gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c
> ---
>   gcc/config/arm/arm-protos.h                        |   2 +-
>   gcc/config/arm/arm.c                               | 211 ++++++++++++++++-----
>   gcc/config/arm/iterators.md                        |   9 +-
>   gcc/config/arm/mve.md                              |  10 +-
>   gcc/config/arm/neon.md                             |  87 ---------
>   gcc/config/arm/unspecs.md                          |  20 --
>   gcc/config/arm/vec-common.md                       | 107 +++++++++++
>   gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c  |  80 ++++++++
>   gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c  |  38 ++++
>   .../gcc.target/arm/simd/mve-compare-scalar-1.c     |  69 +++++++
>   gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c   |  30 +++
>   gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c       |  50 +++++
>   12 files changed, 547 insertions(+), 166 deletions(-)
>   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c
>   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c
>   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c
>   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c
>   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c
>
> diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
> index 2521541..ffccaa7 100644
> --- a/gcc/config/arm/arm-protos.h
> +++ b/gcc/config/arm/arm-protos.h
> @@ -373,7 +373,7 @@ extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
>   extern bool arm_fusion_enabled_p (tune_params::fuse_ops);
>   extern bool arm_valid_symbolic_address_p (rtx);
>   extern bool arm_validize_comparison (rtx *, rtx *, rtx *);
> -extern bool arm_expand_vector_compare (rtx, rtx_code, rtx, rtx, bool);
> +extern bool arm_expand_vector_compare (rtx, rtx_code, rtx, rtx, bool, bool);
>   #endif /* RTX_CODE */
>   
>   extern bool arm_gen_setmem (rtx *);
> diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
> index 0371d98..80e28ef 100644
> --- a/gcc/config/arm/arm.c
> +++ b/gcc/config/arm/arm.c
> @@ -30933,66 +30933,114 @@ arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
>      and return true if TARGET contains the inverse.  If !CAN_INVERT,
>      always store the result in TARGET, never its inverse.
>   
> +   If VCOND_MVE, do not emit the vpsel instruction here, let arm_expand_vcond do
> +   it with the right destination type to avoid emitting two vpsel, one here and
> +   one in arm_expand_vcond.
> +
>      Note that the handling of floating-point comparisons is not
>      IEEE compliant.  */
>   
>   bool
>   arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
> -			   bool can_invert)
> +			   bool can_invert, bool vcond_mve)
>   {
>     machine_mode cmp_result_mode = GET_MODE (target);
>     machine_mode cmp_mode = GET_MODE (op0);
>   
>     bool inverted;
> -  switch (code)
> -    {
> -    /* For these we need to compute the inverse of the requested
> -       comparison.  */
> -    case UNORDERED:
> -    case UNLT:
> -    case UNLE:
> -    case UNGT:
> -    case UNGE:
> -    case UNEQ:
> -    case NE:
> -      code = reverse_condition_maybe_unordered (code);
> -      if (!can_invert)
> -	{
> -	  /* Recursively emit the inverted comparison into a temporary
> -	     and then store its inverse in TARGET.  This avoids reusing
> -	     TARGET (which for integer NE could be one of the inputs).  */
> -	  rtx tmp = gen_reg_rtx (cmp_result_mode);
> -	  if (arm_expand_vector_compare (tmp, code, op0, op1, true))
> -	    gcc_unreachable ();
> -	  emit_insn (gen_rtx_SET (target, gen_rtx_NOT (cmp_result_mode, tmp)));
> -	  return false;
> -	}
> -      inverted = true;
> -      break;
>   
> -    default:
> +  /* MVE supports more comparisons than Neon.  */
> +  if (TARGET_HAVE_MVE)
>         inverted = false;
> -      break;
> -    }
> +  else
> +    switch (code)
> +      {
> +	/* For these we need to compute the inverse of the requested
> +	   comparison.  */
> +      case UNORDERED:
> +      case UNLT:
> +      case UNLE:
> +      case UNGT:
> +      case UNGE:
> +      case UNEQ:
> +      case NE:
> +	code = reverse_condition_maybe_unordered (code);
> +	if (!can_invert)
> +	  {
> +	    /* Recursively emit the inverted comparison into a temporary
> +	       and then store its inverse in TARGET.  This avoids reusing
> +	       TARGET (which for integer NE could be one of the inputs).  */
> +	    rtx tmp = gen_reg_rtx (cmp_result_mode);
> +	    if (arm_expand_vector_compare (tmp, code, op0, op1, true, vcond_mve))
> +	      gcc_unreachable ();
> +	    emit_insn (gen_rtx_SET (target, gen_rtx_NOT (cmp_result_mode, tmp)));
> +	    return false;
> +	  }
> +	inverted = true;
> +	break;
> +
> +      default:
> +	inverted = false;
> +	break;
> +      }
>   
>     switch (code)
>       {
> -    /* These are natively supported for zero comparisons, but otherwise
> -       require the operands to be swapped.  */
> +    /* These are natively supported by Neon for zero comparisons, but otherwise
> +       require the operands to be swapped. For MVE, we can only compare
> +       registers.  */
>       case LE:
>       case LT:
> -      if (op1 != CONST0_RTX (cmp_mode))
> -	{
> -	  code = swap_condition (code);
> -	  std::swap (op0, op1);
> -	}
> +      if (!TARGET_HAVE_MVE)
> +	if (op1 != CONST0_RTX (cmp_mode))
> +	  {
> +	    code = swap_condition (code);
> +	    std::swap (op0, op1);
> +	  }
>         /* Fall through.  */
>   
> -    /* These are natively supported for both register and zero operands.  */
> +    /* These are natively supported by Neon for both register and zero
> +       operands. MVE supports registers only.  */
>       case EQ:
>       case GE:
>       case GT:
> -      emit_insn (gen_neon_vc (code, cmp_mode, target, op0, op1));
> +    case NE:
> +      if (TARGET_HAVE_MVE) {
> +	rtx vpr_p0;
> +	if (vcond_mve)
> +	  vpr_p0 = target;
> +	else
> +	  vpr_p0 = gen_reg_rtx (HImode);
> +
> +	switch (cmp_mode)
> +	  {
> +	  case E_V16QImode:
> +	  case E_V8HImode:
> +	  case E_V4SImode:
> +	    emit_insn (gen_mve_vcmpq (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1)));
> +	    break;
> +	  case E_V8HFmode:
> +	  case E_V4SFmode:
> +	    if (TARGET_HAVE_MVE_FLOAT)
> +	      emit_insn (gen_mve_vcmpq_f (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1)));
> +	    else
> +	      gcc_unreachable ();
> +	    break;
> +	  default:
> +	    gcc_unreachable ();
> +	  }
> +
> +	/* If we are not expanding a vcond, build the result here.  */
> +	if (!vcond_mve) {
> +	  rtx zero = gen_reg_rtx (cmp_result_mode);
> +	  rtx one = gen_reg_rtx (cmp_result_mode);
> +	  emit_move_insn (zero, CONST0_RTX (cmp_result_mode));
> +	  emit_move_insn (one, CONST1_RTX (cmp_result_mode));
> +	  emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0));
> +	}
> +      }
> +      else
> +	emit_insn (gen_neon_vc (code, cmp_mode, target, op0, op1));
>         return inverted;
>   
>       /* These are natively supported for register operands only.
> @@ -31000,16 +31048,50 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
>          or canonicalized by target-independent code.  */
>       case GEU:
>       case GTU:
> -      emit_insn (gen_neon_vc (code, cmp_mode, target,
> -			      op0, force_reg (cmp_mode, op1)));
> +      if (TARGET_HAVE_MVE) {
> +	rtx vpr_p0;
> +	if (vcond_mve)
> +	  vpr_p0 = target;
> +	else
> +	  vpr_p0 = gen_reg_rtx (HImode);
> +
> +	emit_insn (gen_mve_vcmpq (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1)));
> +	if (!vcond_mve) {
> +	  rtx zero = gen_reg_rtx (cmp_result_mode);
> +	  rtx one = gen_reg_rtx (cmp_result_mode);
> +	  emit_move_insn (zero, CONST0_RTX (cmp_result_mode));
> +	  emit_move_insn (one, CONST1_RTX (cmp_result_mode));
> +	  emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0));
> +	}
> +      }
> +       else
> +	emit_insn (gen_neon_vc (code, cmp_mode, target,
> +				op0, force_reg (cmp_mode, op1)));
>         return inverted;
>   
>       /* These require the operands to be swapped and likewise do not
>          support comparisons with zero.  */
>       case LEU:
>       case LTU:
> -      emit_insn (gen_neon_vc (swap_condition (code), cmp_mode,
> -			      target, force_reg (cmp_mode, op1), op0));
> +      if (TARGET_HAVE_MVE) {
> +	rtx vpr_p0;
> +	if (vcond_mve)
> +	  vpr_p0 = target;
> +	else
> +	  vpr_p0 = gen_reg_rtx (HImode);
> +
> +	emit_insn (gen_mve_vcmpq (swap_condition (code), cmp_mode, vpr_p0, force_reg (cmp_mode, op1), op0));
> +	if (!vcond_mve) {
> +	  rtx zero = gen_reg_rtx (cmp_result_mode);
> +	  rtx one = gen_reg_rtx (cmp_result_mode);
> +	  emit_move_insn (zero, CONST0_RTX (cmp_result_mode));
> +	  emit_move_insn (one, CONST1_RTX (cmp_result_mode));
> +	  emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0));
> +	}
> +      }
> +      else
> +	emit_insn (gen_neon_vc (swap_condition (code), cmp_mode,
> +				target, force_reg (cmp_mode, op1), op0));
>         return inverted;
>   
>       /* These need a combination of two comparisons.  */
> @@ -31021,8 +31103,8 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
>   	rtx gt_res = gen_reg_rtx (cmp_result_mode);
>   	rtx alt_res = gen_reg_rtx (cmp_result_mode);
>   	rtx_code alt_code = (code == LTGT ? LT : LE);
> -	if (arm_expand_vector_compare (gt_res, GT, op0, op1, true)
> -	    || arm_expand_vector_compare (alt_res, alt_code, op0, op1, true))
> +	if (arm_expand_vector_compare (gt_res, GT, op0, op1, true, vcond_mve)
> +	    || arm_expand_vector_compare (alt_res, alt_code, op0, op1, true, vcond_mve))
>   	  gcc_unreachable ();
>   	emit_insn (gen_rtx_SET (target, gen_rtx_IOR (cmp_result_mode,
>   						     gt_res, alt_res)));
> @@ -31040,13 +31122,50 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
>   void
>   arm_expand_vcond (rtx *operands, machine_mode cmp_result_mode)
>   {
> -  rtx mask = gen_reg_rtx (cmp_result_mode);
> +  /* When expanding for MVE, we do not want to emit a (useless) vpsel in
> +     arm_expand_vector_compare, and another one here.  */
> +  bool vcond_mve=false;
> +  rtx mask;
> +
> +  if (TARGET_HAVE_MVE)
> +    {
> +      vcond_mve=true;
> +      mask = gen_reg_rtx (HImode);
> +    }
> +  else
> +    mask = gen_reg_rtx (cmp_result_mode);
> +
>     bool inverted = arm_expand_vector_compare (mask, GET_CODE (operands[3]),
> -					     operands[4], operands[5], true);
> +					     operands[4], operands[5], true, vcond_mve);
>     if (inverted)
>       std::swap (operands[1], operands[2]);
> +  if (TARGET_NEON)
>     emit_insn (gen_neon_vbsl (GET_MODE (operands[0]), operands[0],
>   			    mask, operands[1], operands[2]));
> +  else
> +    {
> +      machine_mode cmp_mode = GET_MODE (operands[4]);
> +      rtx vpr_p0 = mask;
> +      rtx zero = gen_reg_rtx (cmp_mode);
> +      rtx one = gen_reg_rtx (cmp_mode);
> +      emit_move_insn (zero, CONST0_RTX (cmp_mode));
> +      emit_move_insn (one, CONST1_RTX (cmp_mode));
> +      switch (cmp_mode)
> +	{
> +	case E_V16QImode:
> +	case E_V8HImode:
> +	case E_V4SImode:
> +	  emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, operands[0], one, zero, vpr_p0));
> +	  break;
> +	case E_V8HFmode:
> +	case E_V4SFmode:
> +	  if (TARGET_HAVE_MVE_FLOAT)
> +	    emit_insn (gen_mve_vpselq_f (cmp_mode, operands[0], one, zero, vpr_p0));
> +	  break;
> +	default:
> +	  gcc_unreachable ();
> +	}
> +    }
>   }
>   \f
>   #define MAX_VECT_LEN 16
> diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
> index 95df8bd..a128465 100644
> --- a/gcc/config/arm/iterators.md
> +++ b/gcc/config/arm/iterators.md
> @@ -1288,12 +1288,11 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U "u") (VREV16Q_S "s")
>   		       (VCREATEQ_U "u") (VCREATEQ_S "s") (VSHRQ_N_S "s")
>   		       (VSHRQ_N_U "u") (VCVTQ_N_FROM_F_S "s") (VSHLQ_U "u")
>   		       (VCVTQ_N_FROM_F_U "u") (VADDLVQ_P_S "s") (VSHLQ_S "s")
> -		       (VADDLVQ_P_U "u") (VCMPNEQ_S "s")
> +		       (VADDLVQ_P_U "u")
>   		       (VABDQ_M_S "s") (VABDQ_M_U "u") (VABDQ_S "s")
>   		       (VABDQ_U "u") (VADDQ_N_S "s") (VADDQ_N_U "u")
>   		       (VADDVQ_P_S "s")	(VADDVQ_P_U "u") (VBRSRQ_N_S "s")
> -		       (VBRSRQ_N_U "u") (VCMPEQQ_S "s")
> -		       (VCMPEQQ_N_S "s") (VCMPNEQ_N_S "s")
> +		       (VBRSRQ_N_U "u")
>   		       (VHADDQ_N_S "s") (VHADDQ_N_U "u") (VHADDQ_S "s")
>   		       (VHADDQ_U "u") (VHSUBQ_N_S "s")	(VHSUBQ_N_U "u")
>   		       (VHSUBQ_S "s") (VMAXQ_S "s") (VMAXQ_U "u") (VHSUBQ_U "u")
> @@ -1549,16 +1548,12 @@ (define_int_iterator VCREATEQ [VCREATEQ_U VCREATEQ_S])
>   (define_int_iterator VSHRQ_N [VSHRQ_N_S VSHRQ_N_U])
>   (define_int_iterator VCVTQ_N_FROM_F [VCVTQ_N_FROM_F_S VCVTQ_N_FROM_F_U])
>   (define_int_iterator VADDLVQ_P [VADDLVQ_P_S VADDLVQ_P_U])
> -(define_int_iterator VCMPNEQ [VCMPNEQ_S])
>   (define_int_iterator VSHLQ [VSHLQ_S VSHLQ_U])
>   (define_int_iterator VABDQ [VABDQ_S VABDQ_U])
>   (define_int_iterator VADDQ_N [VADDQ_N_S VADDQ_N_U])
>   (define_int_iterator VADDVAQ [VADDVAQ_S VADDVAQ_U])
>   (define_int_iterator VADDVQ_P [VADDVQ_P_U VADDVQ_P_S])
>   (define_int_iterator VBRSRQ_N [VBRSRQ_N_U VBRSRQ_N_S])
> -(define_int_iterator VCMPEQQ [VCMPEQQ_S])
> -(define_int_iterator VCMPEQQ_N [VCMPEQQ_N_S])
> -(define_int_iterator VCMPNEQ_N [VCMPNEQ_N_S])
>   (define_int_iterator VHADDQ [VHADDQ_S VHADDQ_U])
>   (define_int_iterator VHADDQ_N [VHADDQ_N_U VHADDQ_N_S])
>   (define_int_iterator VHSUBQ [VHSUBQ_S VHSUBQ_U])
> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> index 7c846a4..97f0a87 100644
> --- a/gcc/config/arm/mve.md
> +++ b/gcc/config/arm/mve.md
> @@ -838,7 +838,7 @@ (define_insn "mve_vaddlvq_p_<supf>v4si"
>   ;;
>   ;; [vcmpneq_, vcmpcsq_, vcmpeqq_, vcmpgeq_, vcmpgtq_, vcmphiq_, vcmpleq_, vcmpltq_])
>   ;;
> -(define_insn "mve_vcmp<mve_cmp_op>q_<mode>"
> +(define_insn "@mve_vcmp<mve_cmp_op>q_<mode>"
>     [
>      (set (match_operand:HI 0 "vpr_register_operand" "=Up")
>   	(MVE_COMPARISONS:HI (match_operand:MVE_2 1 "s_register_operand" "w")
> @@ -1928,7 +1928,7 @@ (define_insn "mve_vcaddq<mve_rot><mode>"
>   ;;
>   ;; [vcmpeqq_f, vcmpgeq_f, vcmpgtq_f, vcmpleq_f, vcmpltq_f, vcmpneq_f])
>   ;;
> -(define_insn "mve_vcmp<mve_cmp_op>q_f<mode>"
> +(define_insn "@mve_vcmp<mve_cmp_op>q_f<mode>"
>     [
>      (set (match_operand:HI 0 "vpr_register_operand" "=Up")
>   	(MVE_FP_COMPARISONS:HI (match_operand:MVE_0 1 "s_register_operand" "w")
> @@ -1942,7 +1942,7 @@ (define_insn "mve_vcmp<mve_cmp_op>q_f<mode>"
>   ;;
>   ;; [vcmpeqq_n_f, vcmpgeq_n_f, vcmpgtq_n_f, vcmpleq_n_f, vcmpltq_n_f, vcmpneq_n_f])
>   ;;
> -(define_insn "mve_vcmp<mve_cmp_op>q_n_f<mode>"
> +(define_insn "@mve_vcmp<mve_cmp_op>q_n_f<mode>"
>     [
>      (set (match_operand:HI 0 "vpr_register_operand" "=Up")
>   	(MVE_FP_COMPARISONS:HI (match_operand:MVE_0 1 "s_register_operand" "w")
> @@ -3307,7 +3307,7 @@ (define_insn "mve_vnegq_m_s<mode>"
>   ;;
>   ;; [vpselq_u, vpselq_s])
>   ;;
> -(define_insn "mve_vpselq_<supf><mode>"
> +(define_insn "@mve_vpselq_<supf><mode>"
>     [
>      (set (match_operand:MVE_1 0 "s_register_operand" "=w")
>   	(unspec:MVE_1 [(match_operand:MVE_1 1 "s_register_operand" "w")
> @@ -4402,7 +4402,7 @@ (define_insn "mve_vorrq_m_n_<supf><mode>"
>   ;;
>   ;; [vpselq_f])
>   ;;
> -(define_insn "mve_vpselq_f<mode>"
> +(define_insn "@mve_vpselq_f<mode>"
>     [
>      (set (match_operand:MVE_0 0 "s_register_operand" "=w")
>   	(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w")
> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index fec2cc9..6660846 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -1416,93 +1416,6 @@ (define_insn "*us_sub<mode>_neon"
>     [(set_attr "type" "neon_qsub<q>")]
>   )
>   
> -(define_expand "vec_cmp<mode><v_cmp_result>"
> -  [(set (match_operand:<V_cmp_result> 0 "s_register_operand")
> -	(match_operator:<V_cmp_result> 1 "comparison_operator"
> -	  [(match_operand:VDQW 2 "s_register_operand")
> -	   (match_operand:VDQW 3 "reg_or_zero_operand")]))]
> -  "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> -{
> -  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
> -			     operands[2], operands[3], false);
> -  DONE;
> -})
> -
> -(define_expand "vec_cmpu<mode><mode>"
> -  [(set (match_operand:VDQIW 0 "s_register_operand")
> -	(match_operator:VDQIW 1 "comparison_operator"
> -	  [(match_operand:VDQIW 2 "s_register_operand")
> -	   (match_operand:VDQIW 3 "reg_or_zero_operand")]))]
> -  "TARGET_NEON"
> -{
> -  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
> -			     operands[2], operands[3], false);
> -  DONE;
> -})
> -
> -;; Conditional instructions.  These are comparisons with conditional moves for
> -;; vectors.  They perform the assignment:
> -;;
> -;;     Vop0 = (Vop4 <op3> Vop5) ? Vop1 : Vop2;
> -;;
> -;; where op3 is <, <=, ==, !=, >= or >.  Operations are performed
> -;; element-wise.
> -
> -(define_expand "vcond<mode><mode>"
> -  [(set (match_operand:VDQW 0 "s_register_operand")
> -	(if_then_else:VDQW
> -	  (match_operator 3 "comparison_operator"
> -	    [(match_operand:VDQW 4 "s_register_operand")
> -	     (match_operand:VDQW 5 "reg_or_zero_operand")])
> -	  (match_operand:VDQW 1 "s_register_operand")
> -	  (match_operand:VDQW 2 "s_register_operand")))]
> -  "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> -{
> -  arm_expand_vcond (operands, <V_cmp_result>mode);
> -  DONE;
> -})
> -
> -(define_expand "vcond<V_cvtto><mode>"
> -  [(set (match_operand:<V_CVTTO> 0 "s_register_operand")
> -	(if_then_else:<V_CVTTO>
> -	  (match_operator 3 "comparison_operator"
> -	    [(match_operand:V32 4 "s_register_operand")
> -	     (match_operand:V32 5 "reg_or_zero_operand")])
> -	  (match_operand:<V_CVTTO> 1 "s_register_operand")
> -	  (match_operand:<V_CVTTO> 2 "s_register_operand")))]
> -  "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> -{
> -  arm_expand_vcond (operands, <V_cmp_result>mode);
> -  DONE;
> -})
> -
> -(define_expand "vcondu<mode><v_cmp_result>"
> -  [(set (match_operand:VDQW 0 "s_register_operand")
> -	(if_then_else:VDQW
> -	  (match_operator 3 "arm_comparison_operator"
> -	    [(match_operand:<V_cmp_result> 4 "s_register_operand")
> -	     (match_operand:<V_cmp_result> 5 "reg_or_zero_operand")])
> -	  (match_operand:VDQW 1 "s_register_operand")
> -	  (match_operand:VDQW 2 "s_register_operand")))]
> -  "TARGET_NEON"
> -{
> -  arm_expand_vcond (operands, <V_cmp_result>mode);
> -  DONE;
> -})
> -
> -(define_expand "vcond_mask_<mode><v_cmp_result>"
> -  [(set (match_operand:VDQW 0 "s_register_operand")
> -	(if_then_else:VDQW
> -	  (match_operand:<V_cmp_result> 3 "s_register_operand")
> -	  (match_operand:VDQW 1 "s_register_operand")
> -	  (match_operand:VDQW 2 "s_register_operand")))]
> -  "TARGET_NEON"
> -{
> -  emit_insn (gen_neon_vbsl<mode> (operands[0], operands[3], operands[1],
> -				  operands[2]));
> -  DONE;
> -})
> -
>   ;; Patterns for builtins.
>   
>   ; good for plain vadd, vaddq.
> diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
> index 07ca53b..0778db1 100644
> --- a/gcc/config/arm/unspecs.md
> +++ b/gcc/config/arm/unspecs.md
> @@ -596,8 +596,6 @@ (define_c_enum "unspec" [
>     VCVTQ_N_FROM_F_U
>     VADDLVQ_P_S
>     VADDLVQ_P_U
> -  VCMPNEQ_U
> -  VCMPNEQ_S
>     VSHLQ_S
>     VSHLQ_U
>     VABDQ_S
> @@ -605,9 +603,6 @@ (define_c_enum "unspec" [
>     VADDVAQ_S
>     VADDVQ_P_S
>     VBRSRQ_N_S
> -  VCMPEQQ_S
> -  VCMPEQQ_N_S
> -  VCMPNEQ_N_S
>     VHADDQ_S
>     VHADDQ_N_S
>     VHSUBQ_S
> @@ -645,9 +640,6 @@ (define_c_enum "unspec" [
>     VADDVAQ_U
>     VADDVQ_P_U
>     VBRSRQ_N_U
> -  VCMPEQQ_U
> -  VCMPEQQ_N_U
> -  VCMPNEQ_N_U
>     VHADDQ_U
>     VHADDQ_N_U
>     VHSUBQ_U
> @@ -680,14 +672,6 @@ (define_c_enum "unspec" [
>     VSHLQ_R_U
>     VSUBQ_U
>     VSUBQ_N_U
> -  VCMPGEQ_N_S
> -  VCMPGEQ_S
> -  VCMPGTQ_N_S
> -  VCMPGTQ_S
> -  VCMPLEQ_N_S
> -  VCMPLEQ_S
> -  VCMPLTQ_N_S
> -  VCMPLTQ_S
>     VHCADDQ_ROT270_S
>     VHCADDQ_ROT90_S
>     VMAXAQ_S
> @@ -702,10 +686,6 @@ (define_c_enum "unspec" [
>     VQRDMULHQ_N_S
>     VQRDMULHQ_S
>     VQSHLUQ_N_S
> -  VCMPCSQ_N_U
> -  VCMPCSQ_U
> -  VCMPHIQ_N_U
> -  VCMPHIQ_U
>     VABDQ_M_S
>     VABDQ_M_U
>     VABDQ_F
> diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> index 0b2b3b1..034b48b 100644
> --- a/gcc/config/arm/vec-common.md
> +++ b/gcc/config/arm/vec-common.md
> @@ -362,3 +362,110 @@ (define_expand "vlshr<mode>3"
>         DONE;
>       }
>   })
> +
> +(define_expand "vec_cmp<mode><v_cmp_result>"
> +  [(set (match_operand:<V_cmp_result> 0 "s_register_operand")
> +	(match_operator:<V_cmp_result> 1 "comparison_operator"
> +	  [(match_operand:VDQW 2 "s_register_operand")
> +	   (match_operand:VDQW 3 "reg_or_zero_operand")]))]
> +  "ARM_HAVE_<MODE>_ARITH
> +   && !TARGET_REALLY_IWMMXT
> +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> +{
> +  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
> +			     operands[2], operands[3], false, false);
> +  DONE;
> +})
> +
> +(define_expand "vec_cmpu<mode><mode>"
> +  [(set (match_operand:VDQIW 0 "s_register_operand")
> +	(match_operator:VDQIW 1 "comparison_operator"
> +	  [(match_operand:VDQIW 2 "s_register_operand")
> +	   (match_operand:VDQIW 3 "reg_or_zero_operand")]))]
> +  "ARM_HAVE_<MODE>_ARITH
> +   && !TARGET_REALLY_IWMMXT"
> +{
> +  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
> +			     operands[2], operands[3], false, false);
> +  DONE;
> +})
> +
> +;; Conditional instructions.  These are comparisons with conditional moves for
> +;; vectors.  They perform the assignment:
> +;;
> +;;     Vop0 = (Vop4 <op3> Vop5) ? Vop1 : Vop2;
> +;;
> +;; where op3 is <, <=, ==, !=, >= or >.  Operations are performed
> +;; element-wise.
> +
> +(define_expand "vcond<mode><mode>"
> +  [(set (match_operand:VDQW 0 "s_register_operand")
> +	(if_then_else:VDQW
> +	  (match_operator 3 "comparison_operator"
> +	    [(match_operand:VDQW 4 "s_register_operand")
> +	     (match_operand:VDQW 5 "reg_or_zero_operand")])
> +	  (match_operand:VDQW 1 "s_register_operand")
> +	  (match_operand:VDQW 2 "s_register_operand")))]
> +  "ARM_HAVE_<MODE>_ARITH
> +   && !TARGET_REALLY_IWMMXT
> +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> +{
> +  arm_expand_vcond (operands, <V_cmp_result>mode);
> +  DONE;
> +})
> +
> +(define_expand "vcond<V_cvtto><mode>"
> +  [(set (match_operand:<V_CVTTO> 0 "s_register_operand")
> +	(if_then_else:<V_CVTTO>
> +	  (match_operator 3 "comparison_operator"
> +	    [(match_operand:V32 4 "s_register_operand")
> +	     (match_operand:V32 5 "reg_or_zero_operand")])
> +	  (match_operand:<V_CVTTO> 1 "s_register_operand")
> +	  (match_operand:<V_CVTTO> 2 "s_register_operand")))]
> +  "ARM_HAVE_<MODE>_ARITH
> +   && !TARGET_REALLY_IWMMXT
> +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> +{
> +  arm_expand_vcond (operands, <V_cmp_result>mode);
> +  DONE;
> +})
> +
> +(define_expand "vcondu<mode><v_cmp_result>"
> +  [(set (match_operand:VDQW 0 "s_register_operand")
> +	(if_then_else:VDQW
> +	  (match_operator 3 "arm_comparison_operator"
> +	    [(match_operand:<V_cmp_result> 4 "s_register_operand")
> +	     (match_operand:<V_cmp_result> 5 "reg_or_zero_operand")])
> +	  (match_operand:VDQW 1 "s_register_operand")
> +	  (match_operand:VDQW 2 "s_register_operand")))]
> +  "ARM_HAVE_<MODE>_ARITH
> +   && !TARGET_REALLY_IWMMXT"
> +{
> +  arm_expand_vcond (operands, <V_cmp_result>mode);
> +  DONE;
> +})
> +
> +(define_expand "vcond_mask_<mode><v_cmp_result>"
> +  [(set (match_operand:VDQW 0 "s_register_operand")
> +        (if_then_else:VDQW
> +          (match_operand:<V_cmp_result> 3 "s_register_operand")
> +          (match_operand:VDQW 1 "s_register_operand")
> +          (match_operand:VDQW 2 "s_register_operand")))]
> +  "ARM_HAVE_<MODE>_ARITH
> +   && !TARGET_REALLY_IWMMXT"
> +{
> +  if (TARGET_NEON)
> +    {
> +      emit_insn (gen_neon_vbsl (<MODE>mode, operands[0], operands[3],
> +                                operands[1], operands[2]));
> +    }
> +  else if (TARGET_HAVE_MVE)
> +    {
> +      emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0],
> +                                 operands[1], operands[2], operands[3]));
> +    }
> +  else
> +    gcc_unreachable ();
> +
> +  DONE;
> +})
> diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c
> new file mode 100644
> index 0000000..029c931
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c
> @@ -0,0 +1,80 @@
> +/* { dg-do assemble } */
> +/* { dg-require-effective-target arm_v8_1m_mve_ok } */
> +/* { dg-add-options arm_v8_1m_mve } */
> +/* { dg-additional-options "-O3" } */
> +
> +/* Integer tests.  */
> +
> +#define COMPARE_REG(NAME, OP, TYPE) \
> +  TYPE \
> +  cmp_##NAME##_##TYPE##_reg (TYPE a, TYPE b) \
> +  { \
> +    return a OP b; \
> +  }
> +
> +#define COMPARE_REG_AND_ZERO(NAME, OP, TYPE) \
> +  COMPARE_REG (NAME, OP, TYPE) \
> +  \
> +  TYPE \
> +  cmp_##NAME##_##TYPE##_zero (TYPE a) \
> +  { \
> +    return a OP (TYPE) {}; \
> +  }
> +
> +#define COMPARE_TYPE(TYPE, COMPARE_ORDERED) \
> +  COMPARE_REG_AND_ZERO (eq, ==, TYPE) \
> +  COMPARE_REG_AND_ZERO (ne, !=, TYPE) \
> +  COMPARE_ORDERED (lt, <, TYPE) \
> +  COMPARE_ORDERED (le, <=, TYPE) \
> +  COMPARE_ORDERED (gt, >, TYPE) \
> +  COMPARE_ORDERED (ge, >=, TYPE)
> +
> +#define TEST_TYPE(NAME, ELEM, COMPARE_ORDERED, SIZE)  \
> +  typedef ELEM NAME##SIZE __attribute__((vector_size(SIZE))); \
> +  COMPARE_TYPE (NAME##SIZE, COMPARE_ORDERED)
> +
> +/* 64-bits vectors, not vectorized.  */
> +TEST_TYPE (vs8, __INT8_TYPE__, COMPARE_REG_AND_ZERO, 8)
> +TEST_TYPE (vu8, __UINT8_TYPE__, COMPARE_REG, 8)
> +TEST_TYPE (vs16, __INT16_TYPE__, COMPARE_REG_AND_ZERO, 8)
> +TEST_TYPE (vu16, __UINT16_TYPE__, COMPARE_REG, 8)
> +TEST_TYPE (vs32, __INT32_TYPE__, COMPARE_REG_AND_ZERO, 8)
> +TEST_TYPE (vu32, __UINT32_TYPE__, COMPARE_REG, 8)
> +
> +/* 128-bits vectors.  */
> +TEST_TYPE (vs8, __INT8_TYPE__, COMPARE_REG_AND_ZERO, 16)
> +TEST_TYPE (vu8, __UINT8_TYPE__, COMPARE_REG, 16)
> +TEST_TYPE (vs16, __INT16_TYPE__, COMPARE_REG_AND_ZERO, 16)
> +TEST_TYPE (vu16, __UINT16_TYPE__, COMPARE_REG, 16)
> +TEST_TYPE (vs32, __INT32_TYPE__, COMPARE_REG_AND_ZERO, 16)
> +TEST_TYPE (vu32, __UINT32_TYPE__, COMPARE_REG, 16)
> +
> +/* { 8 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
> +/* { dg-final { scan-assembler-times {\tvcmp.i8  eq, q[0-9]+, q[0-9]+\n} 4 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.i8  ne, q[0-9]+, q[0-9]+\n} 4 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.s8  lt, q[0-9]+, q[0-9]+\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.s8  le, q[0-9]+, q[0-9]+\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.s8  gt, q[0-9]+, q[0-9]+\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.s8  ge, q[0-9]+, q[0-9]+\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.u8  hi, q[0-9]+, q[0-9]+\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.u8  cs, q[0-9]+, q[0-9]+\n} 2 } } */
> +
> +/* { 16 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
> +/* { dg-final { scan-assembler-times {\tvcmp.i16  eq, q[0-9]+, q[0-9]+\n} 4 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.i16  ne, q[0-9]+, q[0-9]+\n} 4 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.s16  lt, q[0-9]+, q[0-9]+\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.s16  le, q[0-9]+, q[0-9]+\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.s16  gt, q[0-9]+, q[0-9]+\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.s16  ge, q[0-9]+, q[0-9]+\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.u16  hi, q[0-9]+, q[0-9]+\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.u16  cs, q[0-9]+, q[0-9]+\n} 2 } } */
> +
> +/* { 32 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
> +/* { dg-final { scan-assembler-times {\tvcmp.i32  eq, q[0-9]+, q[0-9]+\n} 4 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.i32  ne, q[0-9]+, q[0-9]+\n} 4 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.s32  lt, q[0-9]+, q[0-9]+\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.s32  le, q[0-9]+, q[0-9]+\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.s32  gt, q[0-9]+, q[0-9]+\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.s32  ge, q[0-9]+, q[0-9]+\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.u32  hi, q[0-9]+, q[0-9]+\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.u32  cs, q[0-9]+, q[0-9]+\n} 2 } } */
> diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c
> new file mode 100644
> index 0000000..8515195
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c
> @@ -0,0 +1,38 @@
> +/* { dg-do assemble } */
> +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> +/* { dg-add-options arm_v8_1m_mve_fp } */
> +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
> +
> +/* float 32 tests.  */
> +
> +#ifndef ELEM_TYPE
> +#define ELEM_TYPE float
> +#endif
> +#ifndef INT_ELEM_TYPE
> +#define INT_ELEM_TYPE __INT32_TYPE__
> +#endif
> +
> +#define COMPARE(NAME, OP)			\
> +  int_vec					\
> +  cmp_##NAME##_reg (vec a, vec b)		\
> +  {						\
> +    return a OP b;				\
> +  }
> +
> +typedef INT_ELEM_TYPE int_vec __attribute__((vector_size(16)));
> +typedef ELEM_TYPE vec __attribute__((vector_size(16)));
> +
> +COMPARE (eq, ==)
> +COMPARE (ne, !=)
> +COMPARE (lt, <)
> +COMPARE (le, <=)
> +COMPARE (gt, >)
> +COMPARE (ge, >=)
> +
> +/* eq, ne, lt, le, gt, ge.  */
> +/* { dg-final { scan-assembler-times {\tvcmp.f32\teq, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.f32\tne, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.f32\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.f32\tle, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.f32\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.f32\tge, q[0-9]+, q[0-9]+\n} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c
> new file mode 100644
> index 0000000..7774972
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c
> @@ -0,0 +1,69 @@
> +/* { dg-do assemble } */
> +/* { dg-require-effective-target arm_v8_1m_mve_ok } */
> +/* { dg-add-options arm_v8_1m_mve } */
> +/* { dg-additional-options "-O3" } */
> +
> +#define COMPARE_REG(NAME, OP, TYPE, SCALAR)	  \
> +  TYPE						  \
> +  cmp_##NAME##_##TYPE##_scalar (TYPE a, SCALAR b) \
> +  {						  \
> +    return a OP b;				  \
> +  }
> +
> +#define COMPARE_TYPE(SCALAR, TYPE)				\
> +  COMPARE_REG (eq, ==, TYPE, SCALAR)				\
> +  COMPARE_REG (ne, !=, TYPE, SCALAR)				\
> +  COMPARE_REG (lt, <, TYPE, SCALAR)				\
> +  COMPARE_REG (le, <=, TYPE, SCALAR)				\
> +  COMPARE_REG (gt, >, TYPE, SCALAR)				\
> +  COMPARE_REG (ge, >=, TYPE, SCALAR)
> +
> +#define TEST_TYPE(NAME, ELEM, SIZE)			      \
> +  typedef ELEM NAME##SIZE __attribute__((vector_size(SIZE))); \
> +  COMPARE_TYPE (ELEM, NAME##SIZE)
> +
> +/* 64-bit vectors, not vectorized.  */
> +TEST_TYPE (vs8, __INT8_TYPE__, 8)
> +TEST_TYPE (vu8, __UINT8_TYPE__, 8)
> +TEST_TYPE (vs16, __INT16_TYPE__, 8)
> +TEST_TYPE (vu16, __UINT16_TYPE__, 8)
> +TEST_TYPE (vs32, __INT32_TYPE__, 8)
> +TEST_TYPE (vu32, __UINT32_TYPE__, 8)
> +
> +/* 128-bit vectors.  */
> +TEST_TYPE (vs8, __INT8_TYPE__, 16)
> +TEST_TYPE (vu8, __UINT8_TYPE__, 16)
> +TEST_TYPE (vs16, __INT16_TYPE__, 16)
> +TEST_TYPE (vu16, __UINT16_TYPE__, 16)
> +TEST_TYPE (vs32, __INT32_TYPE__, 16)
> +TEST_TYPE (vu32, __UINT32_TYPE__, 16)
> +
> +/* { 8 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
> +/* { dg-final { scan-assembler-times {\tvcmp.i8  eq, q[0-9]+, q[0-9]+\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.i8  ne, q[0-9]+, q[0-9]+\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.s8  lt, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.s8  le, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.s8  gt, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.s8  ge, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.u8  hi, q[0-9]+, q[0-9]+\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.u8  cs, q[0-9]+, q[0-9]+\n} 2 } } */
> +
> +/* { 16 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
> +/* { dg-final { scan-assembler-times {\tvcmp.i16  eq, q[0-9]+, q[0-9]+\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.i16  ne, q[0-9]+, q[0-9]+\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.s16  lt, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.s16  le, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.s16  gt, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.s16  ge, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.u16  hi, q[0-9]+, q[0-9]+\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.u16  cs, q[0-9]+, q[0-9]+\n} 2 } } */
> +
> +/* { 32 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
> +/* { dg-final { scan-assembler-times {\tvcmp.i32  eq, q[0-9]+, q[0-9]+\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.i32  ne, q[0-9]+, q[0-9]+\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.s32  lt, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.s32  le, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.s32  gt, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.s32  ge, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.u32  hi, q[0-9]+, q[0-9]+\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.u32  cs, q[0-9]+, q[0-9]+\n} 2 } } */
> diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c
> new file mode 100644
> index 0000000..4ed449e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c
> @@ -0,0 +1,30 @@
> +/* { dg-do assemble } */
> +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> +/* { dg-add-options arm_v8_1m_mve_fp } */
> +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
> +
> +#include <stdint.h>
> +
> +#define NB 4
> +
> +#define FUNC(OP, NAME)							\
> +  void test_ ## NAME ##_f (float * __restrict__ dest, float *a, float *b) { \
> +    int i;								\
> +    for (i=0; i<NB; i++) {						\
> +      dest[i] = a[i] OP b[i];						\
> +    }									\
> +  }
> +
> +FUNC(==, vcmpeq)
> +FUNC(!=, vcmpne)
> +FUNC(<, vcmplt)
> +FUNC(<=, vcmple)
> +FUNC(>, vcmpgt)
> +FUNC(>=, vcmpge)
> +
> +/* { dg-final { scan-assembler-times {\tvcmp.f32\teq, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.f32\tne, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.f32\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.f32\tle, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.f32\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.f32\tge, q[0-9]+, q[0-9]+\n} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c
> new file mode 100644
> index 0000000..8da15e7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c
> @@ -0,0 +1,50 @@
> +/* { dg-do assemble } */
> +/* { dg-require-effective-target arm_v8_1m_mve_ok } */
> +/* { dg-add-options arm_v8_1m_mve } */
> +/* { dg-additional-options "-O3" } */
> +
> +#include <stdint.h>
> +
> +#define FUNC(SIGN, TYPE, BITS, NB, OP, NAME)				\
> +  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t * __restrict__ dest, TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \
> +    int i;								\
> +    for (i=0; i<NB; i++) {						\
> +      dest[i] = a[i] OP b[i];						\
> +    }									\
> +}
> +
> +#define ALL_FUNCS(OP, NAME) \
> +  FUNC(s, int, 32, 2, OP, NAME)			\
> +  FUNC(u, uint, 32, 2, OP, NAME)		\
> +  FUNC(s, int, 16, 4, OP, NAME)			\
> +  FUNC(u, uint, 16, 4, OP, NAME)		\
> +  FUNC(s, int, 8, 8, OP, NAME)			\
> +  FUNC(u, uint, 8, 8, OP, NAME)			\
> +  FUNC(s, int, 32, 4, OP, NAME)			\
> +  FUNC(u, uint, 32, 4, OP, NAME)		\
> +  FUNC(s, int, 16, 8, OP, NAME)			\
> +  FUNC(u, uint, 16, 8, OP, NAME)		\
> +  FUNC(s, int, 8, 16, OP, NAME)			\
> +  FUNC(u, uint, 8, 16, OP, NAME)
> +
> +ALL_FUNCS(==, vcmpeq)
> +ALL_FUNCS(!=, vcmpne)
> +ALL_FUNCS(<, vcmplt)
> +ALL_FUNCS(<=, vcmple)
> +ALL_FUNCS(>, vcmpgt)
> +ALL_FUNCS(>=, vcmpge)
> +
> +/* MVE has only 128-bit vectors, so we can vectorize only half of the
> +   functions above.  */
> +/* { dg-final { scan-assembler-times {\tvcmp.i[0-9]+  eq, q[0-9]+, q[0-9]+\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.i[0-9]+  ne, q[0-9]+, q[0-9]+\n} 6 } } */
> +
> +/* lt, le, gt, ge apply to signed types, cs and hi to unsigned types.  */
> +/* lt and le with unsigned types are replaced with the opposite condition, hence
> +   the double number of matches for cs and hi.  */
> +/* { dg-final { scan-assembler-times {\tvcmp.s[0-9]+  lt, q[0-9]+, q[0-9]+\n} 3 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.s[0-9]+  le, q[0-9]+, q[0-9]+\n} 3 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.s[0-9]+  gt, q[0-9]+, q[0-9]+\n} 3 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.s[0-9]+  ge, q[0-9]+, q[0-9]+\n} 3 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.u[0-9]+  cs, q[0-9]+, q[0-9]+\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.u[0-9]+  hi, q[0-9]+, q[0-9]+\n} 6 } } */

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 7/9] arm: Auto-vectorization for MVE: add __fp16 support to VCMP
  2021-04-30 14:09 ` [PATCH 7/9] arm: Auto-vectorization for MVE: add __fp16 support to VCMP Christophe Lyon
@ 2021-05-04 11:48   ` Andre Vieira (lists)
  2021-05-04 13:43     ` Christophe Lyon
  0 siblings, 1 reply; 35+ messages in thread
From: Andre Vieira (lists) @ 2021-05-04 11:48 UTC (permalink / raw)
  To: Christophe Lyon, gcc-patches

It would be good to also add tests for NEON as you also enable auto-vec 
for it. I checked and I do think the necessary 'neon_vc' patterns exist 
for 'VH', so we should be OK there.

On 30/04/2021 15:09, Christophe Lyon via Gcc-patches wrote:
> This patch adds __fp16 support to the previous patch that added vcmp
> support with MVE. For this we update existing expanders to use VDQWH
> iterator, and add a new expander vcond<VH_cvtto><mode>.  In the
> process we need to create suitable iterators, and update v_cmp_result
> as needed.
>
> 2021-04-26  Christophe Lyon  <christophe.lyon@linaro.org>
>
> 	gcc/
> 	* config/arm/iterators.md (V16): New iterator.
> 	(VH_cvtto): New iterator.
> 	(v_cmp_result): Added V4HF and V8HF support.
> 	* config/arm/vec-common.md (vec_cmp<mode><v_cmp_result>): Use VDQWH.
> 	(vcond<mode><mode>): Likewise.
> 	(vcond_mask_<mode><v_cmp_result>): Likewise.
> 	(vcond<VH_cvtto><mode>): New expander.
>
> 	gcc/testsuite/
> 	* gcc.target/arm/simd/mve-compare-3.c: New test with GCC vectors.
> 	* gcc.target/arm/simd/mve-vcmp-f16.c: New test for
> 	auto-vectorization.
> ---
>   gcc/config/arm/iterators.md                       |  6 ++++
>   gcc/config/arm/vec-common.md                      | 40 ++++++++++++++++-------
>   gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c | 38 +++++++++++++++++++++
>   gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c  | 30 +++++++++++++++++
>   4 files changed, 102 insertions(+), 12 deletions(-)
>   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
>   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c
>
> diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
> index a128465..3042baf 100644
> --- a/gcc/config/arm/iterators.md
> +++ b/gcc/config/arm/iterators.md
> @@ -231,6 +231,9 @@ (define_mode_iterator VU [V16QI V8HI V4SI])
>   ;; Vector modes for 16-bit floating-point support.
>   (define_mode_iterator VH [V8HF V4HF])
>   
> +;; Modes with 16-bit elements only.
> +(define_mode_iterator V16 [V4HI V4HF V8HI V8HF])
> +
>   ;; 16-bit floating-point vector modes suitable for moving (includes BFmode).
>   (define_mode_iterator VHFBF [V8HF V4HF V4BF V8BF])
>   
> @@ -571,6 +574,8 @@ (define_mode_attr V_cvtto [(V2SI "v2sf") (V2SF "v2si")
>   ;; (Opposite) mode to convert to/from for vector-half mode conversions.
>   (define_mode_attr VH_CVTTO [(V4HI "V4HF") (V4HF "V4HI")
>   			    (V8HI "V8HF") (V8HF "V8HI")])
> +(define_mode_attr VH_cvtto [(V4HI "v4hf") (V4HF "v4hi")
> +			    (V8HI "v8hf") (V8HF "v8hi")])
>   
>   ;; Define element mode for each vector mode.
>   (define_mode_attr V_elem [(V8QI "QI") (V16QI "QI")
> @@ -720,6 +725,7 @@ (define_mode_attr V_cmp_result [(V8QI "V8QI") (V16QI "V16QI")
>   (define_mode_attr v_cmp_result [(V8QI "v8qi") (V16QI "v16qi")
>   				(V4HI "v4hi") (V8HI  "v8hi")
>   				(V2SI "v2si") (V4SI  "v4si")
> +				(V4HF "v4hi") (V8HF  "v8hi")
>   				(DI   "di")   (V2DI  "v2di")
>   				(V2SF "v2si") (V4SF  "v4si")])
>   
> diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> index 034b48b..3fd341c 100644
> --- a/gcc/config/arm/vec-common.md
> +++ b/gcc/config/arm/vec-common.md
> @@ -366,8 +366,8 @@ (define_expand "vlshr<mode>3"
>   (define_expand "vec_cmp<mode><v_cmp_result>"
>     [(set (match_operand:<V_cmp_result> 0 "s_register_operand")
>   	(match_operator:<V_cmp_result> 1 "comparison_operator"
> -	  [(match_operand:VDQW 2 "s_register_operand")
> -	   (match_operand:VDQW 3 "reg_or_zero_operand")]))]
> +	  [(match_operand:VDQWH 2 "s_register_operand")
> +	   (match_operand:VDQWH 3 "reg_or_zero_operand")]))]
>     "ARM_HAVE_<MODE>_ARITH
>      && !TARGET_REALLY_IWMMXT
>      && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> @@ -399,13 +399,13 @@ (define_expand "vec_cmpu<mode><mode>"
>   ;; element-wise.
>   
>   (define_expand "vcond<mode><mode>"
> -  [(set (match_operand:VDQW 0 "s_register_operand")
> -	(if_then_else:VDQW
> +  [(set (match_operand:VDQWH 0 "s_register_operand")
> +	(if_then_else:VDQWH
>   	  (match_operator 3 "comparison_operator"
> -	    [(match_operand:VDQW 4 "s_register_operand")
> -	     (match_operand:VDQW 5 "reg_or_zero_operand")])
> -	  (match_operand:VDQW 1 "s_register_operand")
> -	  (match_operand:VDQW 2 "s_register_operand")))]
> +	    [(match_operand:VDQWH 4 "s_register_operand")
> +	     (match_operand:VDQWH 5 "reg_or_zero_operand")])
> +	  (match_operand:VDQWH 1 "s_register_operand")
> +	  (match_operand:VDQWH 2 "s_register_operand")))]
>     "ARM_HAVE_<MODE>_ARITH
>      && !TARGET_REALLY_IWMMXT
>      && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> @@ -430,6 +430,22 @@ (define_expand "vcond<V_cvtto><mode>"
>     DONE;
>   })
>   
> +(define_expand "vcond<VH_cvtto><mode>"
> +  [(set (match_operand:<VH_CVTTO> 0 "s_register_operand")
> +	(if_then_else:<VH_CVTTO>
> +	  (match_operator 3 "comparison_operator"
> +	    [(match_operand:V16 4 "s_register_operand")
> +	     (match_operand:V16 5 "reg_or_zero_operand")])
> +	  (match_operand:<VH_CVTTO> 1 "s_register_operand")
> +	  (match_operand:<VH_CVTTO> 2 "s_register_operand")))]
> +  "ARM_HAVE_<MODE>_ARITH
> +   && !TARGET_REALLY_IWMMXT
> +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> +{
> +  arm_expand_vcond (operands, <V_cmp_result>mode);
> +  DONE;
> +})
> +
>   (define_expand "vcondu<mode><v_cmp_result>"
>     [(set (match_operand:VDQW 0 "s_register_operand")
>   	(if_then_else:VDQW
> @@ -446,11 +462,11 @@ (define_expand "vcondu<mode><v_cmp_result>"
>   })
>   
>   (define_expand "vcond_mask_<mode><v_cmp_result>"
> -  [(set (match_operand:VDQW 0 "s_register_operand")
> -        (if_then_else:VDQW
> +  [(set (match_operand:VDQWH 0 "s_register_operand")
> +        (if_then_else:VDQWH
>             (match_operand:<V_cmp_result> 3 "s_register_operand")
> -          (match_operand:VDQW 1 "s_register_operand")
> -          (match_operand:VDQW 2 "s_register_operand")))]
> +          (match_operand:VDQWH 1 "s_register_operand")
> +          (match_operand:VDQWH 2 "s_register_operand")))]
>     "ARM_HAVE_<MODE>_ARITH
>      && !TARGET_REALLY_IWMMXT"
>   {
> diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
> new file mode 100644
> index 0000000..76f81e8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
> @@ -0,0 +1,38 @@
> +/* { dg-do assemble } */
> +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> +/* { dg-add-options arm_v8_1m_mve_fp } */
> +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
> +
> +/* float 16 tests.  */
> +
> +#ifndef ELEM_TYPE
> +#define ELEM_TYPE __fp16
> +#endif
> +#ifndef INT_ELEM_TYPE
> +#define INT_ELEM_TYPE __INT16_TYPE__
> +#endif
> +
> +#define COMPARE(NAME, OP)			\
> +  int_vec					\
> +  cmp_##NAME##_reg (vec a, vec b)		\
> +  {						\
> +    return a OP b;				\
> +  }
> +
> +typedef INT_ELEM_TYPE int_vec __attribute__((vector_size(16)));
> +typedef ELEM_TYPE vec __attribute__((vector_size(16)));
> +
> +COMPARE (eq, ==)
> +COMPARE (ne, !=)
> +COMPARE (lt, <)
> +COMPARE (le, <=)
> +COMPARE (gt, >)
> +COMPARE (ge, >=)
> +
> +/* eq, ne, lt, le, gt, ge.  */
> +/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-9]+\n} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c
> new file mode 100644
> index 0000000..dbae2d1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c
> @@ -0,0 +1,30 @@
> +/* { dg-do assemble } */
> +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> +/* { dg-add-options arm_v8_1m_mve_fp } */
> +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
> +
> +#include <stdint.h>
> +
> +#define NB 8
> +
> +#define FUNC(OP, NAME)							\
> +  void test_ ## NAME ##_f (__fp16 * __restrict__ dest, __fp16 *a, __fp16 *b) { \
> +    int i;								\
> +    for (i=0; i<NB; i++) {						\
> +      dest[i] = a[i] OP b[i];						\
> +    }									\
> +  }
> +
> +FUNC(==, vcmpeq)
> +FUNC(!=, vcmpne)
> +FUNC(<, vcmplt)
> +FUNC(<=, vcmple)
> +FUNC(>, vcmpgt)
> +FUNC(>=, vcmpge)
> +
> +/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-9]+\n} 1 } } */

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 9/9] arm: Auto-vectorization for MVE: vld4/vst4
  2021-04-30 14:09 ` [PATCH 9/9] arm: Auto-vectorization for MVE: vld4/vst4 Christophe Lyon
@ 2021-05-04 12:03   ` Andre Vieira (lists)
  2021-05-04 14:57     ` Christophe Lyon
  2021-05-24 12:15   ` Kyrylo Tkachov
  1 sibling, 1 reply; 35+ messages in thread
From: Andre Vieira (lists) @ 2021-05-04 12:03 UTC (permalink / raw)
  To: gcc-patches

Hi Christophe,

The series LGTM but you'll need the approval of an arm port maintainer 
before committing. I only did code-review, did not try to build/run tests.

Kind regards,
Andre

On 30/04/2021 15:09, Christophe Lyon via Gcc-patches wrote:
> This patch enables MVE vld4/vst4 instructions for auto-vectorization.
> We move the existing expanders from neon.md and enable them for MVE,
> calling the respective emitter.
>
> 2021-03-12  Christophe Lyon  <christophe.lyon@linaro.org>
>
> 	gcc/
> 	* config/arm/neon.md (vec_load_lanesxi<mode>)
> 	(vec_store_lanesxi<mode>): Move ...
> 	* config/arm/vec-common.md: here.
>
> 	gcc/testsuite/
> 	* gcc.target/arm/simd/mve-vld4.c: New test, derived from
> 	slp-perm-3.c
> ---
>   gcc/config/arm/neon.md                       |  20 ----
>   gcc/config/arm/vec-common.md                 |  26 +++++
>   gcc/testsuite/gcc.target/arm/simd/mve-vld4.c | 140 +++++++++++++++++++++++++++
>   3 files changed, 166 insertions(+), 20 deletions(-)
>   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vld4.c
>
> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index bc8775c..fb58baf 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -5617,16 +5617,6 @@ (define_insn "neon_vld4<mode>"
>                       (const_string "neon_load4_4reg<q>")))]
>   )
>   
> -(define_expand "vec_load_lanesxi<mode>"
> -  [(match_operand:XI 0 "s_register_operand")
> -   (match_operand:XI 1 "neon_struct_operand")
> -   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> -  "TARGET_NEON"
> -{
> -  emit_insn (gen_neon_vld4<mode> (operands[0], operands[1]));
> -  DONE;
> -})
> -
>   (define_expand "neon_vld4<mode>"
>     [(match_operand:XI 0 "s_register_operand")
>      (match_operand:XI 1 "neon_struct_operand")
> @@ -5818,16 +5808,6 @@ (define_insn "neon_vst4<mode>"
>                       (const_string "neon_store4_4reg<q>")))]
>   )
>   
> -(define_expand "vec_store_lanesxi<mode>"
> -  [(match_operand:XI 0 "neon_struct_operand")
> -   (match_operand:XI 1 "s_register_operand")
> -   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> -  "TARGET_NEON"
> -{
> -  emit_insn (gen_neon_vst4<mode> (operands[0], operands[1]));
> -  DONE;
> -})
> -
>   (define_expand "neon_vst4<mode>"
>     [(match_operand:XI 0 "neon_struct_operand")
>      (match_operand:XI 1 "s_register_operand")
> diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> index 7abefea..d46b78d 100644
> --- a/gcc/config/arm/vec-common.md
> +++ b/gcc/config/arm/vec-common.md
> @@ -512,3 +512,29 @@ (define_expand "vec_store_lanesoi<mode>"
>       emit_insn (gen_mve_vst2q<mode> (operands[0], operands[1]));
>     DONE;
>   })
> +
> +(define_expand "vec_load_lanesxi<mode>"
> +  [(match_operand:XI 0 "s_register_operand")
> +   (match_operand:XI 1 "neon_struct_operand")
> +   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> +  "TARGET_NEON || TARGET_HAVE_MVE"
> +{
> +  if (TARGET_NEON)
> +    emit_insn (gen_neon_vld4<mode> (operands[0], operands[1]));
> +  else
> +    emit_insn (gen_mve_vld4q<mode> (operands[0], operands[1]));
> +  DONE;
> +})
> +
> +(define_expand "vec_store_lanesxi<mode>"
> +  [(match_operand:XI 0 "neon_struct_operand")
> +   (match_operand:XI 1 "s_register_operand")
> +   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> +  "TARGET_NEON || TARGET_HAVE_MVE"
> +{
> +  if (TARGET_NEON)
> +    emit_insn (gen_neon_vst4<mode> (operands[0], operands[1]));
> +  else
> +    emit_insn (gen_mve_vst4q<mode> (operands[0], operands[1]));
> +  DONE;
> +})
> diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vld4.c b/gcc/testsuite/gcc.target/arm/simd/mve-vld4.c
> new file mode 100644
> index 0000000..ce3e755
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vld4.c
> @@ -0,0 +1,140 @@
> +/* { dg-do assemble } */
> +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> +/* { dg-add-options arm_v8_1m_mve_fp } */
> +/* { dg-additional-options "-O3" } */
> +
> +#include <stdint.h>
> +
> +#define M00 100
> +#define M10 216
> +#define M20 23
> +#define M30 237
> +#define M01 1322
> +#define M11 13
> +#define M21 27271
> +#define M31 2280
> +#define M02 74
> +#define M12 191
> +#define M22 500
> +#define M32 111
> +#define M03 134
> +#define M13 117
> +#define M23 11
> +#define M33 771
> +
> +#define N 128
> +
> +/* Integer tests.  */
> +#define FUNC(SIGN, TYPE, BITS)						\
> +  void foo_##SIGN##BITS##x (TYPE##BITS##_t *__restrict__ pInput,	\
> +			    TYPE##BITS##_t *__restrict__ pOutput)	\
> +  {									\
> +    unsigned int i;							\
> +    TYPE##BITS##_t  a, b, c, d;						\
> +    									\
> +    for (i = 0; i < N / BITS; i++)					\
> +      {									\
> +	a = *pInput++;							\
> +	b = *pInput++;							\
> +	c = *pInput++;							\
> +	d = *pInput++;							\
> +									\
> +	*pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d;		\
> +	*pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d;		\
> +	*pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d;		\
> +	*pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d;		\
> +      }									\
> +  }
> +
> +FUNC(s, int, 8)
> +FUNC(u, uint, 8)
> +FUNC(s, int, 16)
> +FUNC(u, uint, 16)
> +FUNC(s, int, 32)
> +FUNC(u, uint, 32)
> +
> +/* float test, keep the macro because it's similar to the above, but does not
> +   need the ##BITS##_t.  */
> +#define FUNC_FLOAT(SIGN, TYPE, BITS)						\
> +  void foo_##SIGN##BITS##x (TYPE *__restrict__ pInput,			\
> +			    TYPE *__restrict__ pOutput)			\
> +  {									\
> +    unsigned int i;							\
> +    TYPE a, b, c, d;							\
> +    									\
> +    for (i = 0; i < N / BITS; i++)					\
> +      {									\
> +	a = *pInput++;							\
> +	b = *pInput++;							\
> +	c = *pInput++;							\
> +	d = *pInput++;							\
> +									\
> +	*pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d;		\
> +	*pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d;		\
> +	*pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d;		\
> +	*pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d;		\
> +      }									\
> +  }
> +
> +FUNC_FLOAT(f, float, 32)
> +
> +/* __fp16 test, needs explicit casts to avoid conversions to floating-point and
> +   failure to vectorize.  */
> +__fp16 M00_fp16 = 100.0f16;
> +__fp16 M10_fp16 = 216.0f16;
> +__fp16 M20_fp16 = 23.0f16;
> +__fp16 M30_fp16 = 237.0f16;
> +__fp16 M01_fp16 = 1322.0f16;
> +__fp16 M11_fp16 = 13.0f16;
> +__fp16 M21_fp16 = 27271.0f16;
> +__fp16 M31_fp16 = 2280.0f16;
> +__fp16 M02_fp16 = 74.0f16;
> +__fp16 M12_fp16 = 191.0f16;
> +__fp16 M22_fp16 = 500.0f16;
> +__fp16 M32_fp16 = 111.0f16;
> +__fp16 M03_fp16 = 134.0f16;
> +__fp16 M13_fp16 = 117.0f16;
> +__fp16 M23_fp16 = 11.0f16;
> +__fp16 M33_fp16 = 771.0f16;
> +
> +#define FUNC_FLOAT_FP16(SIGN, TYPE, BITS)				\
> +  void foo_##SIGN##BITS##x (TYPE *__restrict__ pInput,			\
> +			    TYPE *__restrict__ pOutput)			\
> +  {									\
> +    unsigned int i;							\
> +    TYPE a, b, c, d;							\
> +    									\
> +    for (i = 0; i < N / BITS; i++)					\
> +      {									\
> +	a = *pInput++;							\
> +	b = *pInput++;							\
> +	c = *pInput++;							\
> +	d = *pInput++;							\
> +									\
> +	TYPE ab, cd;							\
> +	ab = (__fp16)(M00_fp16 * a) + (__fp16)(M01_fp16 * b);		\
> +	cd = (__fp16)(M02_fp16 * c) + (__fp16)(M03_fp16 * d);		\
> +	*pOutput++ = ab + cd;						\
> +	ab = (__fp16)(M10_fp16 * a) + (__fp16)(M11_fp16 * b);		\
> +	cd = (__fp16)(M12_fp16 * c) + (__fp16)(M13_fp16 * d);		\
> +	*pOutput++ = ab + cd;						\
> +	ab = (__fp16)(M20_fp16 * a) + (__fp16)(M21_fp16 * b);		\
> +	cd = (__fp16)(M22_fp16 * c) + (__fp16)(M23_fp16 * d);		\
> +	*pOutput++ = ab + cd;						\
> +	ab = (__fp16)(M30_fp16 * a) + (__fp16)(M31_fp16 * b);		\
> +	cd = (__fp16)(M32_fp16 * c) + (__fp16)(M33_fp16 * d);		\
> +	*pOutput++ = ab + cd;						\
> +      }									\
> +  }
> +
> +FUNC_FLOAT_FP16(f, __fp16, 16)
> +
> +/* vld4X.8 is used for signed and unsigned chars: 2 * 4.  */
> +/* vld4X.16 is used for signed and unsigned shorts and __fp16: 3 * 4.  */
> +/* vld4X.32 is used for signed and unsigned ints and float: 3 * 4.  */
> +/* { dg-final { scan-assembler-times {vld4[0123].8\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 8 } } */
> +/* { dg-final { scan-assembler-times {vld4[0123].16\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 12 } } */
> +/* { dg-final { scan-assembler-times {vld4[0123].32\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 12 } } */
> +/* { dg-final { scan-assembler-times {vst4[0123].8\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 8 } } */
> +/* { dg-final { scan-assembler-times {vst4[0123].16\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 12 } } */
> +/* { dg-final { scan-assembler-times {vst4[0123].32\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 12 } } */

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 6/9] arm: Auto-vectorization for MVE: vcmp
  2021-05-04 11:29   ` Andre Vieira (lists)
@ 2021-05-04 13:41     ` Christophe Lyon
  2021-05-05 14:08       ` Christophe Lyon
  0 siblings, 1 reply; 35+ messages in thread
From: Christophe Lyon @ 2021-05-04 13:41 UTC (permalink / raw)
  To: Andre Vieira (lists); +Cc: gcc Patches

On Tue, 4 May 2021 at 13:29, Andre Vieira (lists)
<andre.simoesdiasvieira@arm.com> wrote:
>
> Hi Christophe,
>
> On 30/04/2021 15:09, Christophe Lyon via Gcc-patches wrote:
> > Since MVE has a different set of vector comparison operators from
> > Neon, we have to update the expansion to take into account the new
> > ones, for instance 'NE', for which MVE does not require using 'EQ'
> > with the inverted condition.
> >
> > Conversely, Neon supports comparisons with #0, MVE does not.
> >
> > For:
> > typedef long int vs32 __attribute__((vector_size(16)));
> > vs32 cmp_eq_vs32_reg (vs32 a, vs32 b) { return a == b; }
> >
> > we now generate:
> > cmp_eq_vs32_reg:
> >       vldr.64 d4, .L123       @ 8     [c=8 l=4]  *mve_movv4si/8
> >       vldr.64 d5, .L123+8
> >       vldr.64 d6, .L123+16    @ 9     [c=8 l=4]  *mve_movv4si/8
> >       vldr.64 d7, .L123+24
> >       vcmp.i32  eq, q0, q1    @ 7     [c=16 l=4]  mve_vcmpeqq_v4si
> >       vpsel q0, q3, q2        @ 15    [c=8 l=4]  mve_vpselq_sv4si
> >       bx      lr      @ 26    [c=8 l=4]  *thumb2_return
> > .L124:
> >       .align  3
> > .L123:
> >       .word   0
> >       .word   0
> >       .word   0
> >       .word   0
> >       .word   1
> >       .word   1
> >       .word   1
> >       .word   1
> >
> > For some reason emit_move_insn (zero, CONST0_RTX (cmp_mode)) produces
> > a pair of vldr instead of vmov.i32 qX, #0
> I think ideally we would even want:
> vpte  eq, q0, q1
> vmovt.i32 q0, #0
> vmove.i32 q0, #1
>
> But we don't have a way to generate VPT blocks with multiple
> instructions yet unfortunately so I guess VPSEL will have to do for now.

TBH, I looked at what LLVM generates currently ;-)

>
> >
> > 2021-03-01  Christophe Lyon  <christophe.lyon@linaro.org>
> >
> >       gcc/
> >       * config/arm/arm-protos.h (arm_expand_vector_compare): Update
> >       prototype.
> >       * config/arm/arm.c (arm_expand_vector_compare): Add support for
> >       MVE.
> >       (arm_expand_vcond): Likewise.
> >       * config/arm/iterators.md (supf): Remove VCMPNEQ_S, VCMPEQQ_S,
> >       VCMPEQQ_N_S, VCMPNEQ_N_S.
> >       (VCMPNEQ, VCMPEQQ, VCMPEQQ_N, VCMPNEQ_N): Remove.
> >       * config/arm/mve.md (@mve_vcmp<mve_cmp_op>q_<mode>): Add '@' prefix.
> >       (@mve_vcmp<mve_cmp_op>q_f<mode>): Likewise.
> >       (@mve_vcmp<mve_cmp_op>q_n_f<mode>): Likewise.
> >       (@mve_vpselq_<supf><mode>): Likewise.
> >       (@mve_vpselq_f<mode>): Likewise.
> >       * config/arm/neon.md (vec_cmp<mode><v_cmp_result>): Enable for MVE
> >       and move to vec-common.md.
> >       (vec_cmpu<mode><mode>): Likewise.
> >       (vcond<mode><mode>): Likewise.
> >       (vcond<V_cvtto><mode>): Likewise.
> >       (vcondu<mode><v_cmp_result>): Likewise.
> >       (vcond_mask_<mode><v_cmp_result>): Likewise.
> >       * config/arm/unspecs.md (VCMPNEQ_U, VCMPNEQ_S, VCMPEQQ_S)
> >       (VCMPEQQ_N_S, VCMPNEQ_N_S, VCMPEQQ_U, VCMPEQQ_N_U, VCMPNEQ_N_U)
> >       (VCMPGEQ_N_S, VCMPGEQ_S, VCMPGTQ_N_S, VCMPGTQ_S, VCMPLEQ_N_S)
> >       (VCMPLEQ_S, VCMPLTQ_N_S, VCMPLTQ_S, VCMPCSQ_N_U, VCMPCSQ_U)
> >       (VCMPHIQ_N_U, VCMPHIQ_U): Remove.
> >       * config/arm/vec-common.md (vec_cmp<mode><v_cmp_result>): Moved
> >       from neon.md.
> >       (vec_cmpu<mode><mode>): Likewise.
> >       (vcond<mode><mode>): Likewise.
> >       (vcond<V_cvtto><mode>): Likewise.
> >       (vcondu<mode><v_cmp_result>): Likewise.
> >       (vcond_mask_<mode><v_cmp_result>): Likewise.
> >
> >       gcc/testsuite
> >       * gcc.target/arm/simd/mve-compare-1.c: New test with GCC vectors.
> >       * gcc.target/arm/simd/mve-compare-2.c: New test with GCC vectors.
> >       * gcc.target/arm/simd/mve-compare-scalar-1.c: New test with GCC
> >       vectors.
> >       * gcc.target/arm/simd/mve-vcmp-f32.c: New test for
> >       auto-vectorization.
> >       * gcc.target/arm/simd/mve-vcmp.c: New test for auto-vectorization.
> >
> > add gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c
> > ---
> >   gcc/config/arm/arm-protos.h                        |   2 +-
> >   gcc/config/arm/arm.c                               | 211 ++++++++++++++++-----
> >   gcc/config/arm/iterators.md                        |   9 +-
> >   gcc/config/arm/mve.md                              |  10 +-
> >   gcc/config/arm/neon.md                             |  87 ---------
> >   gcc/config/arm/unspecs.md                          |  20 --
> >   gcc/config/arm/vec-common.md                       | 107 +++++++++++
> >   gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c  |  80 ++++++++
> >   gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c  |  38 ++++
> >   .../gcc.target/arm/simd/mve-compare-scalar-1.c     |  69 +++++++
> >   gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c   |  30 +++
> >   gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c       |  50 +++++
> >   12 files changed, 547 insertions(+), 166 deletions(-)
> >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c
> >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c
> >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c
> >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c
> >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c
> >
> > diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
> > index 2521541..ffccaa7 100644
> > --- a/gcc/config/arm/arm-protos.h
> > +++ b/gcc/config/arm/arm-protos.h
> > @@ -373,7 +373,7 @@ extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
> >   extern bool arm_fusion_enabled_p (tune_params::fuse_ops);
> >   extern bool arm_valid_symbolic_address_p (rtx);
> >   extern bool arm_validize_comparison (rtx *, rtx *, rtx *);
> > -extern bool arm_expand_vector_compare (rtx, rtx_code, rtx, rtx, bool);
> > +extern bool arm_expand_vector_compare (rtx, rtx_code, rtx, rtx, bool, bool);
> >   #endif /* RTX_CODE */
> >
> >   extern bool arm_gen_setmem (rtx *);
> > diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
> > index 0371d98..80e28ef 100644
> > --- a/gcc/config/arm/arm.c
> > +++ b/gcc/config/arm/arm.c
> > @@ -30933,66 +30933,114 @@ arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
> >      and return true if TARGET contains the inverse.  If !CAN_INVERT,
> >      always store the result in TARGET, never its inverse.
> >
> > +   If VCOND_MVE, do not emit the vpsel instruction here, let arm_expand_vcond do
> > +   it with the right destination type to avoid emitting two vpsel, one here and
> > +   one in arm_expand_vcond.
> > +
> >      Note that the handling of floating-point comparisons is not
> >      IEEE compliant.  */
> >
> >   bool
> >   arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
> > -                        bool can_invert)
> > +                        bool can_invert, bool vcond_mve)
> >   {
> >     machine_mode cmp_result_mode = GET_MODE (target);
> >     machine_mode cmp_mode = GET_MODE (op0);
> >
> >     bool inverted;
> > -  switch (code)
> > -    {
> > -    /* For these we need to compute the inverse of the requested
> > -       comparison.  */
> > -    case UNORDERED:
> > -    case UNLT:
> > -    case UNLE:
> > -    case UNGT:
> > -    case UNGE:
> > -    case UNEQ:
> > -    case NE:
> > -      code = reverse_condition_maybe_unordered (code);
> > -      if (!can_invert)
> > -     {
> > -       /* Recursively emit the inverted comparison into a temporary
> > -          and then store its inverse in TARGET.  This avoids reusing
> > -          TARGET (which for integer NE could be one of the inputs).  */
> > -       rtx tmp = gen_reg_rtx (cmp_result_mode);
> > -       if (arm_expand_vector_compare (tmp, code, op0, op1, true))
> > -         gcc_unreachable ();
> > -       emit_insn (gen_rtx_SET (target, gen_rtx_NOT (cmp_result_mode, tmp)));
> > -       return false;
> > -     }
> > -      inverted = true;
> > -      break;
> >
> > -    default:
> > +  /* MVE supports more comparisons than Neon.  */
> > +  if (TARGET_HAVE_MVE)
> >         inverted = false;
> > -      break;
> > -    }
> > +  else
> > +    switch (code)
> > +      {
> > +     /* For these we need to compute the inverse of the requested
> > +        comparison.  */
> > +      case UNORDERED:
> > +      case UNLT:
> > +      case UNLE:
> > +      case UNGT:
> > +      case UNGE:
> > +      case UNEQ:
> > +      case NE:
> > +     code = reverse_condition_maybe_unordered (code);
> > +     if (!can_invert)
> > +       {
> > +         /* Recursively emit the inverted comparison into a temporary
> > +            and then store its inverse in TARGET.  This avoids reusing
> > +            TARGET (which for integer NE could be one of the inputs).  */
> > +         rtx tmp = gen_reg_rtx (cmp_result_mode);
> > +         if (arm_expand_vector_compare (tmp, code, op0, op1, true, vcond_mve))
> > +           gcc_unreachable ();
> > +         emit_insn (gen_rtx_SET (target, gen_rtx_NOT (cmp_result_mode, tmp)));
> > +         return false;
> > +       }
> > +     inverted = true;
> > +     break;
> > +
> > +      default:
> > +     inverted = false;
> > +     break;
> > +      }
> >
> >     switch (code)
> >       {
> > -    /* These are natively supported for zero comparisons, but otherwise
> > -       require the operands to be swapped.  */
> > +    /* These are natively supported by Neon for zero comparisons, but otherwise
> > +       require the operands to be swapped. For MVE, we can only compare
> > +       registers.  */
> >       case LE:
> >       case LT:
> > -      if (op1 != CONST0_RTX (cmp_mode))
> > -     {
> > -       code = swap_condition (code);
> > -       std::swap (op0, op1);
> > -     }
> > +      if (!TARGET_HAVE_MVE)
> > +     if (op1 != CONST0_RTX (cmp_mode))
> > +       {
> > +         code = swap_condition (code);
> > +         std::swap (op0, op1);
> > +       }
> >         /* Fall through.  */
> >
> > -    /* These are natively supported for both register and zero operands.  */
> > +    /* These are natively supported by Neon for both register and zero
> > +       operands. MVE supports registers only.  */
> >       case EQ:
> >       case GE:
> >       case GT:
> > -      emit_insn (gen_neon_vc (code, cmp_mode, target, op0, op1));
> > +    case NE:
> > +      if (TARGET_HAVE_MVE) {
> > +     rtx vpr_p0;
> > +     if (vcond_mve)
> > +       vpr_p0 = target;
> > +     else
> > +       vpr_p0 = gen_reg_rtx (HImode);
> > +
> > +     switch (cmp_mode)
> > +       {
> > +       case E_V16QImode:
> > +       case E_V8HImode:
> > +       case E_V4SImode:
> > +         emit_insn (gen_mve_vcmpq (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1)));
> > +         break;
> > +       case E_V8HFmode:
> > +       case E_V4SFmode:
> > +         if (TARGET_HAVE_MVE_FLOAT)
> > +           emit_insn (gen_mve_vcmpq_f (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1)));
> > +         else
> > +           gcc_unreachable ();
> > +         break;
> > +       default:
> > +         gcc_unreachable ();
> > +       }
> > +
> > +     /* If we are not expanding a vcond, build the result here.  */
> > +     if (!vcond_mve) {
> > +       rtx zero = gen_reg_rtx (cmp_result_mode);
> > +       rtx one = gen_reg_rtx (cmp_result_mode);
> > +       emit_move_insn (zero, CONST0_RTX (cmp_result_mode));
> > +       emit_move_insn (one, CONST1_RTX (cmp_result_mode));
> > +       emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0));
> > +     }
> > +      }
> > +      else
> > +     emit_insn (gen_neon_vc (code, cmp_mode, target, op0, op1));
> >         return inverted;
> >
> >       /* These are natively supported for register operands only.
> > @@ -31000,16 +31048,50 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
> >          or canonicalized by target-independent code.  */
> >       case GEU:
> >       case GTU:
> > -      emit_insn (gen_neon_vc (code, cmp_mode, target,
> > -                           op0, force_reg (cmp_mode, op1)));
> > +      if (TARGET_HAVE_MVE) {
> > +     rtx vpr_p0;
> > +     if (vcond_mve)
> > +       vpr_p0 = target;
> > +     else
> > +       vpr_p0 = gen_reg_rtx (HImode);
> > +
> > +     emit_insn (gen_mve_vcmpq (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1)));
> > +     if (!vcond_mve) {
> > +       rtx zero = gen_reg_rtx (cmp_result_mode);
> > +       rtx one = gen_reg_rtx (cmp_result_mode);
> > +       emit_move_insn (zero, CONST0_RTX (cmp_result_mode));
> > +       emit_move_insn (one, CONST1_RTX (cmp_result_mode));
> > +       emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0));
> > +     }
> > +      }
> > +       else
> > +     emit_insn (gen_neon_vc (code, cmp_mode, target,
> > +                             op0, force_reg (cmp_mode, op1)));
> >         return inverted;
> >
> >       /* These require the operands to be swapped and likewise do not
> >          support comparisons with zero.  */
> >       case LEU:
> >       case LTU:
> > -      emit_insn (gen_neon_vc (swap_condition (code), cmp_mode,
> > -                           target, force_reg (cmp_mode, op1), op0));
> > +      if (TARGET_HAVE_MVE) {
> > +     rtx vpr_p0;
> > +     if (vcond_mve)
> > +       vpr_p0 = target;
> > +     else
> > +       vpr_p0 = gen_reg_rtx (HImode);
> > +
> > +     emit_insn (gen_mve_vcmpq (swap_condition (code), cmp_mode, vpr_p0, force_reg (cmp_mode, op1), op0));
> > +     if (!vcond_mve) {
> > +       rtx zero = gen_reg_rtx (cmp_result_mode);
> > +       rtx one = gen_reg_rtx (cmp_result_mode);
> > +       emit_move_insn (zero, CONST0_RTX (cmp_result_mode));
> > +       emit_move_insn (one, CONST1_RTX (cmp_result_mode));
> > +       emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0));
> > +     }
> > +      }
> > +      else
> > +     emit_insn (gen_neon_vc (swap_condition (code), cmp_mode,
> > +                             target, force_reg (cmp_mode, op1), op0));
> >         return inverted;
> >
> >       /* These need a combination of two comparisons.  */
> > @@ -31021,8 +31103,8 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
> >       rtx gt_res = gen_reg_rtx (cmp_result_mode);
> >       rtx alt_res = gen_reg_rtx (cmp_result_mode);
> >       rtx_code alt_code = (code == LTGT ? LT : LE);
> > -     if (arm_expand_vector_compare (gt_res, GT, op0, op1, true)
> > -         || arm_expand_vector_compare (alt_res, alt_code, op0, op1, true))
> > +     if (arm_expand_vector_compare (gt_res, GT, op0, op1, true, vcond_mve)
> > +         || arm_expand_vector_compare (alt_res, alt_code, op0, op1, true, vcond_mve))
> >         gcc_unreachable ();
> >       emit_insn (gen_rtx_SET (target, gen_rtx_IOR (cmp_result_mode,
> >                                                    gt_res, alt_res)));
> > @@ -31040,13 +31122,50 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
> >   void
> >   arm_expand_vcond (rtx *operands, machine_mode cmp_result_mode)
> >   {
> > -  rtx mask = gen_reg_rtx (cmp_result_mode);
> > +  /* When expanding for MVE, we do not want to emit a (useless) vpsel in
> > +     arm_expand_vector_compare, and another one here.  */
> > +  bool vcond_mve=false;
> > +  rtx mask;
> > +
> > +  if (TARGET_HAVE_MVE)
> > +    {
> > +      vcond_mve=true;
> > +      mask = gen_reg_rtx (HImode);
> > +    }
> > +  else
> > +    mask = gen_reg_rtx (cmp_result_mode);
> > +
> >     bool inverted = arm_expand_vector_compare (mask, GET_CODE (operands[3]),
> > -                                          operands[4], operands[5], true);
> > +                                          operands[4], operands[5], true, vcond_mve);
> >     if (inverted)
> >       std::swap (operands[1], operands[2]);
> > +  if (TARGET_NEON)
> >     emit_insn (gen_neon_vbsl (GET_MODE (operands[0]), operands[0],
> >                           mask, operands[1], operands[2]));
> > +  else
> > +    {
> > +      machine_mode cmp_mode = GET_MODE (operands[4]);
> > +      rtx vpr_p0 = mask;
> > +      rtx zero = gen_reg_rtx (cmp_mode);
> > +      rtx one = gen_reg_rtx (cmp_mode);
> > +      emit_move_insn (zero, CONST0_RTX (cmp_mode));
> > +      emit_move_insn (one, CONST1_RTX (cmp_mode));
> > +      switch (cmp_mode)
> > +     {
> > +     case E_V16QImode:
> > +     case E_V8HImode:
> > +     case E_V4SImode:
> > +       emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, operands[0], one, zero, vpr_p0));
> > +       break;
> > +     case E_V8HFmode:
> > +     case E_V4SFmode:
> > +       if (TARGET_HAVE_MVE_FLOAT)
> > +         emit_insn (gen_mve_vpselq_f (cmp_mode, operands[0], one, zero, vpr_p0));
> > +       break;
> > +     default:
> > +       gcc_unreachable ();
> > +     }
> > +    }
> >   }
> >
> >   #define MAX_VECT_LEN 16
> > diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
> > index 95df8bd..a128465 100644
> > --- a/gcc/config/arm/iterators.md
> > +++ b/gcc/config/arm/iterators.md
> > @@ -1288,12 +1288,11 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U "u") (VREV16Q_S "s")
> >                      (VCREATEQ_U "u") (VCREATEQ_S "s") (VSHRQ_N_S "s")
> >                      (VSHRQ_N_U "u") (VCVTQ_N_FROM_F_S "s") (VSHLQ_U "u")
> >                      (VCVTQ_N_FROM_F_U "u") (VADDLVQ_P_S "s") (VSHLQ_S "s")
> > -                    (VADDLVQ_P_U "u") (VCMPNEQ_S "s")
> > +                    (VADDLVQ_P_U "u")
> >                      (VABDQ_M_S "s") (VABDQ_M_U "u") (VABDQ_S "s")
> >                      (VABDQ_U "u") (VADDQ_N_S "s") (VADDQ_N_U "u")
> >                      (VADDVQ_P_S "s") (VADDVQ_P_U "u") (VBRSRQ_N_S "s")
> > -                    (VBRSRQ_N_U "u") (VCMPEQQ_S "s")
> > -                    (VCMPEQQ_N_S "s") (VCMPNEQ_N_S "s")
> > +                    (VBRSRQ_N_U "u")
> >                      (VHADDQ_N_S "s") (VHADDQ_N_U "u") (VHADDQ_S "s")
> >                      (VHADDQ_U "u") (VHSUBQ_N_S "s")  (VHSUBQ_N_U "u")
> >                      (VHSUBQ_S "s") (VMAXQ_S "s") (VMAXQ_U "u") (VHSUBQ_U "u")
> > @@ -1549,16 +1548,12 @@ (define_int_iterator VCREATEQ [VCREATEQ_U VCREATEQ_S])
> >   (define_int_iterator VSHRQ_N [VSHRQ_N_S VSHRQ_N_U])
> >   (define_int_iterator VCVTQ_N_FROM_F [VCVTQ_N_FROM_F_S VCVTQ_N_FROM_F_U])
> >   (define_int_iterator VADDLVQ_P [VADDLVQ_P_S VADDLVQ_P_U])
> > -(define_int_iterator VCMPNEQ [VCMPNEQ_S])
> >   (define_int_iterator VSHLQ [VSHLQ_S VSHLQ_U])
> >   (define_int_iterator VABDQ [VABDQ_S VABDQ_U])
> >   (define_int_iterator VADDQ_N [VADDQ_N_S VADDQ_N_U])
> >   (define_int_iterator VADDVAQ [VADDVAQ_S VADDVAQ_U])
> >   (define_int_iterator VADDVQ_P [VADDVQ_P_U VADDVQ_P_S])
> >   (define_int_iterator VBRSRQ_N [VBRSRQ_N_U VBRSRQ_N_S])
> > -(define_int_iterator VCMPEQQ [VCMPEQQ_S])
> > -(define_int_iterator VCMPEQQ_N [VCMPEQQ_N_S])
> > -(define_int_iterator VCMPNEQ_N [VCMPNEQ_N_S])
> >   (define_int_iterator VHADDQ [VHADDQ_S VHADDQ_U])
> >   (define_int_iterator VHADDQ_N [VHADDQ_N_U VHADDQ_N_S])
> >   (define_int_iterator VHSUBQ [VHSUBQ_S VHSUBQ_U])
> > diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> > index 7c846a4..97f0a87 100644
> > --- a/gcc/config/arm/mve.md
> > +++ b/gcc/config/arm/mve.md
> > @@ -838,7 +838,7 @@ (define_insn "mve_vaddlvq_p_<supf>v4si"
> >   ;;
> >   ;; [vcmpneq_, vcmpcsq_, vcmpeqq_, vcmpgeq_, vcmpgtq_, vcmphiq_, vcmpleq_, vcmpltq_])
> >   ;;
> > -(define_insn "mve_vcmp<mve_cmp_op>q_<mode>"
> > +(define_insn "@mve_vcmp<mve_cmp_op>q_<mode>"
> >     [
> >      (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> >       (MVE_COMPARISONS:HI (match_operand:MVE_2 1 "s_register_operand" "w")
> > @@ -1928,7 +1928,7 @@ (define_insn "mve_vcaddq<mve_rot><mode>"
> >   ;;
> >   ;; [vcmpeqq_f, vcmpgeq_f, vcmpgtq_f, vcmpleq_f, vcmpltq_f, vcmpneq_f])
> >   ;;
> > -(define_insn "mve_vcmp<mve_cmp_op>q_f<mode>"
> > +(define_insn "@mve_vcmp<mve_cmp_op>q_f<mode>"
> >     [
> >      (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> >       (MVE_FP_COMPARISONS:HI (match_operand:MVE_0 1 "s_register_operand" "w")
> > @@ -1942,7 +1942,7 @@ (define_insn "mve_vcmp<mve_cmp_op>q_f<mode>"
> >   ;;
> >   ;; [vcmpeqq_n_f, vcmpgeq_n_f, vcmpgtq_n_f, vcmpleq_n_f, vcmpltq_n_f, vcmpneq_n_f])
> >   ;;
> > -(define_insn "mve_vcmp<mve_cmp_op>q_n_f<mode>"
> > +(define_insn "@mve_vcmp<mve_cmp_op>q_n_f<mode>"
> >     [
> >      (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> >       (MVE_FP_COMPARISONS:HI (match_operand:MVE_0 1 "s_register_operand" "w")
> > @@ -3307,7 +3307,7 @@ (define_insn "mve_vnegq_m_s<mode>"
> >   ;;
> >   ;; [vpselq_u, vpselq_s])
> >   ;;
> > -(define_insn "mve_vpselq_<supf><mode>"
> > +(define_insn "@mve_vpselq_<supf><mode>"
> >     [
> >      (set (match_operand:MVE_1 0 "s_register_operand" "=w")
> >       (unspec:MVE_1 [(match_operand:MVE_1 1 "s_register_operand" "w")
> > @@ -4402,7 +4402,7 @@ (define_insn "mve_vorrq_m_n_<supf><mode>"
> >   ;;
> >   ;; [vpselq_f])
> >   ;;
> > -(define_insn "mve_vpselq_f<mode>"
> > +(define_insn "@mve_vpselq_f<mode>"
> >     [
> >      (set (match_operand:MVE_0 0 "s_register_operand" "=w")
> >       (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w")
> > diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> > index fec2cc9..6660846 100644
> > --- a/gcc/config/arm/neon.md
> > +++ b/gcc/config/arm/neon.md
> > @@ -1416,93 +1416,6 @@ (define_insn "*us_sub<mode>_neon"
> >     [(set_attr "type" "neon_qsub<q>")]
> >   )
> >
> > -(define_expand "vec_cmp<mode><v_cmp_result>"
> > -  [(set (match_operand:<V_cmp_result> 0 "s_register_operand")
> > -     (match_operator:<V_cmp_result> 1 "comparison_operator"
> > -       [(match_operand:VDQW 2 "s_register_operand")
> > -        (match_operand:VDQW 3 "reg_or_zero_operand")]))]
> > -  "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > -{
> > -  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
> > -                          operands[2], operands[3], false);
> > -  DONE;
> > -})
> > -
> > -(define_expand "vec_cmpu<mode><mode>"
> > -  [(set (match_operand:VDQIW 0 "s_register_operand")
> > -     (match_operator:VDQIW 1 "comparison_operator"
> > -       [(match_operand:VDQIW 2 "s_register_operand")
> > -        (match_operand:VDQIW 3 "reg_or_zero_operand")]))]
> > -  "TARGET_NEON"
> > -{
> > -  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
> > -                          operands[2], operands[3], false);
> > -  DONE;
> > -})
> > -
> > -;; Conditional instructions.  These are comparisons with conditional moves for
> > -;; vectors.  They perform the assignment:
> > -;;
> > -;;     Vop0 = (Vop4 <op3> Vop5) ? Vop1 : Vop2;
> > -;;
> > -;; where op3 is <, <=, ==, !=, >= or >.  Operations are performed
> > -;; element-wise.
> > -
> > -(define_expand "vcond<mode><mode>"
> > -  [(set (match_operand:VDQW 0 "s_register_operand")
> > -     (if_then_else:VDQW
> > -       (match_operator 3 "comparison_operator"
> > -         [(match_operand:VDQW 4 "s_register_operand")
> > -          (match_operand:VDQW 5 "reg_or_zero_operand")])
> > -       (match_operand:VDQW 1 "s_register_operand")
> > -       (match_operand:VDQW 2 "s_register_operand")))]
> > -  "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > -{
> > -  arm_expand_vcond (operands, <V_cmp_result>mode);
> > -  DONE;
> > -})
> > -
> > -(define_expand "vcond<V_cvtto><mode>"
> > -  [(set (match_operand:<V_CVTTO> 0 "s_register_operand")
> > -     (if_then_else:<V_CVTTO>
> > -       (match_operator 3 "comparison_operator"
> > -         [(match_operand:V32 4 "s_register_operand")
> > -          (match_operand:V32 5 "reg_or_zero_operand")])
> > -       (match_operand:<V_CVTTO> 1 "s_register_operand")
> > -       (match_operand:<V_CVTTO> 2 "s_register_operand")))]
> > -  "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > -{
> > -  arm_expand_vcond (operands, <V_cmp_result>mode);
> > -  DONE;
> > -})
> > -
> > -(define_expand "vcondu<mode><v_cmp_result>"
> > -  [(set (match_operand:VDQW 0 "s_register_operand")
> > -     (if_then_else:VDQW
> > -       (match_operator 3 "arm_comparison_operator"
> > -         [(match_operand:<V_cmp_result> 4 "s_register_operand")
> > -          (match_operand:<V_cmp_result> 5 "reg_or_zero_operand")])
> > -       (match_operand:VDQW 1 "s_register_operand")
> > -       (match_operand:VDQW 2 "s_register_operand")))]
> > -  "TARGET_NEON"
> > -{
> > -  arm_expand_vcond (operands, <V_cmp_result>mode);
> > -  DONE;
> > -})
> > -
> > -(define_expand "vcond_mask_<mode><v_cmp_result>"
> > -  [(set (match_operand:VDQW 0 "s_register_operand")
> > -     (if_then_else:VDQW
> > -       (match_operand:<V_cmp_result> 3 "s_register_operand")
> > -       (match_operand:VDQW 1 "s_register_operand")
> > -       (match_operand:VDQW 2 "s_register_operand")))]
> > -  "TARGET_NEON"
> > -{
> > -  emit_insn (gen_neon_vbsl<mode> (operands[0], operands[3], operands[1],
> > -                               operands[2]));
> > -  DONE;
> > -})
> > -
> >   ;; Patterns for builtins.
> >
> >   ; good for plain vadd, vaddq.
> > diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
> > index 07ca53b..0778db1 100644
> > --- a/gcc/config/arm/unspecs.md
> > +++ b/gcc/config/arm/unspecs.md
> > @@ -596,8 +596,6 @@ (define_c_enum "unspec" [
> >     VCVTQ_N_FROM_F_U
> >     VADDLVQ_P_S
> >     VADDLVQ_P_U
> > -  VCMPNEQ_U
> > -  VCMPNEQ_S
> >     VSHLQ_S
> >     VSHLQ_U
> >     VABDQ_S
> > @@ -605,9 +603,6 @@ (define_c_enum "unspec" [
> >     VADDVAQ_S
> >     VADDVQ_P_S
> >     VBRSRQ_N_S
> > -  VCMPEQQ_S
> > -  VCMPEQQ_N_S
> > -  VCMPNEQ_N_S
> >     VHADDQ_S
> >     VHADDQ_N_S
> >     VHSUBQ_S
> > @@ -645,9 +640,6 @@ (define_c_enum "unspec" [
> >     VADDVAQ_U
> >     VADDVQ_P_U
> >     VBRSRQ_N_U
> > -  VCMPEQQ_U
> > -  VCMPEQQ_N_U
> > -  VCMPNEQ_N_U
> >     VHADDQ_U
> >     VHADDQ_N_U
> >     VHSUBQ_U
> > @@ -680,14 +672,6 @@ (define_c_enum "unspec" [
> >     VSHLQ_R_U
> >     VSUBQ_U
> >     VSUBQ_N_U
> > -  VCMPGEQ_N_S
> > -  VCMPGEQ_S
> > -  VCMPGTQ_N_S
> > -  VCMPGTQ_S
> > -  VCMPLEQ_N_S
> > -  VCMPLEQ_S
> > -  VCMPLTQ_N_S
> > -  VCMPLTQ_S
> >     VHCADDQ_ROT270_S
> >     VHCADDQ_ROT90_S
> >     VMAXAQ_S
> > @@ -702,10 +686,6 @@ (define_c_enum "unspec" [
> >     VQRDMULHQ_N_S
> >     VQRDMULHQ_S
> >     VQSHLUQ_N_S
> > -  VCMPCSQ_N_U
> > -  VCMPCSQ_U
> > -  VCMPHIQ_N_U
> > -  VCMPHIQ_U
> >     VABDQ_M_S
> >     VABDQ_M_U
> >     VABDQ_F
> > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> > index 0b2b3b1..034b48b 100644
> > --- a/gcc/config/arm/vec-common.md
> > +++ b/gcc/config/arm/vec-common.md
> > @@ -362,3 +362,110 @@ (define_expand "vlshr<mode>3"
> >         DONE;
> >       }
> >   })
> > +
> > +(define_expand "vec_cmp<mode><v_cmp_result>"
> > +  [(set (match_operand:<V_cmp_result> 0 "s_register_operand")
> > +     (match_operator:<V_cmp_result> 1 "comparison_operator"
> > +       [(match_operand:VDQW 2 "s_register_operand")
> > +        (match_operand:VDQW 3 "reg_or_zero_operand")]))]
> > +  "ARM_HAVE_<MODE>_ARITH
> > +   && !TARGET_REALLY_IWMMXT
> > +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > +{
> > +  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
> > +                          operands[2], operands[3], false, false);
> > +  DONE;
> > +})
> > +
> > +(define_expand "vec_cmpu<mode><mode>"
> > +  [(set (match_operand:VDQIW 0 "s_register_operand")
> > +     (match_operator:VDQIW 1 "comparison_operator"
> > +       [(match_operand:VDQIW 2 "s_register_operand")
> > +        (match_operand:VDQIW 3 "reg_or_zero_operand")]))]
> > +  "ARM_HAVE_<MODE>_ARITH
> > +   && !TARGET_REALLY_IWMMXT"
> > +{
> > +  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
> > +                          operands[2], operands[3], false, false);
> > +  DONE;
> > +})
> > +
> > +;; Conditional instructions.  These are comparisons with conditional moves for
> > +;; vectors.  They perform the assignment:
> > +;;
> > +;;     Vop0 = (Vop4 <op3> Vop5) ? Vop1 : Vop2;
> > +;;
> > +;; where op3 is <, <=, ==, !=, >= or >.  Operations are performed
> > +;; element-wise.
> > +
> > +(define_expand "vcond<mode><mode>"
> > +  [(set (match_operand:VDQW 0 "s_register_operand")
> > +     (if_then_else:VDQW
> > +       (match_operator 3 "comparison_operator"
> > +         [(match_operand:VDQW 4 "s_register_operand")
> > +          (match_operand:VDQW 5 "reg_or_zero_operand")])
> > +       (match_operand:VDQW 1 "s_register_operand")
> > +       (match_operand:VDQW 2 "s_register_operand")))]
> > +  "ARM_HAVE_<MODE>_ARITH
> > +   && !TARGET_REALLY_IWMMXT
> > +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > +{
> > +  arm_expand_vcond (operands, <V_cmp_result>mode);
> > +  DONE;
> > +})
> > +
> > +(define_expand "vcond<V_cvtto><mode>"
> > +  [(set (match_operand:<V_CVTTO> 0 "s_register_operand")
> > +     (if_then_else:<V_CVTTO>
> > +       (match_operator 3 "comparison_operator"
> > +         [(match_operand:V32 4 "s_register_operand")
> > +          (match_operand:V32 5 "reg_or_zero_operand")])
> > +       (match_operand:<V_CVTTO> 1 "s_register_operand")
> > +       (match_operand:<V_CVTTO> 2 "s_register_operand")))]
> > +  "ARM_HAVE_<MODE>_ARITH
> > +   && !TARGET_REALLY_IWMMXT
> > +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > +{
> > +  arm_expand_vcond (operands, <V_cmp_result>mode);
> > +  DONE;
> > +})
> > +
> > +(define_expand "vcondu<mode><v_cmp_result>"
> > +  [(set (match_operand:VDQW 0 "s_register_operand")
> > +     (if_then_else:VDQW
> > +       (match_operator 3 "arm_comparison_operator"
> > +         [(match_operand:<V_cmp_result> 4 "s_register_operand")
> > +          (match_operand:<V_cmp_result> 5 "reg_or_zero_operand")])
> > +       (match_operand:VDQW 1 "s_register_operand")
> > +       (match_operand:VDQW 2 "s_register_operand")))]
> > +  "ARM_HAVE_<MODE>_ARITH
> > +   && !TARGET_REALLY_IWMMXT"
> > +{
> > +  arm_expand_vcond (operands, <V_cmp_result>mode);
> > +  DONE;
> > +})
> > +
> > +(define_expand "vcond_mask_<mode><v_cmp_result>"
> > +  [(set (match_operand:VDQW 0 "s_register_operand")
> > +        (if_then_else:VDQW
> > +          (match_operand:<V_cmp_result> 3 "s_register_operand")
> > +          (match_operand:VDQW 1 "s_register_operand")
> > +          (match_operand:VDQW 2 "s_register_operand")))]
> > +  "ARM_HAVE_<MODE>_ARITH
> > +   && !TARGET_REALLY_IWMMXT"
> > +{
> > +  if (TARGET_NEON)
> > +    {
> > +      emit_insn (gen_neon_vbsl (<MODE>mode, operands[0], operands[3],
> > +                                operands[1], operands[2]));
> > +    }
> > +  else if (TARGET_HAVE_MVE)
> > +    {
> > +      emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0],
> > +                                 operands[1], operands[2], operands[3]));
> > +    }
> > +  else
> > +    gcc_unreachable ();
> > +
> > +  DONE;
> > +})
> > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c
> > new file mode 100644
> > index 0000000..029c931
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c
> > @@ -0,0 +1,80 @@
> > +/* { dg-do assemble } */
> > +/* { dg-require-effective-target arm_v8_1m_mve_ok } */
> > +/* { dg-add-options arm_v8_1m_mve } */
> > +/* { dg-additional-options "-O3" } */
> > +
> > +/* Integer tests.  */
> > +
> > +#define COMPARE_REG(NAME, OP, TYPE) \
> > +  TYPE \
> > +  cmp_##NAME##_##TYPE##_reg (TYPE a, TYPE b) \
> > +  { \
> > +    return a OP b; \
> > +  }
> > +
> > +#define COMPARE_REG_AND_ZERO(NAME, OP, TYPE) \
> > +  COMPARE_REG (NAME, OP, TYPE) \
> > +  \
> > +  TYPE \
> > +  cmp_##NAME##_##TYPE##_zero (TYPE a) \
> > +  { \
> > +    return a OP (TYPE) {}; \
> > +  }
> > +
> > +#define COMPARE_TYPE(TYPE, COMPARE_ORDERED) \
> > +  COMPARE_REG_AND_ZERO (eq, ==, TYPE) \
> > +  COMPARE_REG_AND_ZERO (ne, !=, TYPE) \
> > +  COMPARE_ORDERED (lt, <, TYPE) \
> > +  COMPARE_ORDERED (le, <=, TYPE) \
> > +  COMPARE_ORDERED (gt, >, TYPE) \
> > +  COMPARE_ORDERED (ge, >=, TYPE)
> > +
> > +#define TEST_TYPE(NAME, ELEM, COMPARE_ORDERED, SIZE)  \
> > +  typedef ELEM NAME##SIZE __attribute__((vector_size(SIZE))); \
> > +  COMPARE_TYPE (NAME##SIZE, COMPARE_ORDERED)
> > +
> > +/* 64-bits vectors, not vectorized.  */
> > +TEST_TYPE (vs8, __INT8_TYPE__, COMPARE_REG_AND_ZERO, 8)
> > +TEST_TYPE (vu8, __UINT8_TYPE__, COMPARE_REG, 8)
> > +TEST_TYPE (vs16, __INT16_TYPE__, COMPARE_REG_AND_ZERO, 8)
> > +TEST_TYPE (vu16, __UINT16_TYPE__, COMPARE_REG, 8)
> > +TEST_TYPE (vs32, __INT32_TYPE__, COMPARE_REG_AND_ZERO, 8)
> > +TEST_TYPE (vu32, __UINT32_TYPE__, COMPARE_REG, 8)
> > +
> > +/* 128-bits vectors.  */
> > +TEST_TYPE (vs8, __INT8_TYPE__, COMPARE_REG_AND_ZERO, 16)
> > +TEST_TYPE (vu8, __UINT8_TYPE__, COMPARE_REG, 16)
> > +TEST_TYPE (vs16, __INT16_TYPE__, COMPARE_REG_AND_ZERO, 16)
> > +TEST_TYPE (vu16, __UINT16_TYPE__, COMPARE_REG, 16)
> > +TEST_TYPE (vs32, __INT32_TYPE__, COMPARE_REG_AND_ZERO, 16)
> > +TEST_TYPE (vu32, __UINT32_TYPE__, COMPARE_REG, 16)
> > +
> > +/* { 8 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
> > +/* { dg-final { scan-assembler-times {\tvcmp.i8  eq, q[0-9]+, q[0-9]+\n} 4 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.i8  ne, q[0-9]+, q[0-9]+\n} 4 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.s8  lt, q[0-9]+, q[0-9]+\n} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.s8  le, q[0-9]+, q[0-9]+\n} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.s8  gt, q[0-9]+, q[0-9]+\n} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.s8  ge, q[0-9]+, q[0-9]+\n} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.u8  hi, q[0-9]+, q[0-9]+\n} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.u8  cs, q[0-9]+, q[0-9]+\n} 2 } } */
> > +
> > +/* { 16 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
> > +/* { dg-final { scan-assembler-times {\tvcmp.i16  eq, q[0-9]+, q[0-9]+\n} 4 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.i16  ne, q[0-9]+, q[0-9]+\n} 4 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.s16  lt, q[0-9]+, q[0-9]+\n} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.s16  le, q[0-9]+, q[0-9]+\n} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.s16  gt, q[0-9]+, q[0-9]+\n} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.s16  ge, q[0-9]+, q[0-9]+\n} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.u16  hi, q[0-9]+, q[0-9]+\n} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.u16  cs, q[0-9]+, q[0-9]+\n} 2 } } */
> > +
> > +/* { 32 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
> > +/* { dg-final { scan-assembler-times {\tvcmp.i32  eq, q[0-9]+, q[0-9]+\n} 4 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.i32  ne, q[0-9]+, q[0-9]+\n} 4 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.s32  lt, q[0-9]+, q[0-9]+\n} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.s32  le, q[0-9]+, q[0-9]+\n} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.s32  gt, q[0-9]+, q[0-9]+\n} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.s32  ge, q[0-9]+, q[0-9]+\n} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.u32  hi, q[0-9]+, q[0-9]+\n} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.u32  cs, q[0-9]+, q[0-9]+\n} 2 } } */
> > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c
> > new file mode 100644
> > index 0000000..8515195
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c
> > @@ -0,0 +1,38 @@
> > +/* { dg-do assemble } */
> > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> > +/* { dg-add-options arm_v8_1m_mve_fp } */
> > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
> > +
> > +/* float 32 tests.  */
> > +
> > +#ifndef ELEM_TYPE
> > +#define ELEM_TYPE float
> > +#endif
> > +#ifndef INT_ELEM_TYPE
> > +#define INT_ELEM_TYPE __INT32_TYPE__
> > +#endif
> > +
> > +#define COMPARE(NAME, OP)                    \
> > +  int_vec                                    \
> > +  cmp_##NAME##_reg (vec a, vec b)            \
> > +  {                                          \
> > +    return a OP b;                           \
> > +  }
> > +
> > +typedef INT_ELEM_TYPE int_vec __attribute__((vector_size(16)));
> > +typedef ELEM_TYPE vec __attribute__((vector_size(16)));
> > +
> > +COMPARE (eq, ==)
> > +COMPARE (ne, !=)
> > +COMPARE (lt, <)
> > +COMPARE (le, <=)
> > +COMPARE (gt, >)
> > +COMPARE (ge, >=)
> > +
> > +/* eq, ne, lt, le, gt, ge.  */
> > +/* { dg-final { scan-assembler-times {\tvcmp.f32\teq, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tne, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tle, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tge, q[0-9]+, q[0-9]+\n} 1 } } */
> > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c
> > new file mode 100644
> > index 0000000..7774972
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c
> > @@ -0,0 +1,69 @@
> > +/* { dg-do assemble } */
> > +/* { dg-require-effective-target arm_v8_1m_mve_ok } */
> > +/* { dg-add-options arm_v8_1m_mve } */
> > +/* { dg-additional-options "-O3" } */
> > +
> > +#define COMPARE_REG(NAME, OP, TYPE, SCALAR)    \
> > +  TYPE                                                 \
> > +  cmp_##NAME##_##TYPE##_scalar (TYPE a, SCALAR b) \
> > +  {                                            \
> > +    return a OP b;                             \
> > +  }
> > +
> > +#define COMPARE_TYPE(SCALAR, TYPE)                           \
> > +  COMPARE_REG (eq, ==, TYPE, SCALAR)                         \
> > +  COMPARE_REG (ne, !=, TYPE, SCALAR)                         \
> > +  COMPARE_REG (lt, <, TYPE, SCALAR)                          \
> > +  COMPARE_REG (le, <=, TYPE, SCALAR)                         \
> > +  COMPARE_REG (gt, >, TYPE, SCALAR)                          \
> > +  COMPARE_REG (ge, >=, TYPE, SCALAR)
> > +
> > +#define TEST_TYPE(NAME, ELEM, SIZE)                        \
> > +  typedef ELEM NAME##SIZE __attribute__((vector_size(SIZE))); \
> > +  COMPARE_TYPE (ELEM, NAME##SIZE)
> > +
> > +/* 64-bits vectors, not vectorized.  */
> > +TEST_TYPE (vs8, __INT8_TYPE__, 8)
> > +TEST_TYPE (vu8, __UINT8_TYPE__, 8)
> > +TEST_TYPE (vs16, __INT16_TYPE__, 8)
> > +TEST_TYPE (vu16, __UINT16_TYPE__, 8)
> > +TEST_TYPE (vs32, __INT32_TYPE__, 8)
> > +TEST_TYPE (vu32, __UINT32_TYPE__, 8)
> > +
> > +/* 128-bits vectors.  */
> > +TEST_TYPE (vs8, __INT8_TYPE__, 16)
> > +TEST_TYPE (vu8, __UINT8_TYPE__, 16)
> > +TEST_TYPE (vs16, __INT16_TYPE__, 16)
> > +TEST_TYPE (vu16, __UINT16_TYPE__, 16)
> > +TEST_TYPE (vs32, __INT32_TYPE__, 16)
> > +TEST_TYPE (vu32, __UINT32_TYPE__, 16)
> > +
> > +/* { 8 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
> > +/* { dg-final { scan-assembler-times {\tvcmp.i8  eq, q[0-9]+, q[0-9]+\n} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.i8  ne, q[0-9]+, q[0-9]+\n} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.s8  lt, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.s8  le, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.s8  gt, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.s8  ge, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.u8  hi, q[0-9]+, q[0-9]+\n} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.u8  cs, q[0-9]+, q[0-9]+\n} 2 } } */
> > +
> > +/* { 16 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
> > +/* { dg-final { scan-assembler-times {\tvcmp.i16  eq, q[0-9]+, q[0-9]+\n} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.i16  ne, q[0-9]+, q[0-9]+\n} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.s16  lt, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.s16  le, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.s16  gt, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.s16  ge, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.u16  hi, q[0-9]+, q[0-9]+\n} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.u16  cs, q[0-9]+, q[0-9]+\n} 2 } } */
> > +
> > +/* { 32 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
> > +/* { dg-final { scan-assembler-times {\tvcmp.i32  eq, q[0-9]+, q[0-9]+\n} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.i32  ne, q[0-9]+, q[0-9]+\n} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.s32  lt, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.s32  le, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.s32  gt, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.s32  ge, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.u32  hi, q[0-9]+, q[0-9]+\n} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.u32  cs, q[0-9]+, q[0-9]+\n} 2 } } */
> > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c
> > new file mode 100644
> > index 0000000..4ed449e
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c
> > @@ -0,0 +1,30 @@
> > +/* { dg-do assemble } */
> > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> > +/* { dg-add-options arm_v8_1m_mve_fp } */
> > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
> > +
> > +#include <stdint.h>
> > +
> > +#define NB 4
> > +
> > +#define FUNC(OP, NAME)                                                       \
> > +  void test_ ## NAME ##_f (float * __restrict__ dest, float *a, float *b) { \
> > +    int i;                                                           \
> > +    for (i=0; i<NB; i++) {                                           \
> > +      dest[i] = a[i] OP b[i];                                                \
> > +    }                                                                        \
> > +  }
> > +
> > +FUNC(==, vcmpeq)
> > +FUNC(!=, vcmpne)
> > +FUNC(<, vcmplt)
> > +FUNC(<=, vcmple)
> > +FUNC(>, vcmpgt)
> > +FUNC(>=, vcmpge)
> > +
> > +/* { dg-final { scan-assembler-times {\tvcmp.f32\teq, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tne, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tle, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tge, q[0-9]+, q[0-9]+\n} 1 } } */
> > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c
> > new file mode 100644
> > index 0000000..8da15e7
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c
> > @@ -0,0 +1,50 @@
> > +/* { dg-do assemble } */
> > +/* { dg-require-effective-target arm_v8_1m_mve_ok } */
> > +/* { dg-add-options arm_v8_1m_mve } */
> > +/* { dg-additional-options "-O3" } */
> > +
> > +#include <stdint.h>
> > +
> > +#define FUNC(SIGN, TYPE, BITS, NB, OP, NAME)                         \
> > +  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t * __restrict__ dest, TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \
> > +    int i;                                                           \
> > +    for (i=0; i<NB; i++) {                                           \
> > +      dest[i] = a[i] OP b[i];                                                \
> > +    }                                                                        \
> > +}
> > +
> > +#define ALL_FUNCS(OP, NAME) \
> > +  FUNC(s, int, 32, 2, OP, NAME)                      \
> > +  FUNC(u, uint, 32, 2, OP, NAME)             \
> > +  FUNC(s, int, 16, 4, OP, NAME)                      \
> > +  FUNC(u, uint, 16, 4, OP, NAME)             \
> > +  FUNC(s, int, 8, 8, OP, NAME)                       \
> > +  FUNC(u, uint, 8, 8, OP, NAME)                      \
> > +  FUNC(s, int, 32, 4, OP, NAME)                      \
> > +  FUNC(u, uint, 32, 4, OP, NAME)             \
> > +  FUNC(s, int, 16, 8, OP, NAME)                      \
> > +  FUNC(u, uint, 16, 8, OP, NAME)             \
> > +  FUNC(s, int, 8, 16, OP, NAME)                      \
> > +  FUNC(u, uint, 8, 16, OP, NAME)
> > +
> > +ALL_FUNCS(==, vcmpeq)
> > +ALL_FUNCS(!=, vcmpne)
> > +ALL_FUNCS(<, vcmplt)
> > +ALL_FUNCS(<=, vcmple)
> > +ALL_FUNCS(>, vcmpgt)
> > +ALL_FUNCS(>=, vcmpge)
> > +
> > +/* MVE has only 128-bit vectors, so we can vectorize only half of the
> > +   functions above.  */
> > +/* { dg-final { scan-assembler-times {\tvcmp.i[0-9]+  eq, q[0-9]+, q[0-9]+\n} 6 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.i[0-9]+  ne, q[0-9]+, q[0-9]+\n} 6 } } */
> > +
> > +/* lt, le, gt, ge apply to signed types, cs and hi to unsigned types.  */
> > +/* lt and le with unsigned types are replaced with the opposite condition, hence
> > +   the double number of matches for cs and hi.  */
> > +/* { dg-final { scan-assembler-times {\tvcmp.s[0-9]+  lt, q[0-9]+, q[0-9]+\n} 3 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.s[0-9]+  le, q[0-9]+, q[0-9]+\n} 3 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.s[0-9]+  gt, q[0-9]+, q[0-9]+\n} 3 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.s[0-9]+  ge, q[0-9]+, q[0-9]+\n} 3 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.u[0-9]+  cs, q[0-9]+, q[0-9]+\n} 6 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.u[0-9]+  hi, q[0-9]+, q[0-9]+\n} 6 } } */

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 7/9] arm: Auto-vectorization for MVE: add __fp16 support to VCMP
  2021-05-04 11:48   ` Andre Vieira (lists)
@ 2021-05-04 13:43     ` Christophe Lyon
  2021-05-04 17:03       ` Christophe Lyon
  0 siblings, 1 reply; 35+ messages in thread
From: Christophe Lyon @ 2021-05-04 13:43 UTC (permalink / raw)
  To: Andre Vieira (lists); +Cc: gcc-patches

On Tue, 4 May 2021 at 13:48, Andre Vieira (lists)
<andre.simoesdiasvieira@arm.com> wrote:
>
> It would be good to also add tests for NEON as you also enable auto-vec
> for it. I checked and I do think the necessary 'neon_vc' patterns exist
> for 'VH', so we should be OK there.
>

Actually since I posted the patch series, I've noticed a regression in
armv8_2-fp16-arith-1.c, because we now vectorize all the float16x[48]_t loops,
but we lose the fact that some FP comparisons can throw exceptions.

I'll have to revisit this patch.

Thanks,

Christophe

> On 30/04/2021 15:09, Christophe Lyon via Gcc-patches wrote:
> > This patch adds __fp16 support to the previous patch that added vcmp
> > support with MVE. For this we update existing expanders to use VDQWH
> > iterator, and add a new expander vcond<VH_cvtto><mode>.  In the
> > process we need to create suitable iterators, and update v_cmp_result
> > as needed.
> >
> > 2021-04-26  Christophe Lyon  <christophe.lyon@linaro.org>
> >
> >       gcc/
> >       * config/arm/iterators.md (V16): New iterator.
> >       (VH_cvtto): New attribute.
> >       (v_cmp_result): Added V4HF and V8HF support.
> >       * config/arm/vec-common.md (vec_cmp<mode><v_cmp_result>): Use VDQWH.
> >       (vcond<mode><mode>): Likewise.
> >       (vcond_mask_<mode><v_cmp_result>): Likewise.
> >       (vcond<VH_cvtto><mode>): New expander.
> >
> >       gcc/testsuite/
> >       * gcc.target/arm/simd/mve-compare-3.c: New test with GCC vectors.
> >       * gcc.target/arm/simd/mve-vcmp-f16.c: New test for
> >       auto-vectorization.
> > ---
> >   gcc/config/arm/iterators.md                       |  6 ++++
> >   gcc/config/arm/vec-common.md                      | 40 ++++++++++++++++-------
> >   gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c | 38 +++++++++++++++++++++
> >   gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c  | 30 +++++++++++++++++
> >   4 files changed, 102 insertions(+), 12 deletions(-)
> >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
> >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c
> >
> > diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
> > index a128465..3042baf 100644
> > --- a/gcc/config/arm/iterators.md
> > +++ b/gcc/config/arm/iterators.md
> > @@ -231,6 +231,9 @@ (define_mode_iterator VU [V16QI V8HI V4SI])
> >   ;; Vector modes for 16-bit floating-point support.
> >   (define_mode_iterator VH [V8HF V4HF])
> >
> > +;; Modes with 16-bit elements only.
> > +(define_mode_iterator V16 [V4HI V4HF V8HI V8HF])
> > +
> >   ;; 16-bit floating-point vector modes suitable for moving (includes BFmode).
> >   (define_mode_iterator VHFBF [V8HF V4HF V4BF V8BF])
> >
> > @@ -571,6 +574,8 @@ (define_mode_attr V_cvtto [(V2SI "v2sf") (V2SF "v2si")
> >   ;; (Opposite) mode to convert to/from for vector-half mode conversions.
> >   (define_mode_attr VH_CVTTO [(V4HI "V4HF") (V4HF "V4HI")
> >                           (V8HI "V8HF") (V8HF "V8HI")])
> > +(define_mode_attr VH_cvtto [(V4HI "v4hf") (V4HF "v4hi")
> > +                         (V8HI "v8hf") (V8HF "v8hi")])
> >
> >   ;; Define element mode for each vector mode.
> >   (define_mode_attr V_elem [(V8QI "QI") (V16QI "QI")
> > @@ -720,6 +725,7 @@ (define_mode_attr V_cmp_result [(V8QI "V8QI") (V16QI "V16QI")
> >   (define_mode_attr v_cmp_result [(V8QI "v8qi") (V16QI "v16qi")
> >                               (V4HI "v4hi") (V8HI  "v8hi")
> >                               (V2SI "v2si") (V4SI  "v4si")
> > +                             (V4HF "v4hi") (V8HF  "v8hi")
> >                               (DI   "di")   (V2DI  "v2di")
> >                               (V2SF "v2si") (V4SF  "v4si")])
> >
> > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> > index 034b48b..3fd341c 100644
> > --- a/gcc/config/arm/vec-common.md
> > +++ b/gcc/config/arm/vec-common.md
> > @@ -366,8 +366,8 @@ (define_expand "vlshr<mode>3"
> >   (define_expand "vec_cmp<mode><v_cmp_result>"
> >     [(set (match_operand:<V_cmp_result> 0 "s_register_operand")
> >       (match_operator:<V_cmp_result> 1 "comparison_operator"
> > -       [(match_operand:VDQW 2 "s_register_operand")
> > -        (match_operand:VDQW 3 "reg_or_zero_operand")]))]
> > +       [(match_operand:VDQWH 2 "s_register_operand")
> > +        (match_operand:VDQWH 3 "reg_or_zero_operand")]))]
> >     "ARM_HAVE_<MODE>_ARITH
> >      && !TARGET_REALLY_IWMMXT
> >      && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > @@ -399,13 +399,13 @@ (define_expand "vec_cmpu<mode><mode>"
> >   ;; element-wise.
> >
> >   (define_expand "vcond<mode><mode>"
> > -  [(set (match_operand:VDQW 0 "s_register_operand")
> > -     (if_then_else:VDQW
> > +  [(set (match_operand:VDQWH 0 "s_register_operand")
> > +     (if_then_else:VDQWH
> >         (match_operator 3 "comparison_operator"
> > -         [(match_operand:VDQW 4 "s_register_operand")
> > -          (match_operand:VDQW 5 "reg_or_zero_operand")])
> > -       (match_operand:VDQW 1 "s_register_operand")
> > -       (match_operand:VDQW 2 "s_register_operand")))]
> > +         [(match_operand:VDQWH 4 "s_register_operand")
> > +          (match_operand:VDQWH 5 "reg_or_zero_operand")])
> > +       (match_operand:VDQWH 1 "s_register_operand")
> > +       (match_operand:VDQWH 2 "s_register_operand")))]
> >     "ARM_HAVE_<MODE>_ARITH
> >      && !TARGET_REALLY_IWMMXT
> >      && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > @@ -430,6 +430,22 @@ (define_expand "vcond<V_cvtto><mode>"
> >     DONE;
> >   })
> >
> > +(define_expand "vcond<VH_cvtto><mode>"
> > +  [(set (match_operand:<VH_CVTTO> 0 "s_register_operand")
> > +     (if_then_else:<VH_CVTTO>
> > +       (match_operator 3 "comparison_operator"
> > +         [(match_operand:V16 4 "s_register_operand")
> > +          (match_operand:V16 5 "reg_or_zero_operand")])
> > +       (match_operand:<VH_CVTTO> 1 "s_register_operand")
> > +       (match_operand:<VH_CVTTO> 2 "s_register_operand")))]
> > +  "ARM_HAVE_<MODE>_ARITH
> > +   && !TARGET_REALLY_IWMMXT
> > +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > +{
> > +  arm_expand_vcond (operands, <V_cmp_result>mode);
> > +  DONE;
> > +})
> > +
> >   (define_expand "vcondu<mode><v_cmp_result>"
> >     [(set (match_operand:VDQW 0 "s_register_operand")
> >       (if_then_else:VDQW
> > @@ -446,11 +462,11 @@ (define_expand "vcondu<mode><v_cmp_result>"
> >   })
> >
> >   (define_expand "vcond_mask_<mode><v_cmp_result>"
> > -  [(set (match_operand:VDQW 0 "s_register_operand")
> > -        (if_then_else:VDQW
> > +  [(set (match_operand:VDQWH 0 "s_register_operand")
> > +        (if_then_else:VDQWH
> >             (match_operand:<V_cmp_result> 3 "s_register_operand")
> > -          (match_operand:VDQW 1 "s_register_operand")
> > -          (match_operand:VDQW 2 "s_register_operand")))]
> > +          (match_operand:VDQWH 1 "s_register_operand")
> > +          (match_operand:VDQWH 2 "s_register_operand")))]
> >     "ARM_HAVE_<MODE>_ARITH
> >      && !TARGET_REALLY_IWMMXT"
> >   {
> > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
> > new file mode 100644
> > index 0000000..76f81e8
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
> > @@ -0,0 +1,38 @@
> > +/* { dg-do assemble } */
> > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> > +/* { dg-add-options arm_v8_1m_mve_fp } */
> > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
> > +
> > +/* float 16 tests.  */
> > +
> > +#ifndef ELEM_TYPE
> > +#define ELEM_TYPE __fp16
> > +#endif
> > +#ifndef INT_ELEM_TYPE
> > +#define INT_ELEM_TYPE __INT16_TYPE__
> > +#endif
> > +
> > +#define COMPARE(NAME, OP)                    \
> > +  int_vec                                    \
> > +  cmp_##NAME##_reg (vec a, vec b)            \
> > +  {                                          \
> > +    return a OP b;                           \
> > +  }
> > +
> > +typedef INT_ELEM_TYPE int_vec __attribute__((vector_size(16)));
> > +typedef ELEM_TYPE vec __attribute__((vector_size(16)));
> > +
> > +COMPARE (eq, ==)
> > +COMPARE (ne, !=)
> > +COMPARE (lt, <)
> > +COMPARE (le, <=)
> > +COMPARE (gt, >)
> > +COMPARE (ge, >=)
> > +
> > +/* eq, ne, lt, le, gt, ge.  */
> > +/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-9]+\n} 1 } } */
> > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c
> > new file mode 100644
> > index 0000000..dbae2d1
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c
> > @@ -0,0 +1,30 @@
> > +/* { dg-do assemble } */
> > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> > +/* { dg-add-options arm_v8_1m_mve_fp } */
> > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
> > +
> > +#include <stdint.h>
> > +
> > +#define NB 8
> > +
> > +#define FUNC(OP, NAME)                                                       \
> > +  void test_ ## NAME ##_f (__fp16 * __restrict__ dest, __fp16 *a, __fp16 *b) { \
> > +    int i;                                                           \
> > +    for (i=0; i<NB; i++) {                                           \
> > +      dest[i] = a[i] OP b[i];                                                \
> > +    }                                                                        \
> > +  }
> > +
> > +FUNC(==, vcmpeq)
> > +FUNC(!=, vcmpne)
> > +FUNC(<, vcmplt)
> > +FUNC(<=, vcmple)
> > +FUNC(>, vcmpgt)
> > +FUNC(>=, vcmpge)
> > +
> > +/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
> > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-9]+\n} 1 } } */

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 9/9] arm: Auto-vectorization for MVE: vld4/vst4
  2021-05-04 12:03   ` Andre Vieira (lists)
@ 2021-05-04 14:57     ` Christophe Lyon
  2021-05-17  9:55       ` Christophe Lyon
  0 siblings, 1 reply; 35+ messages in thread
From: Christophe Lyon @ 2021-05-04 14:57 UTC (permalink / raw)
  To: Andre Vieira (lists); +Cc: gcc Patches

On Tue, 4 May 2021 at 14:03, Andre Vieira (lists)
<andre.simoesdiasvieira@arm.com> wrote:
>
> Hi Christophe,
>
> The series LGTM but you'll need the approval of an arm port maintainer
> before committing. I only did code-review, did not try to build/run tests.
>

Hi Andre,

Thanks for the comments!

> Kind regards,
> Andre
>
> On 30/04/2021 15:09, Christophe Lyon via Gcc-patches wrote:
> > This patch enables MVE vld4/vst4 instructions for auto-vectorization.
> > We move the existing expanders from neon.md and enable them for MVE,
> > calling the respective emitter.
> >
> > 2021-03-12  Christophe Lyon  <christophe.lyon@linaro.org>
> >
> >       gcc/
> >       * config/arm/neon.md (vec_load_lanesxi<mode>)
> >       (vec_store_lanesxi<mode>): Move ...
> >       * config/arm/vec-common.md: here.
> >
> >       gcc/testsuite/
> >       * gcc.target/arm/simd/mve-vld4.c: New test, derived from
> >       slp-perm-3.c
> > ---
> >   gcc/config/arm/neon.md                       |  20 ----
> >   gcc/config/arm/vec-common.md                 |  26 +++++
> >   gcc/testsuite/gcc.target/arm/simd/mve-vld4.c | 140 +++++++++++++++++++++++++++
> >   3 files changed, 166 insertions(+), 20 deletions(-)
> >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vld4.c
> >
> > diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> > index bc8775c..fb58baf 100644
> > --- a/gcc/config/arm/neon.md
> > +++ b/gcc/config/arm/neon.md
> > @@ -5617,16 +5617,6 @@ (define_insn "neon_vld4<mode>"
> >                       (const_string "neon_load4_4reg<q>")))]
> >   )
> >
> > -(define_expand "vec_load_lanesxi<mode>"
> > -  [(match_operand:XI 0 "s_register_operand")
> > -   (match_operand:XI 1 "neon_struct_operand")
> > -   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> > -  "TARGET_NEON"
> > -{
> > -  emit_insn (gen_neon_vld4<mode> (operands[0], operands[1]));
> > -  DONE;
> > -})
> > -
> >   (define_expand "neon_vld4<mode>"
> >     [(match_operand:XI 0 "s_register_operand")
> >      (match_operand:XI 1 "neon_struct_operand")
> > @@ -5818,16 +5808,6 @@ (define_insn "neon_vst4<mode>"
> >                       (const_string "neon_store4_4reg<q>")))]
> >   )
> >
> > -(define_expand "vec_store_lanesxi<mode>"
> > -  [(match_operand:XI 0 "neon_struct_operand")
> > -   (match_operand:XI 1 "s_register_operand")
> > -   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> > -  "TARGET_NEON"
> > -{
> > -  emit_insn (gen_neon_vst4<mode> (operands[0], operands[1]));
> > -  DONE;
> > -})
> > -
> >   (define_expand "neon_vst4<mode>"
> >     [(match_operand:XI 0 "neon_struct_operand")
> >      (match_operand:XI 1 "s_register_operand")
> > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> > index 7abefea..d46b78d 100644
> > --- a/gcc/config/arm/vec-common.md
> > +++ b/gcc/config/arm/vec-common.md
> > @@ -512,3 +512,29 @@ (define_expand "vec_store_lanesoi<mode>"
> >       emit_insn (gen_mve_vst2q<mode> (operands[0], operands[1]));
> >     DONE;
> >   })
> > +
> > +(define_expand "vec_load_lanesxi<mode>"
> > +  [(match_operand:XI 0 "s_register_operand")
> > +   (match_operand:XI 1 "neon_struct_operand")
> > +   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> > +  "TARGET_NEON || TARGET_HAVE_MVE"
> > +{
> > +  if (TARGET_NEON)
> > +    emit_insn (gen_neon_vld4<mode> (operands[0], operands[1]));
> > +  else
> > +    emit_insn (gen_mve_vld4q<mode> (operands[0], operands[1]));
> > +  DONE;
> > +})
> > +
> > +(define_expand "vec_store_lanesxi<mode>"
> > +  [(match_operand:XI 0 "neon_struct_operand")
> > +   (match_operand:XI 1 "s_register_operand")
> > +   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> > +  "TARGET_NEON || TARGET_HAVE_MVE"
> > +{
> > +  if (TARGET_NEON)
> > +    emit_insn (gen_neon_vst4<mode> (operands[0], operands[1]));
> > +  else
> > +    emit_insn (gen_mve_vst4q<mode> (operands[0], operands[1]));
> > +  DONE;
> > +})
> > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vld4.c b/gcc/testsuite/gcc.target/arm/simd/mve-vld4.c
> > new file mode 100644
> > index 0000000..ce3e755
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vld4.c
> > @@ -0,0 +1,140 @@
> > +/* { dg-do assemble } */
> > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> > +/* { dg-add-options arm_v8_1m_mve_fp } */
> > +/* { dg-additional-options "-O3" } */
> > +
> > +#include <stdint.h>
> > +
> > +#define M00 100
> > +#define M10 216
> > +#define M20 23
> > +#define M30 237
> > +#define M01 1322
> > +#define M11 13
> > +#define M21 27271
> > +#define M31 2280
> > +#define M02 74
> > +#define M12 191
> > +#define M22 500
> > +#define M32 111
> > +#define M03 134
> > +#define M13 117
> > +#define M23 11
> > +#define M33 771
> > +
> > +#define N 128
> > +
> > +/* Integer tests.  */
> > +#define FUNC(SIGN, TYPE, BITS)                                               \
> > +  void foo_##SIGN##BITS##x (TYPE##BITS##_t *__restrict__ pInput,     \
> > +                         TYPE##BITS##_t *__restrict__ pOutput)       \
> > +  {                                                                  \
> > +    unsigned int i;                                                  \
> > +    TYPE##BITS##_t  a, b, c, d;                                              \
> > +                                                                     \
> > +    for (i = 0; i < N / BITS; i++)                                   \
> > +      {                                                                      \
> > +     a = *pInput++;                                                  \
> > +     b = *pInput++;                                                  \
> > +     c = *pInput++;                                                  \
> > +     d = *pInput++;                                                  \
> > +                                                                     \
> > +     *pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d;             \
> > +     *pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d;             \
> > +     *pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d;             \
> > +     *pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d;             \
> > +      }                                                                      \
> > +  }
> > +
> > +FUNC(s, int, 8)
> > +FUNC(u, uint, 8)
> > +FUNC(s, int, 16)
> > +FUNC(u, uint, 16)
> > +FUNC(s, int, 32)
> > +FUNC(u, uint, 32)
> > +
> > +/* float test, keep the macro because it's similar to the above, but does not
> > +   need the ##BITS##_t.  */
> > +#define FUNC_FLOAT(SIGN, TYPE, BITS)                                         \
> > +  void foo_##SIGN##BITS##x (TYPE *__restrict__ pInput,                       \
> > +                         TYPE *__restrict__ pOutput)                 \
> > +  {                                                                  \
> > +    unsigned int i;                                                  \
> > +    TYPE a, b, c, d;                                                 \
> > +                                                                     \
> > +    for (i = 0; i < N / BITS; i++)                                   \
> > +      {                                                                      \
> > +     a = *pInput++;                                                  \
> > +     b = *pInput++;                                                  \
> > +     c = *pInput++;                                                  \
> > +     d = *pInput++;                                                  \
> > +                                                                     \
> > +     *pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d;             \
> > +     *pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d;             \
> > +     *pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d;             \
> > +     *pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d;             \
> > +      }                                                                      \
> > +  }
> > +
> > +FUNC_FLOAT(f, float, 32)
> > +
> > +/* __fp16 test, needs explicit casts to avoid conversions to floating-point and
> > +   failure to vectorize.  */
> > +__fp16 M00_fp16 = 100.0f16;
> > +__fp16 M10_fp16 = 216.0f16;
> > +__fp16 M20_fp16 = 23.0f16;
> > +__fp16 M30_fp16 = 237.0f16;
> > +__fp16 M01_fp16 = 1322.0f16;
> > +__fp16 M11_fp16 = 13.0f16;
> > +__fp16 M21_fp16 = 27271.0f16;
> > +__fp16 M31_fp16 = 2280.0f16;
> > +__fp16 M02_fp16 = 74.0f16;
> > +__fp16 M12_fp16 = 191.0f16;
> > +__fp16 M22_fp16 = 500.0f16;
> > +__fp16 M32_fp16 = 111.0f16;
> > +__fp16 M03_fp16 = 134.0f16;
> > +__fp16 M13_fp16 = 117.0f16;
> > +__fp16 M23_fp16 = 11.0f16;
> > +__fp16 M33_fp16 = 771.0f16;
> > +
> > +#define FUNC_FLOAT_FP16(SIGN, TYPE, BITS)                            \
> > +  void foo_##SIGN##BITS##x (TYPE *__restrict__ pInput,                       \
> > +                         TYPE *__restrict__ pOutput)                 \
> > +  {                                                                  \
> > +    unsigned int i;                                                  \
> > +    TYPE a, b, c, d;                                                 \
> > +                                                                     \
> > +    for (i = 0; i < N / BITS; i++)                                   \
> > +      {                                                                      \
> > +     a = *pInput++;                                                  \
> > +     b = *pInput++;                                                  \
> > +     c = *pInput++;                                                  \
> > +     d = *pInput++;                                                  \
> > +                                                                     \
> > +     TYPE ab, cd;                                                    \
> > +     ab = (__fp16)(M00_fp16 * a) + (__fp16)(M01_fp16 * b);           \
> > +     cd = (__fp16)(M02_fp16 * c) + (__fp16)(M03_fp16 * d);           \
> > +     *pOutput++ = ab + cd;                                           \
> > +     ab = (__fp16)(M10_fp16 * a) + (__fp16)(M11_fp16 * b);           \
> > +     cd = (__fp16)(M12_fp16 * c) + (__fp16)(M13_fp16 * d);           \
> > +     *pOutput++ = ab + cd;                                           \
> > +     ab = (__fp16)(M20_fp16 * a) + (__fp16)(M21_fp16 * b);           \
> > +     cd = (__fp16)(M22_fp16 * c) + (__fp16)(M23_fp16 * d);           \
> > +     *pOutput++ = ab + cd;                                           \
> > +     ab = (__fp16)(M30_fp16 * a) + (__fp16)(M31_fp16 * b);           \
> > +     cd = (__fp16)(M32_fp16 * c) + (__fp16)(M33_fp16 * d);           \
> > +     *pOutput++ = ab + cd;                                           \
> > +      }                                                                      \
> > +  }
> > +
> > +FUNC_FLOAT_FP16(f, __fp16, 16)
> > +
> > +/* vld4X.8 is used for signed and unsigned chars: 2 * 4.  */
> > +/* vld4X.16 is used for signed and unsigned shorts and __fp16: 3 * 4.  */
> > +/* vld4X.32 is used for signed and unsigned ints and float: 3 * 4.  */
> > +/* { dg-final { scan-assembler-times {vld4[0123].8\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 8 } } */
> > +/* { dg-final { scan-assembler-times {vld4[0123].16\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 12 } } */
> > +/* { dg-final { scan-assembler-times {vld4[0123].32\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 12 } } */
> > +/* { dg-final { scan-assembler-times {vst4[0123].8\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 8 } } */
> > +/* { dg-final { scan-assembler-times {vst4[0123].16\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 12 } } */
> > +/* { dg-final { scan-assembler-times {vst4[0123].32\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 12 } } */

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 7/9] arm: Auto-vectorization for MVE: add __fp16 support to VCMP
  2021-05-04 13:43     ` Christophe Lyon
@ 2021-05-04 17:03       ` Christophe Lyon
  2021-05-05 14:09         ` Christophe Lyon
  0 siblings, 1 reply; 35+ messages in thread
From: Christophe Lyon @ 2021-05-04 17:03 UTC (permalink / raw)
  To: Andre Vieira (lists); +Cc: gcc-patches

On Tue, 4 May 2021 at 15:43, Christophe Lyon <christophe.lyon@linaro.org> wrote:
>
> On Tue, 4 May 2021 at 13:48, Andre Vieira (lists)
> <andre.simoesdiasvieira@arm.com> wrote:
> >
> > It would be good to also add tests for NEON as you also enable auto-vec
> > for it. I checked and I do think the necessary 'neon_vc' patterns exist
> > for 'VH', so we should be OK there.
> >
>
> Actually since I posted the patch series, I've noticed a regression in
> armv8_2-fp16-arith-1.c, because we now vectorize all the float16x[48]_t loops,
> but we lose the fact that some FP comparisons can throw exceptions.
>
> I'll have to revisit this patch.

Actually it looks like my patch does the right thing: we now vectorize
appropriately, given that the testcase is compiled with -ffast-math.
I need to update the testcase, though.

>
> Thanks,
>
> Christophe
>
> > On 30/04/2021 15:09, Christophe Lyon via Gcc-patches wrote:
> > > This patch adds __fp16 support to the previous patch that added vcmp
> > > support with MVE. For this we update existing expanders to use VDQWH
> > > iterator, and add a new expander vcond<VH_cvtto><mode>.  In the
> > > process we need to create suitable iterators, and update v_cmp_result
> > > as needed.
> > >
> > > 2021-04-26  Christophe Lyon  <christophe.lyon@linaro.org>
> > >
> > >       gcc/
> > >       * config/arm/iterators.md (V16): New iterator.
> > >       (VH_cvtto): New iterator.
> > >       (v_cmp_result): Added V4HF and V8HF support.
> > >       * config/arm/vec-common.md (vec_cmp<mode><v_cmp_result>): Use VDQWH.
> > >       (vcond<mode><mode>): Likewise.
> > >       (vcond_mask_<mode><v_cmp_result>): Likewise.
> > >       (vcond<VH_cvtto><mode>): New expander.
> > >
> > >       gcc/testsuite/
> > >       * gcc.target/arm/simd/mve-compare-3.c: New test with GCC vectors.
> > >       * gcc.target/arm/simd/mve-vcmp-f16.c: New test for
> > >       auto-vectorization.
> > > ---
> > >   gcc/config/arm/iterators.md                       |  6 ++++
> > >   gcc/config/arm/vec-common.md                      | 40 ++++++++++++++++-------
> > >   gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c | 38 +++++++++++++++++++++
> > >   gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c  | 30 +++++++++++++++++
> > >   4 files changed, 102 insertions(+), 12 deletions(-)
> > >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
> > >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c
> > >
> > > diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
> > > index a128465..3042baf 100644
> > > --- a/gcc/config/arm/iterators.md
> > > +++ b/gcc/config/arm/iterators.md
> > > @@ -231,6 +231,9 @@ (define_mode_iterator VU [V16QI V8HI V4SI])
> > >   ;; Vector modes for 16-bit floating-point support.
> > >   (define_mode_iterator VH [V8HF V4HF])
> > >
> > > +;; Modes with 16-bit elements only.
> > > +(define_mode_iterator V16 [V4HI V4HF V8HI V8HF])
> > > +
> > >   ;; 16-bit floating-point vector modes suitable for moving (includes BFmode).
> > >   (define_mode_iterator VHFBF [V8HF V4HF V4BF V8BF])
> > >
> > > @@ -571,6 +574,8 @@ (define_mode_attr V_cvtto [(V2SI "v2sf") (V2SF "v2si")
> > >   ;; (Opposite) mode to convert to/from for vector-half mode conversions.
> > >   (define_mode_attr VH_CVTTO [(V4HI "V4HF") (V4HF "V4HI")
> > >                           (V8HI "V8HF") (V8HF "V8HI")])
> > > +(define_mode_attr VH_cvtto [(V4HI "v4hf") (V4HF "v4hi")
> > > +                         (V8HI "v8hf") (V8HF "v8hi")])
> > >
> > >   ;; Define element mode for each vector mode.
> > >   (define_mode_attr V_elem [(V8QI "QI") (V16QI "QI")
> > > @@ -720,6 +725,7 @@ (define_mode_attr V_cmp_result [(V8QI "V8QI") (V16QI "V16QI")
> > >   (define_mode_attr v_cmp_result [(V8QI "v8qi") (V16QI "v16qi")
> > >                               (V4HI "v4hi") (V8HI  "v8hi")
> > >                               (V2SI "v2si") (V4SI  "v4si")
> > > +                             (V4HF "v4hi") (V8HF  "v8hi")
> > >                               (DI   "di")   (V2DI  "v2di")
> > >                               (V2SF "v2si") (V4SF  "v4si")])
> > >
> > > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> > > index 034b48b..3fd341c 100644
> > > --- a/gcc/config/arm/vec-common.md
> > > +++ b/gcc/config/arm/vec-common.md
> > > @@ -366,8 +366,8 @@ (define_expand "vlshr<mode>3"
> > >   (define_expand "vec_cmp<mode><v_cmp_result>"
> > >     [(set (match_operand:<V_cmp_result> 0 "s_register_operand")
> > >       (match_operator:<V_cmp_result> 1 "comparison_operator"
> > > -       [(match_operand:VDQW 2 "s_register_operand")
> > > -        (match_operand:VDQW 3 "reg_or_zero_operand")]))]
> > > +       [(match_operand:VDQWH 2 "s_register_operand")
> > > +        (match_operand:VDQWH 3 "reg_or_zero_operand")]))]
> > >     "ARM_HAVE_<MODE>_ARITH
> > >      && !TARGET_REALLY_IWMMXT
> > >      && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > > @@ -399,13 +399,13 @@ (define_expand "vec_cmpu<mode><mode>"
> > >   ;; element-wise.
> > >
> > >   (define_expand "vcond<mode><mode>"
> > > -  [(set (match_operand:VDQW 0 "s_register_operand")
> > > -     (if_then_else:VDQW
> > > +  [(set (match_operand:VDQWH 0 "s_register_operand")
> > > +     (if_then_else:VDQWH
> > >         (match_operator 3 "comparison_operator"
> > > -         [(match_operand:VDQW 4 "s_register_operand")
> > > -          (match_operand:VDQW 5 "reg_or_zero_operand")])
> > > -       (match_operand:VDQW 1 "s_register_operand")
> > > -       (match_operand:VDQW 2 "s_register_operand")))]
> > > +         [(match_operand:VDQWH 4 "s_register_operand")
> > > +          (match_operand:VDQWH 5 "reg_or_zero_operand")])
> > > +       (match_operand:VDQWH 1 "s_register_operand")
> > > +       (match_operand:VDQWH 2 "s_register_operand")))]
> > >     "ARM_HAVE_<MODE>_ARITH
> > >      && !TARGET_REALLY_IWMMXT
> > >      && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > > @@ -430,6 +430,22 @@ (define_expand "vcond<V_cvtto><mode>"
> > >     DONE;
> > >   })
> > >
> > > +(define_expand "vcond<VH_cvtto><mode>"
> > > +  [(set (match_operand:<VH_CVTTO> 0 "s_register_operand")
> > > +     (if_then_else:<VH_CVTTO>
> > > +       (match_operator 3 "comparison_operator"
> > > +         [(match_operand:V16 4 "s_register_operand")
> > > +          (match_operand:V16 5 "reg_or_zero_operand")])
> > > +       (match_operand:<VH_CVTTO> 1 "s_register_operand")
> > > +       (match_operand:<VH_CVTTO> 2 "s_register_operand")))]
> > > +  "ARM_HAVE_<MODE>_ARITH
> > > +   && !TARGET_REALLY_IWMMXT
> > > +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > > +{
> > > +  arm_expand_vcond (operands, <V_cmp_result>mode);
> > > +  DONE;
> > > +})
> > > +
> > >   (define_expand "vcondu<mode><v_cmp_result>"
> > >     [(set (match_operand:VDQW 0 "s_register_operand")
> > >       (if_then_else:VDQW
> > > @@ -446,11 +462,11 @@ (define_expand "vcondu<mode><v_cmp_result>"
> > >   })
> > >
> > >   (define_expand "vcond_mask_<mode><v_cmp_result>"
> > > -  [(set (match_operand:VDQW 0 "s_register_operand")
> > > -        (if_then_else:VDQW
> > > +  [(set (match_operand:VDQWH 0 "s_register_operand")
> > > +        (if_then_else:VDQWH
> > >             (match_operand:<V_cmp_result> 3 "s_register_operand")
> > > -          (match_operand:VDQW 1 "s_register_operand")
> > > -          (match_operand:VDQW 2 "s_register_operand")))]
> > > +          (match_operand:VDQWH 1 "s_register_operand")
> > > +          (match_operand:VDQWH 2 "s_register_operand")))]
> > >     "ARM_HAVE_<MODE>_ARITH
> > >      && !TARGET_REALLY_IWMMXT"
> > >   {
> > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
> > > new file mode 100644
> > > index 0000000..76f81e8
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
> > > @@ -0,0 +1,38 @@
> > > +/* { dg-do assemble } */
> > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> > > +/* { dg-add-options arm_v8_1m_mve_fp } */
> > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
> > > +
> > > +/* float 16 tests.  */
> > > +
> > > +#ifndef ELEM_TYPE
> > > +#define ELEM_TYPE __fp16
> > > +#endif
> > > +#ifndef INT_ELEM_TYPE
> > > +#define INT_ELEM_TYPE __INT16_TYPE__
> > > +#endif
> > > +
> > > +#define COMPARE(NAME, OP)                    \
> > > +  int_vec                                    \
> > > +  cmp_##NAME##_reg (vec a, vec b)            \
> > > +  {                                          \
> > > +    return a OP b;                           \
> > > +  }
> > > +
> > > +typedef INT_ELEM_TYPE int_vec __attribute__((vector_size(16)));
> > > +typedef ELEM_TYPE vec __attribute__((vector_size(16)));
> > > +
> > > +COMPARE (eq, ==)
> > > +COMPARE (ne, !=)
> > > +COMPARE (lt, <)
> > > +COMPARE (le, <=)
> > > +COMPARE (gt, >)
> > > +COMPARE (ge, >=)
> > > +
> > > +/* eq, ne, lt, le, gt, ge.  */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-9]+\n} 1 } } */
> > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c
> > > new file mode 100644
> > > index 0000000..dbae2d1
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c
> > > @@ -0,0 +1,30 @@
> > > +/* { dg-do assemble } */
> > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> > > +/* { dg-add-options arm_v8_1m_mve_fp } */
> > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
> > > +
> > > +#include <stdint.h>
> > > +
> > > +#define NB 8
> > > +
> > > +#define FUNC(OP, NAME)                                                       \
> > > +  void test_ ## NAME ##_f (__fp16 * __restrict__ dest, __fp16 *a, __fp16 *b) { \
> > > +    int i;                                                           \
> > > +    for (i=0; i<NB; i++) {                                           \
> > > +      dest[i] = a[i] OP b[i];                                                \
> > > +    }                                                                        \
> > > +  }
> > > +
> > > +FUNC(==, vcmpeq)
> > > +FUNC(!=, vcmpne)
> > > +FUNC(<, vcmplt)
> > > +FUNC(<=, vcmple)
> > > +FUNC(>, vcmpgt)
> > > +FUNC(>=, vcmpge)
> > > +
> > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-9]+\n} 1 } } */

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 6/9] arm: Auto-vectorization for MVE: vcmp
  2021-05-04 13:41     ` Christophe Lyon
@ 2021-05-05 14:08       ` Christophe Lyon
  2021-05-17  9:54         ` Christophe Lyon
  2021-05-17 10:35         ` Kyrylo Tkachov
  0 siblings, 2 replies; 35+ messages in thread
From: Christophe Lyon @ 2021-05-05 14:08 UTC (permalink / raw)
  To: Andre Vieira (lists); +Cc: gcc Patches

[-- Attachment #1: Type: text/plain, Size: 48551 bytes --]

On Tue, 4 May 2021 at 15:41, Christophe Lyon <christophe.lyon@linaro.org> wrote:
>
> On Tue, 4 May 2021 at 13:29, Andre Vieira (lists)
> <andre.simoesdiasvieira@arm.com> wrote:
> >
> > Hi Christophe,
> >
> > On 30/04/2021 15:09, Christophe Lyon via Gcc-patches wrote:
> > > Since MVE has a different set of vector comparison operators from
> > > Neon, we have to update the expansion to take into account the new
> > > ones, for instance 'NE' for which MVE does not require to use 'EQ'
> > > with the inverted condition.
> > >
> > > Conversely, Neon supports comparisons with #0, MVE does not.
> > >
> > > For:
> > > typedef long int vs32 __attribute__((vector_size(16)));
> > > vs32 cmp_eq_vs32_reg (vs32 a, vs32 b) { return a == b; }
> > >
> > > we now generate:
> > > cmp_eq_vs32_reg:
> > >       vldr.64 d4, .L123       @ 8     [c=8 l=4]  *mve_movv4si/8
> > >       vldr.64 d5, .L123+8
> > >       vldr.64 d6, .L123+16    @ 9     [c=8 l=4]  *mve_movv4si/8
> > >       vldr.64 d7, .L123+24
> > >       vcmp.i32  eq, q0, q1    @ 7     [c=16 l=4]  mve_vcmpeqq_v4si
> > >       vpsel q0, q3, q2        @ 15    [c=8 l=4]  mve_vpselq_sv4si
> > >       bx      lr      @ 26    [c=8 l=4]  *thumb2_return
> > > .L124:
> > >       .align  3
> > > .L123:
> > >       .word   0
> > >       .word   0
> > >       .word   0
> > >       .word   0
> > >       .word   1
> > >       .word   1
> > >       .word   1
> > >       .word   1
> > >
> > > For some reason emit_move_insn (zero, CONST0_RTX (cmp_mode)) produces
> > > a pair of vldr instead of vmov.i32 qX, #0
> > I think ideally we would even want:
> > vpte  eq, q0, q1
> > vmovt.i32 q0, #0
> > vmove.i32 q0, #1
> >
> > But we don't have a way to generate VPT blocks with multiple
> > instructions yet unfortunately so I guess VPSEL will have to do for now.
>
> TBH,  I looked at what LLVM generates currently ;-)
>

Here is an updated version, which adds
&& (!<Is_float_mode> || flag_unsafe_math_optimizations)
to vcond_mask_

This condition was not present in the neon.md version I moved to vec-common.md,
but since the VDQW iterator includes V2SF and V4SF, it should take
floating-point flags into account.

Christophe

> >
> > >
> > > 2021-03-01  Christophe Lyon  <christophe.lyon@linaro.org>
> > >
> > >       gcc/
> > >       * config/arm/arm-protos.h (arm_expand_vector_compare): Update
> > >       prototype.
> > >       * config/arm/arm.c (arm_expand_vector_compare): Add support for
> > >       MVE.
> > >       (arm_expand_vcond): Likewise.
> > >       * config/arm/iterators.md (supf): Remove VCMPNEQ_S, VCMPEQQ_S,
> > >       VCMPEQQ_N_S, VCMPNEQ_N_S.
> > >       (VCMPNEQ, VCMPEQQ, VCMPEQQ_N, VCMPNEQ_N): Remove.
> > >       * config/arm/mve.md (@mve_vcmp<mve_cmp_op>q_<mode>): Add '@' prefix.
> > >       (@mve_vcmp<mve_cmp_op>q_f<mode>): Likewise.
> > >       (@mve_vcmp<mve_cmp_op>q_n_f<mode>): Likewise.
> > >       (@mve_vpselq_<supf><mode>): Likewise.
> > >       (@mve_vpselq_f<mode>): Likewise.
> > >       * config/arm/neon.md (vec_cmp<mode><v_cmp_result>): Enable for MVE
> > >       and move to vec-common.md.
> > >       (vec_cmpu<mode><mode>): Likewise.
> > >       (vcond<mode><mode>): Likewise.
> > >       (vcond<V_cvtto><mode>): Likewise.
> > >       (vcondu<mode><v_cmp_result>): Likewise.
> > >       (vcond_mask_<mode><v_cmp_result>): Likewise.
> > >       * config/arm/unspecs.md (VCMPNEQ_U, VCMPNEQ_S, VCMPEQQ_S)
> > >       (VCMPEQQ_N_S, VCMPNEQ_N_S, VCMPEQQ_U, VCMPEQQ_N_U, VCMPNEQ_N_U)
> > >       (VCMPGEQ_N_S, VCMPGEQ_S, VCMPGTQ_N_S, VCMPGTQ_S, VCMPLEQ_N_S)
> > >       (VCMPLEQ_S, VCMPLTQ_N_S, VCMPLTQ_S, VCMPCSQ_N_U, VCMPCSQ_U)
> > >       (VCMPHIQ_N_U, VCMPHIQ_U): Remove.
> > >       * config/arm/vec-common.md (vec_cmp<mode><v_cmp_result>): Moved
> > >       from neon.md.
> > >       (vec_cmpu<mode><mode>): Likewise.
> > >       (vcond<mode><mode>): Likewise.
> > >       (vcond<V_cvtto><mode>): Likewise.
> > >       (vcondu<mode><v_cmp_result>): Likewise.
> > >       (vcond_mask_<mode><v_cmp_result>): Likewise.
> > >
> > >       gcc/testsuite
> > >       * gcc.target/arm/simd/mve-compare-1.c: New test with GCC vectors.
> > >       * gcc.target/arm/simd/mve-compare-2.c: New test with GCC vectors.
> > >       * gcc.target/arm/simd/mve-compare-scalar-1.c: New test with GCC
> > >       vectors.
> > >       * gcc.target/arm/simd/mve-vcmp-f32.c: New test for
> > >       auto-vectorization.
> > >       * gcc.target/arm/simd/mve-vcmp.c: New test for auto-vectorization.
> > >
> > > add gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c
> > > ---
> > >   gcc/config/arm/arm-protos.h                        |   2 +-
> > >   gcc/config/arm/arm.c                               | 211 ++++++++++++++++-----
> > >   gcc/config/arm/iterators.md                        |   9 +-
> > >   gcc/config/arm/mve.md                              |  10 +-
> > >   gcc/config/arm/neon.md                             |  87 ---------
> > >   gcc/config/arm/unspecs.md                          |  20 --
> > >   gcc/config/arm/vec-common.md                       | 107 +++++++++++
> > >   gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c  |  80 ++++++++
> > >   gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c  |  38 ++++
> > >   .../gcc.target/arm/simd/mve-compare-scalar-1.c     |  69 +++++++
> > >   gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c   |  30 +++
> > >   gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c       |  50 +++++
> > >   12 files changed, 547 insertions(+), 166 deletions(-)
> > >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c
> > >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c
> > >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c
> > >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c
> > >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c
> > >
> > > diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
> > > index 2521541..ffccaa7 100644
> > > --- a/gcc/config/arm/arm-protos.h
> > > +++ b/gcc/config/arm/arm-protos.h
> > > @@ -373,7 +373,7 @@ extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
> > >   extern bool arm_fusion_enabled_p (tune_params::fuse_ops);
> > >   extern bool arm_valid_symbolic_address_p (rtx);
> > >   extern bool arm_validize_comparison (rtx *, rtx *, rtx *);
> > > -extern bool arm_expand_vector_compare (rtx, rtx_code, rtx, rtx, bool);
> > > +extern bool arm_expand_vector_compare (rtx, rtx_code, rtx, rtx, bool, bool);
> > >   #endif /* RTX_CODE */
> > >
> > >   extern bool arm_gen_setmem (rtx *);
> > > diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
> > > index 0371d98..80e28ef 100644
> > > --- a/gcc/config/arm/arm.c
> > > +++ b/gcc/config/arm/arm.c
> > > @@ -30933,66 +30933,114 @@ arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
> > >      and return true if TARGET contains the inverse.  If !CAN_INVERT,
> > >      always store the result in TARGET, never its inverse.
> > >
> > > +   If VCOND_MVE, do not emit the vpsel instruction here, let arm_expand_vcond do
> > > +   it with the right destination type to avoid emitting two vpsel, one here and
> > > +   one in arm_expand_vcond.
> > > +
> > >      Note that the handling of floating-point comparisons is not
> > >      IEEE compliant.  */
> > >
> > >   bool
> > >   arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
> > > -                        bool can_invert)
> > > +                        bool can_invert, bool vcond_mve)
> > >   {
> > >     machine_mode cmp_result_mode = GET_MODE (target);
> > >     machine_mode cmp_mode = GET_MODE (op0);
> > >
> > >     bool inverted;
> > > -  switch (code)
> > > -    {
> > > -    /* For these we need to compute the inverse of the requested
> > > -       comparison.  */
> > > -    case UNORDERED:
> > > -    case UNLT:
> > > -    case UNLE:
> > > -    case UNGT:
> > > -    case UNGE:
> > > -    case UNEQ:
> > > -    case NE:
> > > -      code = reverse_condition_maybe_unordered (code);
> > > -      if (!can_invert)
> > > -     {
> > > -       /* Recursively emit the inverted comparison into a temporary
> > > -          and then store its inverse in TARGET.  This avoids reusing
> > > -          TARGET (which for integer NE could be one of the inputs).  */
> > > -       rtx tmp = gen_reg_rtx (cmp_result_mode);
> > > -       if (arm_expand_vector_compare (tmp, code, op0, op1, true))
> > > -         gcc_unreachable ();
> > > -       emit_insn (gen_rtx_SET (target, gen_rtx_NOT (cmp_result_mode, tmp)));
> > > -       return false;
> > > -     }
> > > -      inverted = true;
> > > -      break;
> > >
> > > -    default:
> > > +  /* MVE supports more comparisons than Neon.  */
> > > +  if (TARGET_HAVE_MVE)
> > >         inverted = false;
> > > -      break;
> > > -    }
> > > +  else
> > > +    switch (code)
> > > +      {
> > > +     /* For these we need to compute the inverse of the requested
> > > +        comparison.  */
> > > +      case UNORDERED:
> > > +      case UNLT:
> > > +      case UNLE:
> > > +      case UNGT:
> > > +      case UNGE:
> > > +      case UNEQ:
> > > +      case NE:
> > > +     code = reverse_condition_maybe_unordered (code);
> > > +     if (!can_invert)
> > > +       {
> > > +         /* Recursively emit the inverted comparison into a temporary
> > > +            and then store its inverse in TARGET.  This avoids reusing
> > > +            TARGET (which for integer NE could be one of the inputs).  */
> > > +         rtx tmp = gen_reg_rtx (cmp_result_mode);
> > > +         if (arm_expand_vector_compare (tmp, code, op0, op1, true, vcond_mve))
> > > +           gcc_unreachable ();
> > > +         emit_insn (gen_rtx_SET (target, gen_rtx_NOT (cmp_result_mode, tmp)));
> > > +         return false;
> > > +       }
> > > +     inverted = true;
> > > +     break;
> > > +
> > > +      default:
> > > +     inverted = false;
> > > +     break;
> > > +      }
> > >
> > >     switch (code)
> > >       {
> > > -    /* These are natively supported for zero comparisons, but otherwise
> > > -       require the operands to be swapped.  */
> > > +    /* These are natively supported by Neon for zero comparisons, but otherwise
> > > +       require the operands to be swapped. For MVE, we can only compare
> > > +       registers.  */
> > >       case LE:
> > >       case LT:
> > > -      if (op1 != CONST0_RTX (cmp_mode))
> > > -     {
> > > -       code = swap_condition (code);
> > > -       std::swap (op0, op1);
> > > -     }
> > > +      if (!TARGET_HAVE_MVE)
> > > +     if (op1 != CONST0_RTX (cmp_mode))
> > > +       {
> > > +         code = swap_condition (code);
> > > +         std::swap (op0, op1);
> > > +       }
> > >         /* Fall through.  */
> > >
> > > -    /* These are natively supported for both register and zero operands.  */
> > > +    /* These are natively supported by Neon for both register and zero
> > > +       operands. MVE supports registers only.  */
> > >       case EQ:
> > >       case GE:
> > >       case GT:
> > > -      emit_insn (gen_neon_vc (code, cmp_mode, target, op0, op1));
> > > +    case NE:
> > > +      if (TARGET_HAVE_MVE) {
> > > +     rtx vpr_p0;
> > > +     if (vcond_mve)
> > > +       vpr_p0 = target;
> > > +     else
> > > +       vpr_p0 = gen_reg_rtx (HImode);
> > > +
> > > +     switch (cmp_mode)
> > > +       {
> > > +       case E_V16QImode:
> > > +       case E_V8HImode:
> > > +       case E_V4SImode:
> > > +         emit_insn (gen_mve_vcmpq (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1)));
> > > +         break;
> > > +       case E_V8HFmode:
> > > +       case E_V4SFmode:
> > > +         if (TARGET_HAVE_MVE_FLOAT)
> > > +           emit_insn (gen_mve_vcmpq_f (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1)));
> > > +         else
> > > +           gcc_unreachable ();
> > > +         break;
> > > +       default:
> > > +         gcc_unreachable ();
> > > +       }
> > > +
> > > +     /* If we are not expanding a vcond, build the result here.  */
> > > +     if (!vcond_mve) {
> > > +       rtx zero = gen_reg_rtx (cmp_result_mode);
> > > +       rtx one = gen_reg_rtx (cmp_result_mode);
> > > +       emit_move_insn (zero, CONST0_RTX (cmp_result_mode));
> > > +       emit_move_insn (one, CONST1_RTX (cmp_result_mode));
> > > +       emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0));
> > > +     }
> > > +      }
> > > +      else
> > > +     emit_insn (gen_neon_vc (code, cmp_mode, target, op0, op1));
> > >         return inverted;
> > >
> > >       /* These are natively supported for register operands only.
> > > @@ -31000,16 +31048,50 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
> > >          or canonicalized by target-independent code.  */
> > >       case GEU:
> > >       case GTU:
> > > -      emit_insn (gen_neon_vc (code, cmp_mode, target,
> > > -                           op0, force_reg (cmp_mode, op1)));
> > > +      if (TARGET_HAVE_MVE) {
> > > +     rtx vpr_p0;
> > > +     if (vcond_mve)
> > > +       vpr_p0 = target;
> > > +     else
> > > +       vpr_p0 = gen_reg_rtx (HImode);
> > > +
> > > +     emit_insn (gen_mve_vcmpq (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1)));
> > > +     if (!vcond_mve) {
> > > +       rtx zero = gen_reg_rtx (cmp_result_mode);
> > > +       rtx one = gen_reg_rtx (cmp_result_mode);
> > > +       emit_move_insn (zero, CONST0_RTX (cmp_result_mode));
> > > +       emit_move_insn (one, CONST1_RTX (cmp_result_mode));
> > > +       emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0));
> > > +     }
> > > +      }
> > > +       else
> > > +     emit_insn (gen_neon_vc (code, cmp_mode, target,
> > > +                             op0, force_reg (cmp_mode, op1)));
> > >         return inverted;
> > >
> > >       /* These require the operands to be swapped and likewise do not
> > >          support comparisons with zero.  */
> > >       case LEU:
> > >       case LTU:
> > > -      emit_insn (gen_neon_vc (swap_condition (code), cmp_mode,
> > > -                           target, force_reg (cmp_mode, op1), op0));
> > > +      if (TARGET_HAVE_MVE) {
> > > +     rtx vpr_p0;
> > > +     if (vcond_mve)
> > > +       vpr_p0 = target;
> > > +     else
> > > +       vpr_p0 = gen_reg_rtx (HImode);
> > > +
> > > +     emit_insn (gen_mve_vcmpq (swap_condition (code), cmp_mode, vpr_p0, force_reg (cmp_mode, op1), op0));
> > > +     if (!vcond_mve) {
> > > +       rtx zero = gen_reg_rtx (cmp_result_mode);
> > > +       rtx one = gen_reg_rtx (cmp_result_mode);
> > > +       emit_move_insn (zero, CONST0_RTX (cmp_result_mode));
> > > +       emit_move_insn (one, CONST1_RTX (cmp_result_mode));
> > > +       emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0));
> > > +     }
> > > +      }
> > > +      else
> > > +     emit_insn (gen_neon_vc (swap_condition (code), cmp_mode,
> > > +                             target, force_reg (cmp_mode, op1), op0));
> > >         return inverted;
> > >
> > >       /* These need a combination of two comparisons.  */
> > > @@ -31021,8 +31103,8 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
> > >       rtx gt_res = gen_reg_rtx (cmp_result_mode);
> > >       rtx alt_res = gen_reg_rtx (cmp_result_mode);
> > >       rtx_code alt_code = (code == LTGT ? LT : LE);
> > > -     if (arm_expand_vector_compare (gt_res, GT, op0, op1, true)
> > > -         || arm_expand_vector_compare (alt_res, alt_code, op0, op1, true))
> > > +     if (arm_expand_vector_compare (gt_res, GT, op0, op1, true, vcond_mve)
> > > +         || arm_expand_vector_compare (alt_res, alt_code, op0, op1, true, vcond_mve))
> > >         gcc_unreachable ();
> > >       emit_insn (gen_rtx_SET (target, gen_rtx_IOR (cmp_result_mode,
> > >                                                    gt_res, alt_res)));
> > > @@ -31040,13 +31122,50 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
> > >   void
> > >   arm_expand_vcond (rtx *operands, machine_mode cmp_result_mode)
> > >   {
> > > -  rtx mask = gen_reg_rtx (cmp_result_mode);
> > > +  /* When expanding for MVE, we do not want to emit a (useless) vpsel in
> > > +     arm_expand_vector_compare, and another one here.  */
> > > +  bool vcond_mve=false;
> > > +  rtx mask;
> > > +
> > > +  if (TARGET_HAVE_MVE)
> > > +    {
> > > +      vcond_mve=true;
> > > +      mask = gen_reg_rtx (HImode);
> > > +    }
> > > +  else
> > > +    mask = gen_reg_rtx (cmp_result_mode);
> > > +
> > >     bool inverted = arm_expand_vector_compare (mask, GET_CODE (operands[3]),
> > > -                                          operands[4], operands[5], true);
> > > +                                          operands[4], operands[5], true, vcond_mve);
> > >     if (inverted)
> > >       std::swap (operands[1], operands[2]);
> > > +  if (TARGET_NEON)
> > >     emit_insn (gen_neon_vbsl (GET_MODE (operands[0]), operands[0],
> > >                           mask, operands[1], operands[2]));
> > > +  else
> > > +    {
> > > +      machine_mode cmp_mode = GET_MODE (operands[4]);
> > > +      rtx vpr_p0 = mask;
> > > +      rtx zero = gen_reg_rtx (cmp_mode);
> > > +      rtx one = gen_reg_rtx (cmp_mode);
> > > +      emit_move_insn (zero, CONST0_RTX (cmp_mode));
> > > +      emit_move_insn (one, CONST1_RTX (cmp_mode));
> > > +      switch (cmp_mode)
> > > +     {
> > > +     case E_V16QImode:
> > > +     case E_V8HImode:
> > > +     case E_V4SImode:
> > > +       emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, operands[0], one, zero, vpr_p0));
> > > +       break;
> > > +     case E_V8HFmode:
> > > +     case E_V4SFmode:
> > > +       if (TARGET_HAVE_MVE_FLOAT)
> > > +         emit_insn (gen_mve_vpselq_f (cmp_mode, operands[0], one, zero, vpr_p0));
> > > +       break;
> > > +     default:
> > > +       gcc_unreachable ();
> > > +     }
> > > +    }
> > >   }
> > >
> > >   #define MAX_VECT_LEN 16
> > > diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
> > > index 95df8bd..a128465 100644
> > > --- a/gcc/config/arm/iterators.md
> > > +++ b/gcc/config/arm/iterators.md
> > > @@ -1288,12 +1288,11 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U "u") (VREV16Q_S "s")
> > >                      (VCREATEQ_U "u") (VCREATEQ_S "s") (VSHRQ_N_S "s")
> > >                      (VSHRQ_N_U "u") (VCVTQ_N_FROM_F_S "s") (VSHLQ_U "u")
> > >                      (VCVTQ_N_FROM_F_U "u") (VADDLVQ_P_S "s") (VSHLQ_S "s")
> > > -                    (VADDLVQ_P_U "u") (VCMPNEQ_S "s")
> > > +                    (VADDLVQ_P_U "u")
> > >                      (VABDQ_M_S "s") (VABDQ_M_U "u") (VABDQ_S "s")
> > >                      (VABDQ_U "u") (VADDQ_N_S "s") (VADDQ_N_U "u")
> > >                      (VADDVQ_P_S "s") (VADDVQ_P_U "u") (VBRSRQ_N_S "s")
> > > -                    (VBRSRQ_N_U "u") (VCMPEQQ_S "s")
> > > -                    (VCMPEQQ_N_S "s") (VCMPNEQ_N_S "s")
> > > +                    (VBRSRQ_N_U "u")
> > >                      (VHADDQ_N_S "s") (VHADDQ_N_U "u") (VHADDQ_S "s")
> > >                      (VHADDQ_U "u") (VHSUBQ_N_S "s")  (VHSUBQ_N_U "u")
> > >                      (VHSUBQ_S "s") (VMAXQ_S "s") (VMAXQ_U "u") (VHSUBQ_U "u")
> > > @@ -1549,16 +1548,12 @@ (define_int_iterator VCREATEQ [VCREATEQ_U VCREATEQ_S])
> > >   (define_int_iterator VSHRQ_N [VSHRQ_N_S VSHRQ_N_U])
> > >   (define_int_iterator VCVTQ_N_FROM_F [VCVTQ_N_FROM_F_S VCVTQ_N_FROM_F_U])
> > >   (define_int_iterator VADDLVQ_P [VADDLVQ_P_S VADDLVQ_P_U])
> > > -(define_int_iterator VCMPNEQ [VCMPNEQ_S])
> > >   (define_int_iterator VSHLQ [VSHLQ_S VSHLQ_U])
> > >   (define_int_iterator VABDQ [VABDQ_S VABDQ_U])
> > >   (define_int_iterator VADDQ_N [VADDQ_N_S VADDQ_N_U])
> > >   (define_int_iterator VADDVAQ [VADDVAQ_S VADDVAQ_U])
> > >   (define_int_iterator VADDVQ_P [VADDVQ_P_U VADDVQ_P_S])
> > >   (define_int_iterator VBRSRQ_N [VBRSRQ_N_U VBRSRQ_N_S])
> > > -(define_int_iterator VCMPEQQ [VCMPEQQ_S])
> > > -(define_int_iterator VCMPEQQ_N [VCMPEQQ_N_S])
> > > -(define_int_iterator VCMPNEQ_N [VCMPNEQ_N_S])
> > >   (define_int_iterator VHADDQ [VHADDQ_S VHADDQ_U])
> > >   (define_int_iterator VHADDQ_N [VHADDQ_N_U VHADDQ_N_S])
> > >   (define_int_iterator VHSUBQ [VHSUBQ_S VHSUBQ_U])
> > > diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> > > index 7c846a4..97f0a87 100644
> > > --- a/gcc/config/arm/mve.md
> > > +++ b/gcc/config/arm/mve.md
> > > @@ -838,7 +838,7 @@ (define_insn "mve_vaddlvq_p_<supf>v4si"
> > >   ;;
> > >   ;; [vcmpneq_, vcmpcsq_, vcmpeqq_, vcmpgeq_, vcmpgtq_, vcmphiq_, vcmpleq_, vcmpltq_])
> > >   ;;
> > > -(define_insn "mve_vcmp<mve_cmp_op>q_<mode>"
> > > +(define_insn "@mve_vcmp<mve_cmp_op>q_<mode>"
> > >     [
> > >      (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> > >       (MVE_COMPARISONS:HI (match_operand:MVE_2 1 "s_register_operand" "w")
> > > @@ -1928,7 +1928,7 @@ (define_insn "mve_vcaddq<mve_rot><mode>"
> > >   ;;
> > >   ;; [vcmpeqq_f, vcmpgeq_f, vcmpgtq_f, vcmpleq_f, vcmpltq_f, vcmpneq_f])
> > >   ;;
> > > -(define_insn "mve_vcmp<mve_cmp_op>q_f<mode>"
> > > +(define_insn "@mve_vcmp<mve_cmp_op>q_f<mode>"
> > >     [
> > >      (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> > >       (MVE_FP_COMPARISONS:HI (match_operand:MVE_0 1 "s_register_operand" "w")
> > > @@ -1942,7 +1942,7 @@ (define_insn "mve_vcmp<mve_cmp_op>q_f<mode>"
> > >   ;;
> > >   ;; [vcmpeqq_n_f, vcmpgeq_n_f, vcmpgtq_n_f, vcmpleq_n_f, vcmpltq_n_f, vcmpneq_n_f])
> > >   ;;
> > > -(define_insn "mve_vcmp<mve_cmp_op>q_n_f<mode>"
> > > +(define_insn "@mve_vcmp<mve_cmp_op>q_n_f<mode>"
> > >     [
> > >      (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> > >       (MVE_FP_COMPARISONS:HI (match_operand:MVE_0 1 "s_register_operand" "w")
> > > @@ -3307,7 +3307,7 @@ (define_insn "mve_vnegq_m_s<mode>"
> > >   ;;
> > >   ;; [vpselq_u, vpselq_s])
> > >   ;;
> > > -(define_insn "mve_vpselq_<supf><mode>"
> > > +(define_insn "@mve_vpselq_<supf><mode>"
> > >     [
> > >      (set (match_operand:MVE_1 0 "s_register_operand" "=w")
> > >       (unspec:MVE_1 [(match_operand:MVE_1 1 "s_register_operand" "w")
> > > @@ -4402,7 +4402,7 @@ (define_insn "mve_vorrq_m_n_<supf><mode>"
> > >   ;;
> > >   ;; [vpselq_f])
> > >   ;;
> > > -(define_insn "mve_vpselq_f<mode>"
> > > +(define_insn "@mve_vpselq_f<mode>"
> > >     [
> > >      (set (match_operand:MVE_0 0 "s_register_operand" "=w")
> > >       (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w")
> > > diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> > > index fec2cc9..6660846 100644
> > > --- a/gcc/config/arm/neon.md
> > > +++ b/gcc/config/arm/neon.md
> > > @@ -1416,93 +1416,6 @@ (define_insn "*us_sub<mode>_neon"
> > >     [(set_attr "type" "neon_qsub<q>")]
> > >   )
> > >
> > > -(define_expand "vec_cmp<mode><v_cmp_result>"
> > > -  [(set (match_operand:<V_cmp_result> 0 "s_register_operand")
> > > -     (match_operator:<V_cmp_result> 1 "comparison_operator"
> > > -       [(match_operand:VDQW 2 "s_register_operand")
> > > -        (match_operand:VDQW 3 "reg_or_zero_operand")]))]
> > > -  "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > > -{
> > > -  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
> > > -                          operands[2], operands[3], false);
> > > -  DONE;
> > > -})
> > > -
> > > -(define_expand "vec_cmpu<mode><mode>"
> > > -  [(set (match_operand:VDQIW 0 "s_register_operand")
> > > -     (match_operator:VDQIW 1 "comparison_operator"
> > > -       [(match_operand:VDQIW 2 "s_register_operand")
> > > -        (match_operand:VDQIW 3 "reg_or_zero_operand")]))]
> > > -  "TARGET_NEON"
> > > -{
> > > -  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
> > > -                          operands[2], operands[3], false);
> > > -  DONE;
> > > -})
> > > -
> > > -;; Conditional instructions.  These are comparisons with conditional moves for
> > > -;; vectors.  They perform the assignment:
> > > -;;
> > > -;;     Vop0 = (Vop4 <op3> Vop5) ? Vop1 : Vop2;
> > > -;;
> > > -;; where op3 is <, <=, ==, !=, >= or >.  Operations are performed
> > > -;; element-wise.
> > > -
> > > -(define_expand "vcond<mode><mode>"
> > > -  [(set (match_operand:VDQW 0 "s_register_operand")
> > > -     (if_then_else:VDQW
> > > -       (match_operator 3 "comparison_operator"
> > > -         [(match_operand:VDQW 4 "s_register_operand")
> > > -          (match_operand:VDQW 5 "reg_or_zero_operand")])
> > > -       (match_operand:VDQW 1 "s_register_operand")
> > > -       (match_operand:VDQW 2 "s_register_operand")))]
> > > -  "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > > -{
> > > -  arm_expand_vcond (operands, <V_cmp_result>mode);
> > > -  DONE;
> > > -})
> > > -
> > > -(define_expand "vcond<V_cvtto><mode>"
> > > -  [(set (match_operand:<V_CVTTO> 0 "s_register_operand")
> > > -     (if_then_else:<V_CVTTO>
> > > -       (match_operator 3 "comparison_operator"
> > > -         [(match_operand:V32 4 "s_register_operand")
> > > -          (match_operand:V32 5 "reg_or_zero_operand")])
> > > -       (match_operand:<V_CVTTO> 1 "s_register_operand")
> > > -       (match_operand:<V_CVTTO> 2 "s_register_operand")))]
> > > -  "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > > -{
> > > -  arm_expand_vcond (operands, <V_cmp_result>mode);
> > > -  DONE;
> > > -})
> > > -
> > > -(define_expand "vcondu<mode><v_cmp_result>"
> > > -  [(set (match_operand:VDQW 0 "s_register_operand")
> > > -     (if_then_else:VDQW
> > > -       (match_operator 3 "arm_comparison_operator"
> > > -         [(match_operand:<V_cmp_result> 4 "s_register_operand")
> > > -          (match_operand:<V_cmp_result> 5 "reg_or_zero_operand")])
> > > -       (match_operand:VDQW 1 "s_register_operand")
> > > -       (match_operand:VDQW 2 "s_register_operand")))]
> > > -  "TARGET_NEON"
> > > -{
> > > -  arm_expand_vcond (operands, <V_cmp_result>mode);
> > > -  DONE;
> > > -})
> > > -
> > > -(define_expand "vcond_mask_<mode><v_cmp_result>"
> > > -  [(set (match_operand:VDQW 0 "s_register_operand")
> > > -     (if_then_else:VDQW
> > > -       (match_operand:<V_cmp_result> 3 "s_register_operand")
> > > -       (match_operand:VDQW 1 "s_register_operand")
> > > -       (match_operand:VDQW 2 "s_register_operand")))]
> > > -  "TARGET_NEON"
> > > -{
> > > -  emit_insn (gen_neon_vbsl<mode> (operands[0], operands[3], operands[1],
> > > -                               operands[2]));
> > > -  DONE;
> > > -})
> > > -
> > >   ;; Patterns for builtins.
> > >
> > >   ; good for plain vadd, vaddq.
> > > diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
> > > index 07ca53b..0778db1 100644
> > > --- a/gcc/config/arm/unspecs.md
> > > +++ b/gcc/config/arm/unspecs.md
> > > @@ -596,8 +596,6 @@ (define_c_enum "unspec" [
> > >     VCVTQ_N_FROM_F_U
> > >     VADDLVQ_P_S
> > >     VADDLVQ_P_U
> > > -  VCMPNEQ_U
> > > -  VCMPNEQ_S
> > >     VSHLQ_S
> > >     VSHLQ_U
> > >     VABDQ_S
> > > @@ -605,9 +603,6 @@ (define_c_enum "unspec" [
> > >     VADDVAQ_S
> > >     VADDVQ_P_S
> > >     VBRSRQ_N_S
> > > -  VCMPEQQ_S
> > > -  VCMPEQQ_N_S
> > > -  VCMPNEQ_N_S
> > >     VHADDQ_S
> > >     VHADDQ_N_S
> > >     VHSUBQ_S
> > > @@ -645,9 +640,6 @@ (define_c_enum "unspec" [
> > >     VADDVAQ_U
> > >     VADDVQ_P_U
> > >     VBRSRQ_N_U
> > > -  VCMPEQQ_U
> > > -  VCMPEQQ_N_U
> > > -  VCMPNEQ_N_U
> > >     VHADDQ_U
> > >     VHADDQ_N_U
> > >     VHSUBQ_U
> > > @@ -680,14 +672,6 @@ (define_c_enum "unspec" [
> > >     VSHLQ_R_U
> > >     VSUBQ_U
> > >     VSUBQ_N_U
> > > -  VCMPGEQ_N_S
> > > -  VCMPGEQ_S
> > > -  VCMPGTQ_N_S
> > > -  VCMPGTQ_S
> > > -  VCMPLEQ_N_S
> > > -  VCMPLEQ_S
> > > -  VCMPLTQ_N_S
> > > -  VCMPLTQ_S
> > >     VHCADDQ_ROT270_S
> > >     VHCADDQ_ROT90_S
> > >     VMAXAQ_S
> > > @@ -702,10 +686,6 @@ (define_c_enum "unspec" [
> > >     VQRDMULHQ_N_S
> > >     VQRDMULHQ_S
> > >     VQSHLUQ_N_S
> > > -  VCMPCSQ_N_U
> > > -  VCMPCSQ_U
> > > -  VCMPHIQ_N_U
> > > -  VCMPHIQ_U
> > >     VABDQ_M_S
> > >     VABDQ_M_U
> > >     VABDQ_F
> > > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> > > index 0b2b3b1..034b48b 100644
> > > --- a/gcc/config/arm/vec-common.md
> > > +++ b/gcc/config/arm/vec-common.md
> > > @@ -362,3 +362,110 @@ (define_expand "vlshr<mode>3"
> > >         DONE;
> > >       }
> > >   })
> > > +
> > > +(define_expand "vec_cmp<mode><v_cmp_result>"
> > > +  [(set (match_operand:<V_cmp_result> 0 "s_register_operand")
> > > +     (match_operator:<V_cmp_result> 1 "comparison_operator"
> > > +       [(match_operand:VDQW 2 "s_register_operand")
> > > +        (match_operand:VDQW 3 "reg_or_zero_operand")]))]
> > > +  "ARM_HAVE_<MODE>_ARITH
> > > +   && !TARGET_REALLY_IWMMXT
> > > +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > > +{
> > > +  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
> > > +                          operands[2], operands[3], false, false);
> > > +  DONE;
> > > +})
> > > +
> > > +(define_expand "vec_cmpu<mode><mode>"
> > > +  [(set (match_operand:VDQIW 0 "s_register_operand")
> > > +     (match_operator:VDQIW 1 "comparison_operator"
> > > +       [(match_operand:VDQIW 2 "s_register_operand")
> > > +        (match_operand:VDQIW 3 "reg_or_zero_operand")]))]
> > > +  "ARM_HAVE_<MODE>_ARITH
> > > +   && !TARGET_REALLY_IWMMXT"
> > > +{
> > > +  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
> > > +                          operands[2], operands[3], false, false);
> > > +  DONE;
> > > +})
> > > +
> > > +;; Conditional instructions.  These are comparisons with conditional moves for
> > > +;; vectors.  They perform the assignment:
> > > +;;
> > > +;;     Vop0 = (Vop4 <op3> Vop5) ? Vop1 : Vop2;
> > > +;;
> > > +;; where op3 is <, <=, ==, !=, >= or >.  Operations are performed
> > > +;; element-wise.
> > > +
> > > +(define_expand "vcond<mode><mode>"
> > > +  [(set (match_operand:VDQW 0 "s_register_operand")
> > > +     (if_then_else:VDQW
> > > +       (match_operator 3 "comparison_operator"
> > > +         [(match_operand:VDQW 4 "s_register_operand")
> > > +          (match_operand:VDQW 5 "reg_or_zero_operand")])
> > > +       (match_operand:VDQW 1 "s_register_operand")
> > > +       (match_operand:VDQW 2 "s_register_operand")))]
> > > +  "ARM_HAVE_<MODE>_ARITH
> > > +   && !TARGET_REALLY_IWMMXT
> > > +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > > +{
> > > +  arm_expand_vcond (operands, <V_cmp_result>mode);
> > > +  DONE;
> > > +})
> > > +
> > > +(define_expand "vcond<V_cvtto><mode>"
> > > +  [(set (match_operand:<V_CVTTO> 0 "s_register_operand")
> > > +     (if_then_else:<V_CVTTO>
> > > +       (match_operator 3 "comparison_operator"
> > > +         [(match_operand:V32 4 "s_register_operand")
> > > +          (match_operand:V32 5 "reg_or_zero_operand")])
> > > +       (match_operand:<V_CVTTO> 1 "s_register_operand")
> > > +       (match_operand:<V_CVTTO> 2 "s_register_operand")))]
> > > +  "ARM_HAVE_<MODE>_ARITH
> > > +   && !TARGET_REALLY_IWMMXT
> > > +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > > +{
> > > +  arm_expand_vcond (operands, <V_cmp_result>mode);
> > > +  DONE;
> > > +})
> > > +
> > > +(define_expand "vcondu<mode><v_cmp_result>"
> > > +  [(set (match_operand:VDQW 0 "s_register_operand")
> > > +     (if_then_else:VDQW
> > > +       (match_operator 3 "arm_comparison_operator"
> > > +         [(match_operand:<V_cmp_result> 4 "s_register_operand")
> > > +          (match_operand:<V_cmp_result> 5 "reg_or_zero_operand")])
> > > +       (match_operand:VDQW 1 "s_register_operand")
> > > +       (match_operand:VDQW 2 "s_register_operand")))]
> > > +  "ARM_HAVE_<MODE>_ARITH
> > > +   && !TARGET_REALLY_IWMMXT"
> > > +{
> > > +  arm_expand_vcond (operands, <V_cmp_result>mode);
> > > +  DONE;
> > > +})
> > > +
> > > +(define_expand "vcond_mask_<mode><v_cmp_result>"
> > > +  [(set (match_operand:VDQW 0 "s_register_operand")
> > > +        (if_then_else:VDQW
> > > +          (match_operand:<V_cmp_result> 3 "s_register_operand")
> > > +          (match_operand:VDQW 1 "s_register_operand")
> > > +          (match_operand:VDQW 2 "s_register_operand")))]
> > > +  "ARM_HAVE_<MODE>_ARITH
> > > +   && !TARGET_REALLY_IWMMXT"
> > > +{
> > > +  if (TARGET_NEON)
> > > +    {
> > > +      emit_insn (gen_neon_vbsl (<MODE>mode, operands[0], operands[3],
> > > +                                operands[1], operands[2]));
> > > +    }
> > > +  else if (TARGET_HAVE_MVE)
> > > +    {
> > > +      emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0],
> > > +                                 operands[1], operands[2], operands[3]));
> > > +    }
> > > +  else
> > > +    gcc_unreachable ();
> > > +
> > > +  DONE;
> > > +})
> > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c
> > > new file mode 100644
> > > index 0000000..029c931
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c
> > > @@ -0,0 +1,80 @@
> > > +/* { dg-do assemble } */
> > > +/* { dg-require-effective-target arm_v8_1m_mve_ok } */
> > > +/* { dg-add-options arm_v8_1m_mve } */
> > > +/* { dg-additional-options "-O3" } */
> > > +
> > > +/* Integer tests.  */
> > > +
> > > +#define COMPARE_REG(NAME, OP, TYPE) \
> > > +  TYPE \
> > > +  cmp_##NAME##_##TYPE##_reg (TYPE a, TYPE b) \
> > > +  { \
> > > +    return a OP b; \
> > > +  }
> > > +
> > > +#define COMPARE_REG_AND_ZERO(NAME, OP, TYPE) \
> > > +  COMPARE_REG (NAME, OP, TYPE) \
> > > +  \
> > > +  TYPE \
> > > +  cmp_##NAME##_##TYPE##_zero (TYPE a) \
> > > +  { \
> > > +    return a OP (TYPE) {}; \
> > > +  }
> > > +
> > > +#define COMPARE_TYPE(TYPE, COMPARE_ORDERED) \
> > > +  COMPARE_REG_AND_ZERO (eq, ==, TYPE) \
> > > +  COMPARE_REG_AND_ZERO (ne, !=, TYPE) \
> > > +  COMPARE_ORDERED (lt, <, TYPE) \
> > > +  COMPARE_ORDERED (le, <=, TYPE) \
> > > +  COMPARE_ORDERED (gt, >, TYPE) \
> > > +  COMPARE_ORDERED (ge, >=, TYPE)
> > > +
> > > +#define TEST_TYPE(NAME, ELEM, COMPARE_ORDERED, SIZE)  \
> > > +  typedef ELEM NAME##SIZE __attribute__((vector_size(SIZE))); \
> > > +  COMPARE_TYPE (NAME##SIZE, COMPARE_ORDERED)
> > > +
> > > +/* 64-bit vectors, not vectorized.  */
> > > +TEST_TYPE (vs8, __INT8_TYPE__, COMPARE_REG_AND_ZERO, 8)
> > > +TEST_TYPE (vu8, __UINT8_TYPE__, COMPARE_REG, 8)
> > > +TEST_TYPE (vs16, __INT16_TYPE__, COMPARE_REG_AND_ZERO, 8)
> > > +TEST_TYPE (vu16, __UINT16_TYPE__, COMPARE_REG, 8)
> > > +TEST_TYPE (vs32, __INT32_TYPE__, COMPARE_REG_AND_ZERO, 8)
> > > +TEST_TYPE (vu32, __UINT32_TYPE__, COMPARE_REG, 8)
> > > +
> > > +/* 128-bit vectors.  */
> > > +TEST_TYPE (vs8, __INT8_TYPE__, COMPARE_REG_AND_ZERO, 16)
> > > +TEST_TYPE (vu8, __UINT8_TYPE__, COMPARE_REG, 16)
> > > +TEST_TYPE (vs16, __INT16_TYPE__, COMPARE_REG_AND_ZERO, 16)
> > > +TEST_TYPE (vu16, __UINT16_TYPE__, COMPARE_REG, 16)
> > > +TEST_TYPE (vs32, __INT32_TYPE__, COMPARE_REG_AND_ZERO, 16)
> > > +TEST_TYPE (vu32, __UINT32_TYPE__, COMPARE_REG, 16)
> > > +
> > > +/* { 8 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.i8  eq, q[0-9]+, q[0-9]+\n} 4 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.i8  ne, q[0-9]+, q[0-9]+\n} 4 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.s8  lt, q[0-9]+, q[0-9]+\n} 2 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.s8  le, q[0-9]+, q[0-9]+\n} 2 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.s8  gt, q[0-9]+, q[0-9]+\n} 2 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.s8  ge, q[0-9]+, q[0-9]+\n} 2 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.u8  hi, q[0-9]+, q[0-9]+\n} 2 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.u8  cs, q[0-9]+, q[0-9]+\n} 2 } } */
> > > +
> > > +/* { 16 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.i16  eq, q[0-9]+, q[0-9]+\n} 4 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.i16  ne, q[0-9]+, q[0-9]+\n} 4 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.s16  lt, q[0-9]+, q[0-9]+\n} 2 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.s16  le, q[0-9]+, q[0-9]+\n} 2 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.s16  gt, q[0-9]+, q[0-9]+\n} 2 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.s16  ge, q[0-9]+, q[0-9]+\n} 2 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.u16  hi, q[0-9]+, q[0-9]+\n} 2 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.u16  cs, q[0-9]+, q[0-9]+\n} 2 } } */
> > > +
> > > +/* { 32 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.i32  eq, q[0-9]+, q[0-9]+\n} 4 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.i32  ne, q[0-9]+, q[0-9]+\n} 4 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.s32  lt, q[0-9]+, q[0-9]+\n} 2 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.s32  le, q[0-9]+, q[0-9]+\n} 2 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.s32  gt, q[0-9]+, q[0-9]+\n} 2 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.s32  ge, q[0-9]+, q[0-9]+\n} 2 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.u32  hi, q[0-9]+, q[0-9]+\n} 2 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.u32  cs, q[0-9]+, q[0-9]+\n} 2 } } */
> > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c
> > > new file mode 100644
> > > index 0000000..8515195
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c
> > > @@ -0,0 +1,38 @@
> > > +/* { dg-do assemble } */
> > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> > > +/* { dg-add-options arm_v8_1m_mve_fp } */
> > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
> > > +
> > > +/* float 32 tests.  */
> > > +
> > > +#ifndef ELEM_TYPE
> > > +#define ELEM_TYPE float
> > > +#endif
> > > +#ifndef INT_ELEM_TYPE
> > > +#define INT_ELEM_TYPE __INT32_TYPE__
> > > +#endif
> > > +
> > > +#define COMPARE(NAME, OP)                    \
> > > +  int_vec                                    \
> > > +  cmp_##NAME##_reg (vec a, vec b)            \
> > > +  {                                          \
> > > +    return a OP b;                           \
> > > +  }
> > > +
> > > +typedef INT_ELEM_TYPE int_vec __attribute__((vector_size(16)));
> > > +typedef ELEM_TYPE vec __attribute__((vector_size(16)));
> > > +
> > > +COMPARE (eq, ==)
> > > +COMPARE (ne, !=)
> > > +COMPARE (lt, <)
> > > +COMPARE (le, <=)
> > > +COMPARE (gt, >)
> > > +COMPARE (ge, >=)
> > > +
> > > +/* eq, ne, lt, le, gt, ge.  */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.f32\teq, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tne, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tle, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tge, q[0-9]+, q[0-9]+\n} 1 } } */
> > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c
> > > new file mode 100644
> > > index 0000000..7774972
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c
> > > @@ -0,0 +1,69 @@
> > > +/* { dg-do assemble } */
> > > +/* { dg-require-effective-target arm_v8_1m_mve_ok } */
> > > +/* { dg-add-options arm_v8_1m_mve } */
> > > +/* { dg-additional-options "-O3" } */
> > > +
> > > +#define COMPARE_REG(NAME, OP, TYPE, SCALAR)    \
> > > +  TYPE                                                 \
> > > +  cmp_##NAME##_##TYPE##_scalar (TYPE a, SCALAR b) \
> > > +  {                                            \
> > > +    return a OP b;                             \
> > > +  }
> > > +
> > > +#define COMPARE_TYPE(SCALAR, TYPE)                           \
> > > +  COMPARE_REG (eq, ==, TYPE, SCALAR)                         \
> > > +  COMPARE_REG (ne, !=, TYPE, SCALAR)                         \
> > > +  COMPARE_REG (lt, <, TYPE, SCALAR)                          \
> > > +  COMPARE_REG (le, <=, TYPE, SCALAR)                         \
> > > +  COMPARE_REG (gt, >, TYPE, SCALAR)                          \
> > > +  COMPARE_REG (ge, >=, TYPE, SCALAR)
> > > +
> > > +#define TEST_TYPE(NAME, ELEM, SIZE)                        \
> > > +  typedef ELEM NAME##SIZE __attribute__((vector_size(SIZE))); \
> > > +  COMPARE_TYPE (ELEM, NAME##SIZE)
> > > +
> > > +/* 64-bit vectors, not vectorized.  */
> > > +TEST_TYPE (vs8, __INT8_TYPE__, 8)
> > > +TEST_TYPE (vu8, __UINT8_TYPE__, 8)
> > > +TEST_TYPE (vs16, __INT16_TYPE__, 8)
> > > +TEST_TYPE (vu16, __UINT16_TYPE__, 8)
> > > +TEST_TYPE (vs32, __INT32_TYPE__, 8)
> > > +TEST_TYPE (vu32, __UINT32_TYPE__, 8)
> > > +
> > > +/* 128-bit vectors.  */
> > > +TEST_TYPE (vs8, __INT8_TYPE__, 16)
> > > +TEST_TYPE (vu8, __UINT8_TYPE__, 16)
> > > +TEST_TYPE (vs16, __INT16_TYPE__, 16)
> > > +TEST_TYPE (vu16, __UINT16_TYPE__, 16)
> > > +TEST_TYPE (vs32, __INT32_TYPE__, 16)
> > > +TEST_TYPE (vu32, __UINT32_TYPE__, 16)
> > > +
> > > +/* { 8 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.i8  eq, q[0-9]+, q[0-9]+\n} 2 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.i8  ne, q[0-9]+, q[0-9]+\n} 2 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.s8  lt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.s8  le, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.s8  gt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.s8  ge, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.u8  hi, q[0-9]+, q[0-9]+\n} 2 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.u8  cs, q[0-9]+, q[0-9]+\n} 2 } } */
> > > +
> > > +/* { 16 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.i16  eq, q[0-9]+, q[0-9]+\n} 2 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.i16  ne, q[0-9]+, q[0-9]+\n} 2 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.s16  lt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.s16  le, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.s16  gt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.s16  ge, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.u16  hi, q[0-9]+, q[0-9]+\n} 2 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.u16  cs, q[0-9]+, q[0-9]+\n} 2 } } */
> > > +
> > > +/* { 32 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.i32  eq, q[0-9]+, q[0-9]+\n} 2 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.i32  ne, q[0-9]+, q[0-9]+\n} 2 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.s32  lt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.s32  le, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.s32  gt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.s32  ge, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.u32  hi, q[0-9]+, q[0-9]+\n} 2 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.u32  cs, q[0-9]+, q[0-9]+\n} 2 } } */
> > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c
> > > new file mode 100644
> > > index 0000000..4ed449e
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c
> > > @@ -0,0 +1,30 @@
> > > +/* { dg-do assemble } */
> > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> > > +/* { dg-add-options arm_v8_1m_mve_fp } */
> > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
> > > +
> > > +#include <stdint.h>
> > > +
> > > +#define NB 4
> > > +
> > > +#define FUNC(OP, NAME)                                                       \
> > > +  void test_ ## NAME ##_f (float * __restrict__ dest, float *a, float *b) { \
> > > +    int i;                                                           \
> > > +    for (i=0; i<NB; i++) {                                           \
> > > +      dest[i] = a[i] OP b[i];                                                \
> > > +    }                                                                        \
> > > +  }
> > > +
> > > +FUNC(==, vcmpeq)
> > > +FUNC(!=, vcmpne)
> > > +FUNC(<, vcmplt)
> > > +FUNC(<=, vcmple)
> > > +FUNC(>, vcmpgt)
> > > +FUNC(>=, vcmpge)
> > > +
> > > +/* { dg-final { scan-assembler-times {\tvcmp.f32\teq, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tne, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tle, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tge, q[0-9]+, q[0-9]+\n} 1 } } */
> > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c
> > > new file mode 100644
> > > index 0000000..8da15e7
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c
> > > @@ -0,0 +1,50 @@
> > > +/* { dg-do assemble } */
> > > +/* { dg-require-effective-target arm_v8_1m_mve_ok } */
> > > +/* { dg-add-options arm_v8_1m_mve } */
> > > +/* { dg-additional-options "-O3" } */
> > > +
> > > +#include <stdint.h>
> > > +
> > > +#define FUNC(SIGN, TYPE, BITS, NB, OP, NAME)                         \
> > > +  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t * __restrict__ dest, TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \
> > > +    int i;                                                           \
> > > +    for (i=0; i<NB; i++) {                                           \
> > > +      dest[i] = a[i] OP b[i];                                                \
> > > +    }                                                                        \
> > > +}
> > > +
> > > +#define ALL_FUNCS(OP, NAME) \
> > > +  FUNC(s, int, 32, 2, OP, NAME)                      \
> > > +  FUNC(u, uint, 32, 2, OP, NAME)             \
> > > +  FUNC(s, int, 16, 4, OP, NAME)                      \
> > > +  FUNC(u, uint, 16, 4, OP, NAME)             \
> > > +  FUNC(s, int, 8, 8, OP, NAME)                       \
> > > +  FUNC(u, uint, 8, 8, OP, NAME)                      \
> > > +  FUNC(s, int, 32, 4, OP, NAME)                      \
> > > +  FUNC(u, uint, 32, 4, OP, NAME)             \
> > > +  FUNC(s, int, 16, 8, OP, NAME)                      \
> > > +  FUNC(u, uint, 16, 8, OP, NAME)             \
> > > +  FUNC(s, int, 8, 16, OP, NAME)                      \
> > > +  FUNC(u, uint, 8, 16, OP, NAME)
> > > +
> > > +ALL_FUNCS(==, vcmpeq)
> > > +ALL_FUNCS(!=, vcmpne)
> > > +ALL_FUNCS(<, vcmplt)
> > > +ALL_FUNCS(<=, vcmple)
> > > +ALL_FUNCS(>, vcmpgt)
> > > +ALL_FUNCS(>=, vcmpge)
> > > +
> > > +/* MVE has only 128-bit vectors, so we can vectorize only half of the
> > > +   functions above.  */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.i[0-9]+  eq, q[0-9]+, q[0-9]+\n} 6 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.i[0-9]+  ne, q[0-9]+, q[0-9]+\n} 6 } } */
> > > +
> > > +/* lt, le, gt, ge apply to signed types, cs and hi to unsigned types.  */
> > > +/* lt and le with unsigned types are replaced with the opposite condition, hence
> > > +   the double number of matches for cs and hi.  */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.s[0-9]+  lt, q[0-9]+, q[0-9]+\n} 3 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.s[0-9]+  le, q[0-9]+, q[0-9]+\n} 3 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.s[0-9]+  gt, q[0-9]+, q[0-9]+\n} 3 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.s[0-9]+  ge, q[0-9]+, q[0-9]+\n} 3 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.u[0-9]+  cs, q[0-9]+, q[0-9]+\n} 6 } } */
> > > +/* { dg-final { scan-assembler-times {\tvcmp.u[0-9]+  hi, q[0-9]+, q[0-9]+\n} 6 } } */

[-- Attachment #2: v2-0006-arm-Auto-vectorization-for-MVE-vcmp.patch --]
[-- Type: text/x-patch, Size: 39088 bytes --]

From 6f7184595a5c856b12031abab58d4a7be021d48d Mon Sep 17 00:00:00 2001
From: Christophe Lyon <christophe.lyon@linaro.org>
Date: Mon, 8 Mar 2021 13:17:27 +0000
Subject: [PATCH v2 6/9] arm: Auto-vectorization for MVE: vcmp

Since MVE has a different set of vector comparison operators from
Neon, we have to update the expansion to take the new ones into
account, for instance 'NE', for which MVE does not require using 'EQ'
with the inverted condition.

Conversely, Neon supports comparisons with #0, but MVE does not.

For:
typedef long int vs32 __attribute__((vector_size(16)));
vs32 cmp_eq_vs32_reg (vs32 a, vs32 b) { return a == b; }

we now generate:
cmp_eq_vs32_reg:
	vldr.64 d4, .L123       @ 8     [c=8 l=4]  *mve_movv4si/8
	vldr.64 d5, .L123+8
	vldr.64 d6, .L123+16    @ 9     [c=8 l=4]  *mve_movv4si/8
	vldr.64 d7, .L123+24
	vcmp.i32  eq, q0, q1    @ 7     [c=16 l=4]  mve_vcmpeqq_v4si
	vpsel q0, q3, q2        @ 15    [c=8 l=4]  mve_vpselq_sv4si
	bx      lr      @ 26    [c=8 l=4]  *thumb2_return
.L124:
	.align  3
.L123:
	.word   0
	.word   0
	.word   0
	.word   0
	.word   1
	.word   1
	.word   1
	.word   1

For some reason emit_move_insn (zero, CONST0_RTX (cmp_mode)) produces
a pair of vldr instructions instead of "vmov.i32 qX, #0".

2021-03-01  Christophe Lyon  <christophe.lyon@linaro.org>

	gcc/
	* config/arm/arm-protos.h (arm_expand_vector_compare): Update
	prototype.
	* config/arm/arm.c (arm_expand_vector_compare): Add support for
	MVE.
	(arm_expand_vcond): Likewise.
	* config/arm/iterators.md (supf): Remove VCMPNEQ_S, VCMPEQQ_S,
	VCMPEQQ_N_S, VCMPNEQ_N_S.
	(VCMPNEQ, VCMPEQQ, VCMPEQQ_N, VCMPNEQ_N): Remove.
	* config/arm/mve.md (@mve_vcmp<mve_cmp_op>q_<mode>): Add '@' prefix.
	(@mve_vcmp<mve_cmp_op>q_f<mode>): Likewise.
	(@mve_vcmp<mve_cmp_op>q_n_f<mode>): Likewise.
	(@mve_vpselq_<supf><mode>): Likewise.
	(@mve_vpselq_f<mode>): Likewise.
	* config/arm/neon.md (vec_cmp<mode><v_cmp_result>): Enable for MVE
	and move to vec-common.md.
	(vec_cmpu<mode><mode>): Likewise.
	(vcond<mode><mode>): Likewise.
	(vcond<V_cvtto><mode>): Likewise.
	(vcondu<mode><v_cmp_result>): Likewise.
	(vcond_mask_<mode><v_cmp_result>): Likewise.
	* config/arm/unspecs.md (VCMPNEQ_U, VCMPNEQ_S, VCMPEQQ_S)
	(VCMPEQQ_N_S, VCMPNEQ_N_S, VCMPEQQ_U, VCMPEQQ_N_U, VCMPNEQ_N_U)
	(VCMPGEQ_N_S, VCMPGEQ_S, VCMPGTQ_N_S, VCMPGTQ_S, VCMPLEQ_N_S)
	(VCMPLEQ_S, VCMPLTQ_N_S, VCMPLTQ_S, VCMPCSQ_N_U, VCMPCSQ_U)
	(VCMPHIQ_N_U, VCMPHIQ_U): Remove.
	* config/arm/vec-common.md (vec_cmp<mode><v_cmp_result>): Moved
	from neon.md.
	(vec_cmpu<mode><mode>): Likewise.
	(vcond<mode><mode>): Likewise.
	(vcond<V_cvtto><mode>): Likewise.
	(vcondu<mode><v_cmp_result>): Likewise.
	(vcond_mask_<mode><v_cmp_result>): Likewise. Added unsafe math
	condition.

	gcc/testsuite
	* gcc.target/arm/simd/mve-compare-1.c: New test with GCC vectors.
	* gcc.target/arm/simd/mve-compare-2.c: New test with GCC vectors.
	* gcc.target/arm/simd/mve-compare-scalar-1.c: New test with GCC
	vectors.
	* gcc.target/arm/simd/mve-vcmp-f32.c: New test for
	auto-vectorization.
	* gcc.target/arm/simd/mve-vcmp.c: New test for auto-vectorization.
---
 gcc/config/arm/arm-protos.h                        |   2 +-
 gcc/config/arm/arm.c                               | 211 ++++++++++++++++-----
 gcc/config/arm/iterators.md                        |   9 +-
 gcc/config/arm/mve.md                              |  10 +-
 gcc/config/arm/neon.md                             |  87 ---------
 gcc/config/arm/unspecs.md                          |  20 --
 gcc/config/arm/vec-common.md                       | 108 +++++++++++
 gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c  |  80 ++++++++
 gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c  |  38 ++++
 .../gcc.target/arm/simd/mve-compare-scalar-1.c     |  69 +++++++
 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c   |  30 +++
 gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c       |  50 +++++
 12 files changed, 548 insertions(+), 166 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 2521541..ffccaa7 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -373,7 +373,7 @@ extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
 extern bool arm_fusion_enabled_p (tune_params::fuse_ops);
 extern bool arm_valid_symbolic_address_p (rtx);
 extern bool arm_validize_comparison (rtx *, rtx *, rtx *);
-extern bool arm_expand_vector_compare (rtx, rtx_code, rtx, rtx, bool);
+extern bool arm_expand_vector_compare (rtx, rtx_code, rtx, rtx, bool, bool);
 #endif /* RTX_CODE */
 
 extern bool arm_gen_setmem (rtx *);
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 0371d98..80e28ef 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -30933,66 +30933,114 @@ arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
    and return true if TARGET contains the inverse.  If !CAN_INVERT,
    always store the result in TARGET, never its inverse.
 
+   If VCOND_MVE, do not emit the vpsel instruction here, let arm_expand_vcond do
+   it with the right destination type to avoid emitting two vpsel, one here and
+   one in arm_expand_vcond.
+
    Note that the handling of floating-point comparisons is not
    IEEE compliant.  */
 
 bool
 arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
-			   bool can_invert)
+			   bool can_invert, bool vcond_mve)
 {
   machine_mode cmp_result_mode = GET_MODE (target);
   machine_mode cmp_mode = GET_MODE (op0);
 
   bool inverted;
-  switch (code)
-    {
-    /* For these we need to compute the inverse of the requested
-       comparison.  */
-    case UNORDERED:
-    case UNLT:
-    case UNLE:
-    case UNGT:
-    case UNGE:
-    case UNEQ:
-    case NE:
-      code = reverse_condition_maybe_unordered (code);
-      if (!can_invert)
-	{
-	  /* Recursively emit the inverted comparison into a temporary
-	     and then store its inverse in TARGET.  This avoids reusing
-	     TARGET (which for integer NE could be one of the inputs).  */
-	  rtx tmp = gen_reg_rtx (cmp_result_mode);
-	  if (arm_expand_vector_compare (tmp, code, op0, op1, true))
-	    gcc_unreachable ();
-	  emit_insn (gen_rtx_SET (target, gen_rtx_NOT (cmp_result_mode, tmp)));
-	  return false;
-	}
-      inverted = true;
-      break;
 
-    default:
+  /* MVE supports more comparisons than Neon.  */
+  if (TARGET_HAVE_MVE)
       inverted = false;
-      break;
-    }
+  else
+    switch (code)
+      {
+	/* For these we need to compute the inverse of the requested
+	   comparison.  */
+      case UNORDERED:
+      case UNLT:
+      case UNLE:
+      case UNGT:
+      case UNGE:
+      case UNEQ:
+      case NE:
+	code = reverse_condition_maybe_unordered (code);
+	if (!can_invert)
+	  {
+	    /* Recursively emit the inverted comparison into a temporary
+	       and then store its inverse in TARGET.  This avoids reusing
+	       TARGET (which for integer NE could be one of the inputs).  */
+	    rtx tmp = gen_reg_rtx (cmp_result_mode);
+	    if (arm_expand_vector_compare (tmp, code, op0, op1, true, vcond_mve))
+	      gcc_unreachable ();
+	    emit_insn (gen_rtx_SET (target, gen_rtx_NOT (cmp_result_mode, tmp)));
+	    return false;
+	  }
+	inverted = true;
+	break;
+
+      default:
+	inverted = false;
+	break;
+      }
 
   switch (code)
     {
-    /* These are natively supported for zero comparisons, but otherwise
-       require the operands to be swapped.  */
+    /* These are natively supported by Neon for zero comparisons, but otherwise
+       require the operands to be swapped. For MVE, we can only compare
+       registers.  */
     case LE:
     case LT:
-      if (op1 != CONST0_RTX (cmp_mode))
-	{
-	  code = swap_condition (code);
-	  std::swap (op0, op1);
-	}
+      if (!TARGET_HAVE_MVE)
+	if (op1 != CONST0_RTX (cmp_mode))
+	  {
+	    code = swap_condition (code);
+	    std::swap (op0, op1);
+	  }
       /* Fall through.  */
 
-    /* These are natively supported for both register and zero operands.  */
+    /* These are natively supported by Neon for both register and zero
+       operands. MVE supports registers only.  */
     case EQ:
     case GE:
     case GT:
-      emit_insn (gen_neon_vc (code, cmp_mode, target, op0, op1));
+    case NE:
+      if (TARGET_HAVE_MVE) {
+	rtx vpr_p0;
+	if (vcond_mve)
+	  vpr_p0 = target;
+	else
+	  vpr_p0 = gen_reg_rtx (HImode);
+
+	switch (cmp_mode)
+	  {
+	  case E_V16QImode:
+	  case E_V8HImode:
+	  case E_V4SImode:
+	    emit_insn (gen_mve_vcmpq (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1)));
+	    break;
+	  case E_V8HFmode:
+	  case E_V4SFmode:
+	    if (TARGET_HAVE_MVE_FLOAT)
+	      emit_insn (gen_mve_vcmpq_f (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1)));
+	    else
+	      gcc_unreachable ();
+	    break;
+	  default:
+	    gcc_unreachable ();
+	  }
+
+	/* If we are not expanding a vcond, build the result here.  */
+	if (!vcond_mve) {
+	  rtx zero = gen_reg_rtx (cmp_result_mode);
+	  rtx one = gen_reg_rtx (cmp_result_mode);
+	  emit_move_insn (zero, CONST0_RTX (cmp_result_mode));
+	  emit_move_insn (one, CONST1_RTX (cmp_result_mode));
+	  emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0));
+	}
+      }
+      else
+	emit_insn (gen_neon_vc (code, cmp_mode, target, op0, op1));
       return inverted;
 
     /* These are natively supported for register operands only.
@@ -31000,16 +31048,50 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
        or canonicalized by target-independent code.  */
     case GEU:
     case GTU:
-      emit_insn (gen_neon_vc (code, cmp_mode, target,
-			      op0, force_reg (cmp_mode, op1)));
+      if (TARGET_HAVE_MVE) {
+	rtx vpr_p0;
+	if (vcond_mve)
+	  vpr_p0 = target;
+	else
+	  vpr_p0 = gen_reg_rtx (HImode);
+
+	emit_insn (gen_mve_vcmpq (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1)));
+	if (!vcond_mve) {
+	  rtx zero = gen_reg_rtx (cmp_result_mode);
+	  rtx one = gen_reg_rtx (cmp_result_mode);
+	  emit_move_insn (zero, CONST0_RTX (cmp_result_mode));
+	  emit_move_insn (one, CONST1_RTX (cmp_result_mode));
+	  emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0));
+	}
+      }
+       else
+	emit_insn (gen_neon_vc (code, cmp_mode, target,
+				op0, force_reg (cmp_mode, op1)));
       return inverted;
 
     /* These require the operands to be swapped and likewise do not
        support comparisons with zero.  */
     case LEU:
     case LTU:
-      emit_insn (gen_neon_vc (swap_condition (code), cmp_mode,
-			      target, force_reg (cmp_mode, op1), op0));
+      if (TARGET_HAVE_MVE) {
+	rtx vpr_p0;
+	if (vcond_mve)
+	  vpr_p0 = target;
+	else
+	  vpr_p0 = gen_reg_rtx (HImode);
+
+	emit_insn (gen_mve_vcmpq (swap_condition (code), cmp_mode, vpr_p0, force_reg (cmp_mode, op1), op0));
+	if (!vcond_mve) {
+	  rtx zero = gen_reg_rtx (cmp_result_mode);
+	  rtx one = gen_reg_rtx (cmp_result_mode);
+	  emit_move_insn (zero, CONST0_RTX (cmp_result_mode));
+	  emit_move_insn (one, CONST1_RTX (cmp_result_mode));
+	  emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0));
+	}
+      }
+      else
+	emit_insn (gen_neon_vc (swap_condition (code), cmp_mode,
+				target, force_reg (cmp_mode, op1), op0));
       return inverted;
 
     /* These need a combination of two comparisons.  */
@@ -31021,8 +31103,8 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
 	rtx gt_res = gen_reg_rtx (cmp_result_mode);
 	rtx alt_res = gen_reg_rtx (cmp_result_mode);
 	rtx_code alt_code = (code == LTGT ? LT : LE);
-	if (arm_expand_vector_compare (gt_res, GT, op0, op1, true)
-	    || arm_expand_vector_compare (alt_res, alt_code, op0, op1, true))
+	if (arm_expand_vector_compare (gt_res, GT, op0, op1, true, vcond_mve)
+	    || arm_expand_vector_compare (alt_res, alt_code, op0, op1, true, vcond_mve))
 	  gcc_unreachable ();
 	emit_insn (gen_rtx_SET (target, gen_rtx_IOR (cmp_result_mode,
 						     gt_res, alt_res)));
@@ -31040,13 +31122,50 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
 void
 arm_expand_vcond (rtx *operands, machine_mode cmp_result_mode)
 {
-  rtx mask = gen_reg_rtx (cmp_result_mode);
+  /* When expanding for MVE, we do not want to emit a (useless) vpsel in
+     arm_expand_vector_compare, and another one here.  */
+  bool vcond_mve=false;
+  rtx mask;
+
+  if (TARGET_HAVE_MVE)
+    {
+      vcond_mve=true;
+      mask = gen_reg_rtx (HImode);
+    }
+  else
+    mask = gen_reg_rtx (cmp_result_mode);
+
   bool inverted = arm_expand_vector_compare (mask, GET_CODE (operands[3]),
-					     operands[4], operands[5], true);
+					     operands[4], operands[5], true, vcond_mve);
   if (inverted)
     std::swap (operands[1], operands[2]);
+  if (TARGET_NEON)
   emit_insn (gen_neon_vbsl (GET_MODE (operands[0]), operands[0],
 			    mask, operands[1], operands[2]));
+  else
+    {
+      machine_mode cmp_mode = GET_MODE (operands[4]);
+      rtx vpr_p0 = mask;
+      rtx zero = gen_reg_rtx (cmp_mode);
+      rtx one = gen_reg_rtx (cmp_mode);
+      emit_move_insn (zero, CONST0_RTX (cmp_mode));
+      emit_move_insn (one, CONST1_RTX (cmp_mode));
+      switch (cmp_mode)
+	{
+	case E_V16QImode:
+	case E_V8HImode:
+	case E_V4SImode:
+	  emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, operands[0], one, zero, vpr_p0));
+	  break;
+	case E_V8HFmode:
+	case E_V4SFmode:
+	  if (TARGET_HAVE_MVE_FLOAT)
+	    emit_insn (gen_mve_vpselq_f (cmp_mode, operands[0], one, zero, vpr_p0));
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+    }
 }
 \f
 #define MAX_VECT_LEN 16
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 95df8bd..a128465 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -1288,12 +1288,11 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U "u") (VREV16Q_S "s")
 		       (VCREATEQ_U "u") (VCREATEQ_S "s") (VSHRQ_N_S "s")
 		       (VSHRQ_N_U "u") (VCVTQ_N_FROM_F_S "s") (VSHLQ_U "u")
 		       (VCVTQ_N_FROM_F_U "u") (VADDLVQ_P_S "s") (VSHLQ_S "s")
-		       (VADDLVQ_P_U "u") (VCMPNEQ_S "s")
+		       (VADDLVQ_P_U "u")
 		       (VABDQ_M_S "s") (VABDQ_M_U "u") (VABDQ_S "s")
 		       (VABDQ_U "u") (VADDQ_N_S "s") (VADDQ_N_U "u")
 		       (VADDVQ_P_S "s")	(VADDVQ_P_U "u") (VBRSRQ_N_S "s")
-		       (VBRSRQ_N_U "u") (VCMPEQQ_S "s")
-		       (VCMPEQQ_N_S "s") (VCMPNEQ_N_S "s")
+		       (VBRSRQ_N_U "u")
 		       (VHADDQ_N_S "s") (VHADDQ_N_U "u") (VHADDQ_S "s")
 		       (VHADDQ_U "u") (VHSUBQ_N_S "s")	(VHSUBQ_N_U "u")
 		       (VHSUBQ_S "s") (VMAXQ_S "s") (VMAXQ_U "u") (VHSUBQ_U "u")
@@ -1549,16 +1548,12 @@ (define_int_iterator VCREATEQ [VCREATEQ_U VCREATEQ_S])
 (define_int_iterator VSHRQ_N [VSHRQ_N_S VSHRQ_N_U])
 (define_int_iterator VCVTQ_N_FROM_F [VCVTQ_N_FROM_F_S VCVTQ_N_FROM_F_U])
 (define_int_iterator VADDLVQ_P [VADDLVQ_P_S VADDLVQ_P_U])
-(define_int_iterator VCMPNEQ [VCMPNEQ_S])
 (define_int_iterator VSHLQ [VSHLQ_S VSHLQ_U])
 (define_int_iterator VABDQ [VABDQ_S VABDQ_U])
 (define_int_iterator VADDQ_N [VADDQ_N_S VADDQ_N_U])
 (define_int_iterator VADDVAQ [VADDVAQ_S VADDVAQ_U])
 (define_int_iterator VADDVQ_P [VADDVQ_P_U VADDVQ_P_S])
 (define_int_iterator VBRSRQ_N [VBRSRQ_N_U VBRSRQ_N_S])
-(define_int_iterator VCMPEQQ [VCMPEQQ_S])
-(define_int_iterator VCMPEQQ_N [VCMPEQQ_N_S])
-(define_int_iterator VCMPNEQ_N [VCMPNEQ_N_S])
 (define_int_iterator VHADDQ [VHADDQ_S VHADDQ_U])
 (define_int_iterator VHADDQ_N [VHADDQ_N_U VHADDQ_N_S])
 (define_int_iterator VHSUBQ [VHSUBQ_S VHSUBQ_U])
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 7c846a4..97f0a87 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -838,7 +838,7 @@ (define_insn "mve_vaddlvq_p_<supf>v4si"
 ;;
 ;; [vcmpneq_, vcmpcsq_, vcmpeqq_, vcmpgeq_, vcmpgtq_, vcmphiq_, vcmpleq_, vcmpltq_])
 ;;
-(define_insn "mve_vcmp<mve_cmp_op>q_<mode>"
+(define_insn "@mve_vcmp<mve_cmp_op>q_<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
 	(MVE_COMPARISONS:HI (match_operand:MVE_2 1 "s_register_operand" "w")
@@ -1928,7 +1928,7 @@ (define_insn "mve_vcaddq<mve_rot><mode>"
 ;;
 ;; [vcmpeqq_f, vcmpgeq_f, vcmpgtq_f, vcmpleq_f, vcmpltq_f, vcmpneq_f])
 ;;
-(define_insn "mve_vcmp<mve_cmp_op>q_f<mode>"
+(define_insn "@mve_vcmp<mve_cmp_op>q_f<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
 	(MVE_FP_COMPARISONS:HI (match_operand:MVE_0 1 "s_register_operand" "w")
@@ -1942,7 +1942,7 @@ (define_insn "mve_vcmp<mve_cmp_op>q_f<mode>"
 ;;
 ;; [vcmpeqq_n_f, vcmpgeq_n_f, vcmpgtq_n_f, vcmpleq_n_f, vcmpltq_n_f, vcmpneq_n_f])
 ;;
-(define_insn "mve_vcmp<mve_cmp_op>q_n_f<mode>"
+(define_insn "@mve_vcmp<mve_cmp_op>q_n_f<mode>"
   [
    (set (match_operand:HI 0 "vpr_register_operand" "=Up")
 	(MVE_FP_COMPARISONS:HI (match_operand:MVE_0 1 "s_register_operand" "w")
@@ -3307,7 +3307,7 @@ (define_insn "mve_vnegq_m_s<mode>"
 ;;
 ;; [vpselq_u, vpselq_s])
 ;;
-(define_insn "mve_vpselq_<supf><mode>"
+(define_insn "@mve_vpselq_<supf><mode>"
   [
    (set (match_operand:MVE_1 0 "s_register_operand" "=w")
 	(unspec:MVE_1 [(match_operand:MVE_1 1 "s_register_operand" "w")
@@ -4402,7 +4402,7 @@ (define_insn "mve_vorrq_m_n_<supf><mode>"
 ;;
 ;; [vpselq_f])
 ;;
-(define_insn "mve_vpselq_f<mode>"
+(define_insn "@mve_vpselq_f<mode>"
   [
    (set (match_operand:MVE_0 0 "s_register_operand" "=w")
 	(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w")
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index fec2cc9..6660846 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -1416,93 +1416,6 @@ (define_insn "*us_sub<mode>_neon"
   [(set_attr "type" "neon_qsub<q>")]
 )
 
-(define_expand "vec_cmp<mode><v_cmp_result>"
-  [(set (match_operand:<V_cmp_result> 0 "s_register_operand")
-	(match_operator:<V_cmp_result> 1 "comparison_operator"
-	  [(match_operand:VDQW 2 "s_register_operand")
-	   (match_operand:VDQW 3 "reg_or_zero_operand")]))]
-  "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
-{
-  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
-			     operands[2], operands[3], false);
-  DONE;
-})
-
-(define_expand "vec_cmpu<mode><mode>"
-  [(set (match_operand:VDQIW 0 "s_register_operand")
-	(match_operator:VDQIW 1 "comparison_operator"
-	  [(match_operand:VDQIW 2 "s_register_operand")
-	   (match_operand:VDQIW 3 "reg_or_zero_operand")]))]
-  "TARGET_NEON"
-{
-  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
-			     operands[2], operands[3], false);
-  DONE;
-})
-
-;; Conditional instructions.  These are comparisons with conditional moves for
-;; vectors.  They perform the assignment:
-;;   
-;;     Vop0 = (Vop4 <op3> Vop5) ? Vop1 : Vop2;
-;;
-;; where op3 is <, <=, ==, !=, >= or >.  Operations are performed
-;; element-wise.
-
-(define_expand "vcond<mode><mode>"
-  [(set (match_operand:VDQW 0 "s_register_operand")
-	(if_then_else:VDQW
-	  (match_operator 3 "comparison_operator"
-	    [(match_operand:VDQW 4 "s_register_operand")
-	     (match_operand:VDQW 5 "reg_or_zero_operand")])
-	  (match_operand:VDQW 1 "s_register_operand")
-	  (match_operand:VDQW 2 "s_register_operand")))]
-  "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
-{
-  arm_expand_vcond (operands, <V_cmp_result>mode);
-  DONE;
-})
-
-(define_expand "vcond<V_cvtto><mode>"
-  [(set (match_operand:<V_CVTTO> 0 "s_register_operand")
-	(if_then_else:<V_CVTTO>
-	  (match_operator 3 "comparison_operator"
-	    [(match_operand:V32 4 "s_register_operand")
-	     (match_operand:V32 5 "reg_or_zero_operand")])
-	  (match_operand:<V_CVTTO> 1 "s_register_operand")
-	  (match_operand:<V_CVTTO> 2 "s_register_operand")))]
-  "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
-{
-  arm_expand_vcond (operands, <V_cmp_result>mode);
-  DONE;
-})
-
-(define_expand "vcondu<mode><v_cmp_result>"
-  [(set (match_operand:VDQW 0 "s_register_operand")
-	(if_then_else:VDQW
-	  (match_operator 3 "arm_comparison_operator"
-	    [(match_operand:<V_cmp_result> 4 "s_register_operand")
-	     (match_operand:<V_cmp_result> 5 "reg_or_zero_operand")])
-	  (match_operand:VDQW 1 "s_register_operand")
-	  (match_operand:VDQW 2 "s_register_operand")))]
-  "TARGET_NEON"
-{
-  arm_expand_vcond (operands, <V_cmp_result>mode);
-  DONE;
-})
-
-(define_expand "vcond_mask_<mode><v_cmp_result>"
-  [(set (match_operand:VDQW 0 "s_register_operand")
-	(if_then_else:VDQW
-	  (match_operand:<V_cmp_result> 3 "s_register_operand")
-	  (match_operand:VDQW 1 "s_register_operand")
-	  (match_operand:VDQW 2 "s_register_operand")))]
-  "TARGET_NEON"
-{
-  emit_insn (gen_neon_vbsl<mode> (operands[0], operands[3], operands[1],
-				  operands[2]));
-  DONE;
-})
-
 ;; Patterns for builtins.
 
 ; good for plain vadd, vaddq.
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index 07ca53b..0778db1 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -596,8 +596,6 @@ (define_c_enum "unspec" [
   VCVTQ_N_FROM_F_U
   VADDLVQ_P_S
   VADDLVQ_P_U
-  VCMPNEQ_U
-  VCMPNEQ_S
   VSHLQ_S
   VSHLQ_U
   VABDQ_S
@@ -605,9 +603,6 @@ (define_c_enum "unspec" [
   VADDVAQ_S
   VADDVQ_P_S
   VBRSRQ_N_S
-  VCMPEQQ_S
-  VCMPEQQ_N_S
-  VCMPNEQ_N_S
   VHADDQ_S
   VHADDQ_N_S
   VHSUBQ_S
@@ -645,9 +640,6 @@ (define_c_enum "unspec" [
   VADDVAQ_U
   VADDVQ_P_U
   VBRSRQ_N_U
-  VCMPEQQ_U
-  VCMPEQQ_N_U
-  VCMPNEQ_N_U
   VHADDQ_U
   VHADDQ_N_U
   VHSUBQ_U
@@ -680,14 +672,6 @@ (define_c_enum "unspec" [
   VSHLQ_R_U
   VSUBQ_U
   VSUBQ_N_U
-  VCMPGEQ_N_S
-  VCMPGEQ_S
-  VCMPGTQ_N_S
-  VCMPGTQ_S
-  VCMPLEQ_N_S
-  VCMPLEQ_S
-  VCMPLTQ_N_S
-  VCMPLTQ_S
   VHCADDQ_ROT270_S
   VHCADDQ_ROT90_S
   VMAXAQ_S
@@ -702,10 +686,6 @@ (define_c_enum "unspec" [
   VQRDMULHQ_N_S
   VQRDMULHQ_S
   VQSHLUQ_N_S
-  VCMPCSQ_N_U
-  VCMPCSQ_U
-  VCMPHIQ_N_U
-  VCMPHIQ_U
   VABDQ_M_S
   VABDQ_M_U
   VABDQ_F
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index 0b2b3b1..448731f 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -362,3 +362,111 @@ (define_expand "vlshr<mode>3"
       DONE;
     }
 })
+
+(define_expand "vec_cmp<mode><v_cmp_result>"
+  [(set (match_operand:<V_cmp_result> 0 "s_register_operand")
+	(match_operator:<V_cmp_result> 1 "comparison_operator"
+	  [(match_operand:VDQW 2 "s_register_operand")
+	   (match_operand:VDQW 3 "reg_or_zero_operand")]))]
+  "ARM_HAVE_<MODE>_ARITH
+   && !TARGET_REALLY_IWMMXT
+   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
+{
+  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
+			     operands[2], operands[3], false, false);
+  DONE;
+})
+
+(define_expand "vec_cmpu<mode><mode>"
+  [(set (match_operand:VDQIW 0 "s_register_operand")
+	(match_operator:VDQIW 1 "comparison_operator"
+	  [(match_operand:VDQIW 2 "s_register_operand")
+	   (match_operand:VDQIW 3 "reg_or_zero_operand")]))]
+  "ARM_HAVE_<MODE>_ARITH
+   && !TARGET_REALLY_IWMMXT"
+{
+  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
+			     operands[2], operands[3], false, false);
+  DONE;
+})
+
+;; Conditional instructions.  These are comparisons with conditional moves for
+;; vectors.  They perform the assignment:
+;;
+;;     Vop0 = (Vop4 <op3> Vop5) ? Vop1 : Vop2;
+;;
+;; where op3 is <, <=, ==, !=, >= or >.  Operations are performed
+;; element-wise.
+
+(define_expand "vcond<mode><mode>"
+  [(set (match_operand:VDQW 0 "s_register_operand")
+	(if_then_else:VDQW
+	  (match_operator 3 "comparison_operator"
+	    [(match_operand:VDQW 4 "s_register_operand")
+	     (match_operand:VDQW 5 "reg_or_zero_operand")])
+	  (match_operand:VDQW 1 "s_register_operand")
+	  (match_operand:VDQW 2 "s_register_operand")))]
+  "ARM_HAVE_<MODE>_ARITH
+   && !TARGET_REALLY_IWMMXT
+   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
+{
+  arm_expand_vcond (operands, <V_cmp_result>mode);
+  DONE;
+})
+
+(define_expand "vcond<V_cvtto><mode>"
+  [(set (match_operand:<V_CVTTO> 0 "s_register_operand")
+	(if_then_else:<V_CVTTO>
+	  (match_operator 3 "comparison_operator"
+	    [(match_operand:V32 4 "s_register_operand")
+	     (match_operand:V32 5 "reg_or_zero_operand")])
+	  (match_operand:<V_CVTTO> 1 "s_register_operand")
+	  (match_operand:<V_CVTTO> 2 "s_register_operand")))]
+  "ARM_HAVE_<MODE>_ARITH
+   && !TARGET_REALLY_IWMMXT
+   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
+{
+  arm_expand_vcond (operands, <V_cmp_result>mode);
+  DONE;
+})
+
+(define_expand "vcondu<mode><v_cmp_result>"
+  [(set (match_operand:VDQW 0 "s_register_operand")
+	(if_then_else:VDQW
+	  (match_operator 3 "arm_comparison_operator"
+	    [(match_operand:<V_cmp_result> 4 "s_register_operand")
+	     (match_operand:<V_cmp_result> 5 "reg_or_zero_operand")])
+	  (match_operand:VDQW 1 "s_register_operand")
+	  (match_operand:VDQW 2 "s_register_operand")))]
+  "ARM_HAVE_<MODE>_ARITH
+   && !TARGET_REALLY_IWMMXT"
+{
+  arm_expand_vcond (operands, <V_cmp_result>mode);
+  DONE;
+})
+
+(define_expand "vcond_mask_<mode><v_cmp_result>"
+  [(set (match_operand:VDQW 0 "s_register_operand")
+        (if_then_else:VDQW
+          (match_operand:<V_cmp_result> 3 "s_register_operand")
+          (match_operand:VDQW 1 "s_register_operand")
+          (match_operand:VDQW 2 "s_register_operand")))]
+  "ARM_HAVE_<MODE>_ARITH
+   && !TARGET_REALLY_IWMMXT
+   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
+{
+  if (TARGET_NEON)
+    {
+      emit_insn (gen_neon_vbsl (<MODE>mode, operands[0], operands[3],
+                                operands[1], operands[2]));
+    }
+  else if (TARGET_HAVE_MVE)
+    {
+      emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0],
+                                 operands[1], operands[2], operands[3]));
+    }
+  else
+    gcc_unreachable ();
+
+  DONE;
+})
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c
new file mode 100644
index 0000000..029c931
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c
@@ -0,0 +1,80 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O3" } */
+
+/* Integer tests.  */
+
+#define COMPARE_REG(NAME, OP, TYPE) \
+  TYPE \
+  cmp_##NAME##_##TYPE##_reg (TYPE a, TYPE b) \
+  { \
+    return a OP b; \
+  }
+
+#define COMPARE_REG_AND_ZERO(NAME, OP, TYPE) \
+  COMPARE_REG (NAME, OP, TYPE) \
+  \
+  TYPE \
+  cmp_##NAME##_##TYPE##_zero (TYPE a) \
+  { \
+    return a OP (TYPE) {}; \
+  }
+
+#define COMPARE_TYPE(TYPE, COMPARE_ORDERED) \
+  COMPARE_REG_AND_ZERO (eq, ==, TYPE) \
+  COMPARE_REG_AND_ZERO (ne, !=, TYPE) \
+  COMPARE_ORDERED (lt, <, TYPE) \
+  COMPARE_ORDERED (le, <=, TYPE) \
+  COMPARE_ORDERED (gt, >, TYPE) \
+  COMPARE_ORDERED (ge, >=, TYPE)
+
+#define TEST_TYPE(NAME, ELEM, COMPARE_ORDERED, SIZE)  \
+  typedef ELEM NAME##SIZE __attribute__((vector_size(SIZE))); \
+  COMPARE_TYPE (NAME##SIZE, COMPARE_ORDERED)
+
+/* 64-bits vectors, not vectorized.  */
+TEST_TYPE (vs8, __INT8_TYPE__, COMPARE_REG_AND_ZERO, 8)
+TEST_TYPE (vu8, __UINT8_TYPE__, COMPARE_REG, 8)
+TEST_TYPE (vs16, __INT16_TYPE__, COMPARE_REG_AND_ZERO, 8)
+TEST_TYPE (vu16, __UINT16_TYPE__, COMPARE_REG, 8)
+TEST_TYPE (vs32, __INT32_TYPE__, COMPARE_REG_AND_ZERO, 8)
+TEST_TYPE (vu32, __UINT32_TYPE__, COMPARE_REG, 8)
+
+/* 128-bits vectors.  */
+TEST_TYPE (vs8, __INT8_TYPE__, COMPARE_REG_AND_ZERO, 16)
+TEST_TYPE (vu8, __UINT8_TYPE__, COMPARE_REG, 16)
+TEST_TYPE (vs16, __INT16_TYPE__, COMPARE_REG_AND_ZERO, 16)
+TEST_TYPE (vu16, __UINT16_TYPE__, COMPARE_REG, 16)
+TEST_TYPE (vs32, __INT32_TYPE__, COMPARE_REG_AND_ZERO, 16)
+TEST_TYPE (vu32, __UINT32_TYPE__, COMPARE_REG, 16)
+
+/* { 8 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
+/* { dg-final { scan-assembler-times {\tvcmp.i8  eq, q[0-9]+, q[0-9]+\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.i8  ne, q[0-9]+, q[0-9]+\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s8  lt, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s8  le, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s8  gt, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s8  ge, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.u8  hi, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.u8  cs, q[0-9]+, q[0-9]+\n} 2 } } */
+
+/* { 16 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
+/* { dg-final { scan-assembler-times {\tvcmp.i16  eq, q[0-9]+, q[0-9]+\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.i16  ne, q[0-9]+, q[0-9]+\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s16  lt, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s16  le, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s16  gt, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s16  ge, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.u16  hi, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.u16  cs, q[0-9]+, q[0-9]+\n} 2 } } */
+
+/* { 32 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
+/* { dg-final { scan-assembler-times {\tvcmp.i32  eq, q[0-9]+, q[0-9]+\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.i32  ne, q[0-9]+, q[0-9]+\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s32  lt, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s32  le, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s32  gt, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s32  ge, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.u32  hi, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.u32  cs, q[0-9]+, q[0-9]+\n} 2 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c
new file mode 100644
index 0000000..8515195
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c
@@ -0,0 +1,38 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
+
+/* float 32 tests.  */
+
+#ifndef ELEM_TYPE
+#define ELEM_TYPE float
+#endif
+#ifndef INT_ELEM_TYPE
+#define INT_ELEM_TYPE __INT32_TYPE__
+#endif
+
+#define COMPARE(NAME, OP)			\
+  int_vec					\
+  cmp_##NAME##_reg (vec a, vec b)		\
+  {						\
+    return a OP b;				\
+  }
+
+typedef INT_ELEM_TYPE int_vec __attribute__((vector_size(16)));
+typedef ELEM_TYPE vec __attribute__((vector_size(16)));
+
+COMPARE (eq, ==)
+COMPARE (ne, !=)
+COMPARE (lt, <)
+COMPARE (le, <=)
+COMPARE (gt, >)
+COMPARE (ge, >=)
+
+/* eq, ne, lt, le, gt, ge.  */
+/* { dg-final { scan-assembler-times {\tvcmp.f32\teq, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f32\tne, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f32\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f32\tle, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f32\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f32\tge, q[0-9]+, q[0-9]+\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c
new file mode 100644
index 0000000..7774972
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c
@@ -0,0 +1,69 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O3" } */
+
+#define COMPARE_REG(NAME, OP, TYPE, SCALAR)	  \
+  TYPE						  \
+  cmp_##NAME##_##TYPE##_scalar (TYPE a, SCALAR b) \
+  {						  \
+    return a OP b;				  \
+  }
+
+#define COMPARE_TYPE(SCALAR, TYPE)				\
+  COMPARE_REG (eq, ==, TYPE, SCALAR)				\
+  COMPARE_REG (ne, !=, TYPE, SCALAR)				\
+  COMPARE_REG (lt, <, TYPE, SCALAR)				\
+  COMPARE_REG (le, <=, TYPE, SCALAR)				\
+  COMPARE_REG (gt, >, TYPE, SCALAR)				\
+  COMPARE_REG (ge, >=, TYPE, SCALAR)
+
+#define TEST_TYPE(NAME, ELEM, SIZE)			      \
+  typedef ELEM NAME##SIZE __attribute__((vector_size(SIZE))); \
+  COMPARE_TYPE (ELEM, NAME##SIZE)
+
+/* 64-bit vectors, not vectorized.  */
+TEST_TYPE (vs8, __INT8_TYPE__, 8)
+TEST_TYPE (vu8, __UINT8_TYPE__, 8)
+TEST_TYPE (vs16, __INT16_TYPE__, 8)
+TEST_TYPE (vu16, __UINT16_TYPE__, 8)
+TEST_TYPE (vs32, __INT32_TYPE__, 8)
+TEST_TYPE (vu32, __UINT32_TYPE__, 8)
+
+/* 128-bit vectors.  */
+TEST_TYPE (vs8, __INT8_TYPE__, 16)
+TEST_TYPE (vu8, __UINT8_TYPE__, 16)
+TEST_TYPE (vs16, __INT16_TYPE__, 16)
+TEST_TYPE (vu16, __UINT16_TYPE__, 16)
+TEST_TYPE (vs32, __INT32_TYPE__, 16)
+TEST_TYPE (vu32, __UINT32_TYPE__, 16)
+
+/* { 8 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
+/* { dg-final { scan-assembler-times {\tvcmp.i8  eq, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.i8  ne, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s8  lt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s8  le, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s8  gt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s8  ge, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.u8  hi, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.u8  cs, q[0-9]+, q[0-9]+\n} 2 } } */
+
+/* { 16 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
+/* { dg-final { scan-assembler-times {\tvcmp.i16  eq, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.i16  ne, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s16  lt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s16  le, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s16  gt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s16  ge, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.u16  hi, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.u16  cs, q[0-9]+, q[0-9]+\n} 2 } } */
+
+/* { 32 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
+/* { dg-final { scan-assembler-times {\tvcmp.i32  eq, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.i32  ne, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s32  lt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s32  le, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s32  gt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s32  ge, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.u32  hi, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.u32  cs, q[0-9]+, q[0-9]+\n} 2 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c
new file mode 100644
index 0000000..4ed449e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c
@@ -0,0 +1,30 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
+
+#include <stdint.h>
+
+#define NB 4
+
+#define FUNC(OP, NAME)							\
+  void test_ ## NAME ##_f (float * __restrict__ dest, float *a, float *b) { \
+    int i;								\
+    for (i=0; i<NB; i++) {						\
+      dest[i] = a[i] OP b[i];						\
+    }									\
+  }
+
+FUNC(==, vcmpeq)
+FUNC(!=, vcmpne)
+FUNC(<, vcmplt)
+FUNC(<=, vcmple)
+FUNC(>, vcmpgt)
+FUNC(>=, vcmpge)
+
+/* { dg-final { scan-assembler-times {\tvcmp.f32\teq, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f32\tne, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f32\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f32\tle, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f32\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f32\tge, q[0-9]+, q[0-9]+\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c
new file mode 100644
index 0000000..8da15e7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c
@@ -0,0 +1,50 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O3" } */
+
+#include <stdint.h>
+
+#define FUNC(SIGN, TYPE, BITS, NB, OP, NAME)				\
+  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t * __restrict__ dest, TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \
+    int i;								\
+    for (i=0; i<NB; i++) {						\
+      dest[i] = a[i] OP b[i];						\
+    }									\
+}
+
+#define ALL_FUNCS(OP, NAME) \
+  FUNC(s, int, 32, 2, OP, NAME)			\
+  FUNC(u, uint, 32, 2, OP, NAME)		\
+  FUNC(s, int, 16, 4, OP, NAME)			\
+  FUNC(u, uint, 16, 4, OP, NAME)		\
+  FUNC(s, int, 8, 8, OP, NAME)			\
+  FUNC(u, uint, 8, 8, OP, NAME)			\
+  FUNC(s, int, 32, 4, OP, NAME)			\
+  FUNC(u, uint, 32, 4, OP, NAME)		\
+  FUNC(s, int, 16, 8, OP, NAME)			\
+  FUNC(u, uint, 16, 8, OP, NAME)		\
+  FUNC(s, int, 8, 16, OP, NAME)			\
+  FUNC(u, uint, 8, 16, OP, NAME)
+
+ALL_FUNCS(==, vcmpeq)
+ALL_FUNCS(!=, vcmpne)
+ALL_FUNCS(<, vcmplt)
+ALL_FUNCS(<=, vcmple)
+ALL_FUNCS(>, vcmpgt)
+ALL_FUNCS(>=, vcmpge)
+
+/* MVE has only 128-bit vectors, so we can vectorize only half of the
+   functions above.  */
+/* { dg-final { scan-assembler-times {\tvcmp.i[0-9]+  eq, q[0-9]+, q[0-9]+\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.i[0-9]+  ne, q[0-9]+, q[0-9]+\n} 6 } } */
+
+/* lt, le, gt, ge apply to signed types, cs and hi to unsigned types.  */
+/* lt and le with unsigned types are replaced with the opposite condition, hence
+   twice as many matches for cs and hi.  */
+/* { dg-final { scan-assembler-times {\tvcmp.s[0-9]+  lt, q[0-9]+, q[0-9]+\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s[0-9]+  le, q[0-9]+, q[0-9]+\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s[0-9]+  gt, q[0-9]+, q[0-9]+\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.s[0-9]+  ge, q[0-9]+, q[0-9]+\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.u[0-9]+  cs, q[0-9]+, q[0-9]+\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.u[0-9]+  hi, q[0-9]+, q[0-9]+\n} 6 } } */
-- 
2.7.4


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 7/9] arm: Auto-vectorization for MVE: add __fp16 support to VCMP
  2021-05-04 17:03       ` Christophe Lyon
@ 2021-05-05 14:09         ` Christophe Lyon
  2021-05-17  9:54           ` Christophe Lyon
  2021-05-17 10:49           ` Kyrylo Tkachov
  0 siblings, 2 replies; 35+ messages in thread
From: Christophe Lyon @ 2021-05-05 14:09 UTC (permalink / raw)
  To: Andre Vieira (lists); +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 11945 bytes --]

On Tue, 4 May 2021 at 19:03, Christophe Lyon <christophe.lyon@linaro.org> wrote:
>
> On Tue, 4 May 2021 at 15:43, Christophe Lyon <christophe.lyon@linaro.org> wrote:
> >
> > On Tue, 4 May 2021 at 13:48, Andre Vieira (lists)
> > <andre.simoesdiasvieira@arm.com> wrote:
> > >
> > > It would be good to also add tests for NEON as you also enable auto-vec
> > > for it. I checked and I do think the necessary 'neon_vc' patterns exist
> > > for 'VH', so we should be OK there.
> > >
> >
> > Actually since I posted the patch series, I've noticed a regression in
> > armv8_2-fp16-arith-1.c, because we now vectorize all the float16x[48]_t loops,
> > but we lose the fact that some FP comparisons can throw exceptions.
> >
> > I'll have to revisit this patch.
>
> Actually it looks like my patch does the right thing: we now vectorize
> appropriately, given that the testcase is compiled with -ffast-math.
> I need to update the testcase, though.
>

Here is a new version, with armv8_2-fp16-arith-1.c updated to take
into account the new vectorization.

Christophe


> >
> > Thanks,
> >
> > Christophe
> >
> > > On 30/04/2021 15:09, Christophe Lyon via Gcc-patches wrote:
> > > > This patch adds __fp16 support to the previous patch that added vcmp
> > > > support with MVE. For this we update existing expanders to use VDQWH
> > > > iterator, and add a new expander vcond<VH_cvtto><mode>.  In the
> > > > process we need to create suitable iterators, and update v_cmp_result
> > > > as needed.
> > > >
> > > > 2021-04-26  Christophe Lyon  <christophe.lyon@linaro.org>
> > > >
> > > >       gcc/
> > > >       * config/arm/iterators.md (V16): New iterator.
> > > >       (VH_cvtto): New iterator.
> > > >       (v_cmp_result): Added V4HF and V8HF support.
> > > >       * config/arm/vec-common.md (vec_cmp<mode><v_cmp_result>): Use VDQWH.
> > > >       (vcond<mode><mode>): Likewise.
> > > >       (vcond_mask_<mode><v_cmp_result>): Likewise.
> > > >       (vcond<VH_cvtto><mode>): New expander.
> > > >
> > > >       gcc/testsuite/
> > > >       * gcc.target/arm/simd/mve-compare-3.c: New test with GCC vectors.
> > > >       * gcc.target/arm/simd/mve-vcmp-f16.c: New test for
> > > >       auto-vectorization.
> > > > ---
> > > >   gcc/config/arm/iterators.md                       |  6 ++++
> > > >   gcc/config/arm/vec-common.md                      | 40 ++++++++++++++++-------
> > > >   gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c | 38 +++++++++++++++++++++
> > > >   gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c  | 30 +++++++++++++++++
> > > >   4 files changed, 102 insertions(+), 12 deletions(-)
> > > >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
> > > >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c
> > > >
> > > > diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
> > > > index a128465..3042baf 100644
> > > > --- a/gcc/config/arm/iterators.md
> > > > +++ b/gcc/config/arm/iterators.md
> > > > @@ -231,6 +231,9 @@ (define_mode_iterator VU [V16QI V8HI V4SI])
> > > >   ;; Vector modes for 16-bit floating-point support.
> > > >   (define_mode_iterator VH [V8HF V4HF])
> > > >
> > > > +;; Modes with 16-bit elements only.
> > > > +(define_mode_iterator V16 [V4HI V4HF V8HI V8HF])
> > > > +
> > > >   ;; 16-bit floating-point vector modes suitable for moving (includes BFmode).
> > > >   (define_mode_iterator VHFBF [V8HF V4HF V4BF V8BF])
> > > >
> > > > @@ -571,6 +574,8 @@ (define_mode_attr V_cvtto [(V2SI "v2sf") (V2SF "v2si")
> > > >   ;; (Opposite) mode to convert to/from for vector-half mode conversions.
> > > >   (define_mode_attr VH_CVTTO [(V4HI "V4HF") (V4HF "V4HI")
> > > >                           (V8HI "V8HF") (V8HF "V8HI")])
> > > > +(define_mode_attr VH_cvtto [(V4HI "v4hf") (V4HF "v4hi")
> > > > +                         (V8HI "v8hf") (V8HF "v8hi")])
> > > >
> > > >   ;; Define element mode for each vector mode.
> > > >   (define_mode_attr V_elem [(V8QI "QI") (V16QI "QI")
> > > > @@ -720,6 +725,7 @@ (define_mode_attr V_cmp_result [(V8QI "V8QI") (V16QI "V16QI")
> > > >   (define_mode_attr v_cmp_result [(V8QI "v8qi") (V16QI "v16qi")
> > > >                               (V4HI "v4hi") (V8HI  "v8hi")
> > > >                               (V2SI "v2si") (V4SI  "v4si")
> > > > +                             (V4HF "v4hi") (V8HF  "v8hi")
> > > >                               (DI   "di")   (V2DI  "v2di")
> > > >                               (V2SF "v2si") (V4SF  "v4si")])
> > > >
> > > > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> > > > index 034b48b..3fd341c 100644
> > > > --- a/gcc/config/arm/vec-common.md
> > > > +++ b/gcc/config/arm/vec-common.md
> > > > @@ -366,8 +366,8 @@ (define_expand "vlshr<mode>3"
> > > >   (define_expand "vec_cmp<mode><v_cmp_result>"
> > > >     [(set (match_operand:<V_cmp_result> 0 "s_register_operand")
> > > >       (match_operator:<V_cmp_result> 1 "comparison_operator"
> > > > -       [(match_operand:VDQW 2 "s_register_operand")
> > > > -        (match_operand:VDQW 3 "reg_or_zero_operand")]))]
> > > > +       [(match_operand:VDQWH 2 "s_register_operand")
> > > > +        (match_operand:VDQWH 3 "reg_or_zero_operand")]))]
> > > >     "ARM_HAVE_<MODE>_ARITH
> > > >      && !TARGET_REALLY_IWMMXT
> > > >      && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > > > @@ -399,13 +399,13 @@ (define_expand "vec_cmpu<mode><mode>"
> > > >   ;; element-wise.
> > > >
> > > >   (define_expand "vcond<mode><mode>"
> > > > -  [(set (match_operand:VDQW 0 "s_register_operand")
> > > > -     (if_then_else:VDQW
> > > > +  [(set (match_operand:VDQWH 0 "s_register_operand")
> > > > +     (if_then_else:VDQWH
> > > >         (match_operator 3 "comparison_operator"
> > > > -         [(match_operand:VDQW 4 "s_register_operand")
> > > > -          (match_operand:VDQW 5 "reg_or_zero_operand")])
> > > > -       (match_operand:VDQW 1 "s_register_operand")
> > > > -       (match_operand:VDQW 2 "s_register_operand")))]
> > > > +         [(match_operand:VDQWH 4 "s_register_operand")
> > > > +          (match_operand:VDQWH 5 "reg_or_zero_operand")])
> > > > +       (match_operand:VDQWH 1 "s_register_operand")
> > > > +       (match_operand:VDQWH 2 "s_register_operand")))]
> > > >     "ARM_HAVE_<MODE>_ARITH
> > > >      && !TARGET_REALLY_IWMMXT
> > > >      && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > > > @@ -430,6 +430,22 @@ (define_expand "vcond<V_cvtto><mode>"
> > > >     DONE;
> > > >   })
> > > >
> > > > +(define_expand "vcond<VH_cvtto><mode>"
> > > > +  [(set (match_operand:<VH_CVTTO> 0 "s_register_operand")
> > > > +     (if_then_else:<VH_CVTTO>
> > > > +       (match_operator 3 "comparison_operator"
> > > > +         [(match_operand:V16 4 "s_register_operand")
> > > > +          (match_operand:V16 5 "reg_or_zero_operand")])
> > > > +       (match_operand:<VH_CVTTO> 1 "s_register_operand")
> > > > +       (match_operand:<VH_CVTTO> 2 "s_register_operand")))]
> > > > +  "ARM_HAVE_<MODE>_ARITH
> > > > +   && !TARGET_REALLY_IWMMXT
> > > > +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > > > +{
> > > > +  arm_expand_vcond (operands, <V_cmp_result>mode);
> > > > +  DONE;
> > > > +})
> > > > +
> > > >   (define_expand "vcondu<mode><v_cmp_result>"
> > > >     [(set (match_operand:VDQW 0 "s_register_operand")
> > > >       (if_then_else:VDQW
> > > > @@ -446,11 +462,11 @@ (define_expand "vcondu<mode><v_cmp_result>"
> > > >   })
> > > >
> > > >   (define_expand "vcond_mask_<mode><v_cmp_result>"
> > > > -  [(set (match_operand:VDQW 0 "s_register_operand")
> > > > -        (if_then_else:VDQW
> > > > +  [(set (match_operand:VDQWH 0 "s_register_operand")
> > > > +        (if_then_else:VDQWH
> > > >             (match_operand:<V_cmp_result> 3 "s_register_operand")
> > > > -          (match_operand:VDQW 1 "s_register_operand")
> > > > -          (match_operand:VDQW 2 "s_register_operand")))]
> > > > +          (match_operand:VDQWH 1 "s_register_operand")
> > > > +          (match_operand:VDQWH 2 "s_register_operand")))]
> > > >     "ARM_HAVE_<MODE>_ARITH
> > > >      && !TARGET_REALLY_IWMMXT"
> > > >   {
> > > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
> > > > new file mode 100644
> > > > index 0000000..76f81e8
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
> > > > @@ -0,0 +1,38 @@
> > > > +/* { dg-do assemble } */
> > > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> > > > +/* { dg-add-options arm_v8_1m_mve_fp } */
> > > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
> > > > +
> > > > +/* float 16 tests.  */
> > > > +
> > > > +#ifndef ELEM_TYPE
> > > > +#define ELEM_TYPE __fp16
> > > > +#endif
> > > > +#ifndef INT_ELEM_TYPE
> > > > +#define INT_ELEM_TYPE __INT16_TYPE__
> > > > +#endif
> > > > +
> > > > +#define COMPARE(NAME, OP)                    \
> > > > +  int_vec                                    \
> > > > +  cmp_##NAME##_reg (vec a, vec b)            \
> > > > +  {                                          \
> > > > +    return a OP b;                           \
> > > > +  }
> > > > +
> > > > +typedef INT_ELEM_TYPE int_vec __attribute__((vector_size(16)));
> > > > +typedef ELEM_TYPE vec __attribute__((vector_size(16)));
> > > > +
> > > > +COMPARE (eq, ==)
> > > > +COMPARE (ne, !=)
> > > > +COMPARE (lt, <)
> > > > +COMPARE (le, <=)
> > > > +COMPARE (gt, >)
> > > > +COMPARE (ge, >=)
> > > > +
> > > > +/* eq, ne, lt, le, gt, ge.
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c
> > > > new file mode 100644
> > > > index 0000000..dbae2d1
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c
> > > > @@ -0,0 +1,30 @@
> > > > +/* { dg-do assemble } */
> > > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> > > > +/* { dg-add-options arm_v8_1m_mve_fp } */
> > > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
> > > > +
> > > > +#include <stdint.h>
> > > > +
> > > > +#define NB 8
> > > > +
> > > > +#define FUNC(OP, NAME)                                                       \
> > > > +  void test_ ## NAME ##_f (__fp16 * __restrict__ dest, __fp16 *a, __fp16 *b) { \
> > > > +    int i;                                                           \
> > > > +    for (i=0; i<NB; i++) {                                           \
> > > > +      dest[i] = a[i] OP b[i];                                                \
> > > > +    }                                                                        \
> > > > +  }
> > > > +
> > > > +FUNC(==, vcmpeq)
> > > > +FUNC(!=, vcmpne)
> > > > +FUNC(<, vcmplt)
> > > > +FUNC(<=, vcmple)
> > > > +FUNC(>, vcmpgt)
> > > > +FUNC(>=, vcmpge)
> > > > +
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-9]+\n} 1 } } */

[-- Attachment #2: v2-0007-arm-Auto-vectorization-for-MVE-add-__fp16-support.patch --]
[-- Type: text/x-patch, Size: 10340 bytes --]

From 0fd42e32d76b455b6c1a49dcc24902f810d9d482 Mon Sep 17 00:00:00 2001
From: Christophe Lyon <christophe.lyon@linaro.org>
Date: Fri, 23 Apr 2021 14:17:10 +0000
Subject: [PATCH v2 7/9] arm: Auto-vectorization for MVE: add __fp16 support to
 VCMP

This patch adds __fp16 support to the previous patch that added vcmp
support with MVE. For this we update existing expanders to use VDQWH
iterator, and add a new expander vcond<VH_cvtto><mode>.  In the
process we need to create suitable iterators, and update v_cmp_result
as needed.

2021-04-26  Christophe Lyon  <christophe.lyon@linaro.org>

	gcc/
	* config/arm/iterators.md (V16): New iterator.
	(VH_cvtto): New mode attribute.
	(v_cmp_result): Added V4HF and V8HF support.
	* config/arm/vec-common.md (vec_cmp<mode><v_cmp_result>): Use VDQWH.
	(vcond<mode><mode>): Likewise.
	(vcond_mask_<mode><v_cmp_result>): Likewise.
	(vcond<VH_cvtto><mode>): New expander.

	gcc/testsuite/
	* gcc.target/arm/simd/mve-compare-3.c: New test with GCC vectors.
	* gcc.target/arm/simd/mve-vcmp-f16.c: New test for
	auto-vectorization.
	* gcc.target/arm/armv8_2-fp16-arith-1.c: Adjust since we now
	vectorize float16_t vectors.
---
 gcc/config/arm/iterators.md                        |  6 ++++
 gcc/config/arm/vec-common.md                       | 40 +++++++++++++++-------
 .../gcc.target/arm/armv8_2-fp16-arith-1.c          | 16 +++++++--
 gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c  | 38 ++++++++++++++++++++
 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c   | 30 ++++++++++++++++
 5 files changed, 116 insertions(+), 14 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index a128465..3042baf 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -231,6 +231,9 @@ (define_mode_iterator VU [V16QI V8HI V4SI])
 ;; Vector modes for 16-bit floating-point support.
 (define_mode_iterator VH [V8HF V4HF])
 
+;; Modes with 16-bit elements only.
+(define_mode_iterator V16 [V4HI V4HF V8HI V8HF])
+
 ;; 16-bit floating-point vector modes suitable for moving (includes BFmode).
 (define_mode_iterator VHFBF [V8HF V4HF V4BF V8BF])
 
@@ -571,6 +574,8 @@ (define_mode_attr V_cvtto [(V2SI "v2sf") (V2SF "v2si")
 ;; (Opposite) mode to convert to/from for vector-half mode conversions.
 (define_mode_attr VH_CVTTO [(V4HI "V4HF") (V4HF "V4HI")
 			    (V8HI "V8HF") (V8HF "V8HI")])
+(define_mode_attr VH_cvtto [(V4HI "v4hf") (V4HF "v4hi")
+			    (V8HI "v8hf") (V8HF "v8hi")])
 
 ;; Define element mode for each vector mode.
 (define_mode_attr V_elem [(V8QI "QI") (V16QI "QI")
@@ -720,6 +725,7 @@ (define_mode_attr V_cmp_result [(V8QI "V8QI") (V16QI "V16QI")
 (define_mode_attr v_cmp_result [(V8QI "v8qi") (V16QI "v16qi")
 				(V4HI "v4hi") (V8HI  "v8hi")
 				(V2SI "v2si") (V4SI  "v4si")
+				(V4HF "v4hi") (V8HF  "v8hi")
 				(DI   "di")   (V2DI  "v2di")
 				(V2SF "v2si") (V4SF  "v4si")])
 
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index 448731f..265fa40 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -366,8 +366,8 @@ (define_expand "vlshr<mode>3"
 (define_expand "vec_cmp<mode><v_cmp_result>"
   [(set (match_operand:<V_cmp_result> 0 "s_register_operand")
 	(match_operator:<V_cmp_result> 1 "comparison_operator"
-	  [(match_operand:VDQW 2 "s_register_operand")
-	   (match_operand:VDQW 3 "reg_or_zero_operand")]))]
+	  [(match_operand:VDQWH 2 "s_register_operand")
+	   (match_operand:VDQWH 3 "reg_or_zero_operand")]))]
   "ARM_HAVE_<MODE>_ARITH
    && !TARGET_REALLY_IWMMXT
    && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
@@ -399,13 +399,13 @@ (define_expand "vec_cmpu<mode><mode>"
 ;; element-wise.
 
 (define_expand "vcond<mode><mode>"
-  [(set (match_operand:VDQW 0 "s_register_operand")
-	(if_then_else:VDQW
+  [(set (match_operand:VDQWH 0 "s_register_operand")
+	(if_then_else:VDQWH
 	  (match_operator 3 "comparison_operator"
-	    [(match_operand:VDQW 4 "s_register_operand")
-	     (match_operand:VDQW 5 "reg_or_zero_operand")])
-	  (match_operand:VDQW 1 "s_register_operand")
-	  (match_operand:VDQW 2 "s_register_operand")))]
+	    [(match_operand:VDQWH 4 "s_register_operand")
+	     (match_operand:VDQWH 5 "reg_or_zero_operand")])
+	  (match_operand:VDQWH 1 "s_register_operand")
+	  (match_operand:VDQWH 2 "s_register_operand")))]
   "ARM_HAVE_<MODE>_ARITH
    && !TARGET_REALLY_IWMMXT
    && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
@@ -430,6 +430,22 @@ (define_expand "vcond<V_cvtto><mode>"
   DONE;
 })
 
+(define_expand "vcond<VH_cvtto><mode>"
+  [(set (match_operand:<VH_CVTTO> 0 "s_register_operand")
+	(if_then_else:<VH_CVTTO>
+	  (match_operator 3 "comparison_operator"
+	    [(match_operand:V16 4 "s_register_operand")
+	     (match_operand:V16 5 "reg_or_zero_operand")])
+	  (match_operand:<VH_CVTTO> 1 "s_register_operand")
+	  (match_operand:<VH_CVTTO> 2 "s_register_operand")))]
+  "ARM_HAVE_<MODE>_ARITH
+   && !TARGET_REALLY_IWMMXT
+   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
+{
+  arm_expand_vcond (operands, <V_cmp_result>mode);
+  DONE;
+})
+
 (define_expand "vcondu<mode><v_cmp_result>"
   [(set (match_operand:VDQW 0 "s_register_operand")
 	(if_then_else:VDQW
@@ -446,11 +462,11 @@ (define_expand "vcondu<mode><v_cmp_result>"
 })
 
 (define_expand "vcond_mask_<mode><v_cmp_result>"
-  [(set (match_operand:VDQW 0 "s_register_operand")
-        (if_then_else:VDQW
+  [(set (match_operand:VDQWH 0 "s_register_operand")
+        (if_then_else:VDQWH
           (match_operand:<V_cmp_result> 3 "s_register_operand")
-          (match_operand:VDQW 1 "s_register_operand")
-          (match_operand:VDQW 2 "s_register_operand")))]
+          (match_operand:VDQWH 1 "s_register_operand")
+          (match_operand:VDQWH 2 "s_register_operand")))]
   "ARM_HAVE_<MODE>_ARITH
    && !TARGET_REALLY_IWMMXT
    && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
diff --git a/gcc/testsuite/gcc.target/arm/armv8_2-fp16-arith-1.c b/gcc/testsuite/gcc.target/arm/armv8_2-fp16-arith-1.c
index 921d26e..52b8737 100644
--- a/gcc/testsuite/gcc.target/arm/armv8_2-fp16-arith-1.c
+++ b/gcc/testsuite/gcc.target/arm/armv8_2-fp16-arith-1.c
@@ -104,8 +104,20 @@ TEST_CMP (greaterthanqual, >=, int16x8_t, float16x8_t)
 /* { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } }  */
 
 /* { dg-final { scan-assembler-times {vdiv\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } }  */
-/* { dg-final { scan-assembler-times {vcmp\.f32\ts[0-9]+, s[0-9]+} 26 } }  */
-/* { dg-final { scan-assembler-times {vcmpe\.f32\ts[0-9]+, s[0-9]+} 52 } }  */
+
+/* For float16_t.  */
+/* { dg-final { scan-assembler-times {vcmp\.f32\ts[0-9]+, s[0-9]+} 2 } }  */
+/* { dg-final { scan-assembler-times {vcmpe\.f32\ts[0-9]+, s[0-9]+} 4 } }  */
+
+/* For float16x4_t.  */
+/* { dg-final { scan-assembler-times {vceq\.f16\td[0-9]+, d[0-9]+} 2 } }  */
+/* { dg-final { scan-assembler-times {vcge\.f16\td[0-9]+, d[0-9]+} 2 } }  */
+/* { dg-final { scan-assembler-times {vcgt\.f16\td[0-9]+, d[0-9]+} 2 } }  */
+
+/* For float16x8_t.  */
+/* { dg-final { scan-assembler-times {vceq\.f16\tq[0-9]+, q[0-9]+} 2 } }  */
+/* { dg-final { scan-assembler-times {vcge\.f16\tq[0-9]+, q[0-9]+} 2 } }  */
+/* { dg-final { scan-assembler-times {vcgt\.f16\tq[0-9]+, q[0-9]+} 2 } }  */
 
 /* { dg-final { scan-assembler-not {vadd\.f32} } }  */
 /* { dg-final { scan-assembler-not {vsub\.f32} } }  */
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
new file mode 100644
index 0000000..76f81e8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
@@ -0,0 +1,38 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
+
+/* float 16 tests.  */
+
+#ifndef ELEM_TYPE
+#define ELEM_TYPE __fp16
+#endif
+#ifndef INT_ELEM_TYPE
+#define INT_ELEM_TYPE __INT16_TYPE__
+#endif
+
+#define COMPARE(NAME, OP)			\
+  int_vec					\
+  cmp_##NAME##_reg (vec a, vec b)		\
+  {						\
+    return a OP b;				\
+  }
+
+typedef INT_ELEM_TYPE int_vec __attribute__((vector_size(16)));
+typedef ELEM_TYPE vec __attribute__((vector_size(16)));
+
+COMPARE (eq, ==)
+COMPARE (ne, !=)
+COMPARE (lt, <)
+COMPARE (le, <=)
+COMPARE (gt, >)
+COMPARE (ge, >=)
+
+/* eq, ne, lt, le, gt, ge.  */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-9]+\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c
new file mode 100644
index 0000000..dbae2d1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c
@@ -0,0 +1,30 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
+
+#include <stdint.h>
+
+#define NB 8
+
+#define FUNC(OP, NAME)							\
+  void test_ ## NAME ##_f (__fp16 * __restrict__ dest, __fp16 *a, __fp16 *b) { \
+    int i;								\
+    for (i=0; i<NB; i++) {						\
+      dest[i] = a[i] OP b[i];						\
+    }									\
+  }
+
+FUNC(==, vcmpeq)
+FUNC(!=, vcmpne)
+FUNC(<, vcmplt)
+FUNC(<=, vcmple)
+FUNC(>, vcmpgt)
+FUNC(>=, vcmpge)
+
+/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-9]+\n} 1 } } */
-- 
2.7.4


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/9] arm: MVE: Convert vcmp[eq|ne]* in arm_mve.h to use only 's' builtin version
  2021-04-30 14:09 [PATCH 1/9] arm: MVE: Convert vcmp[eq|ne]* in arm_mve.h to use only 's' builtin version Christophe Lyon
                   ` (7 preceding siblings ...)
  2021-04-30 14:09 ` [PATCH 9/9] arm: Auto-vectorization for MVE: vld4/vst4 Christophe Lyon
@ 2021-05-10 11:21 ` Christophe Lyon
  2021-05-10 11:54 ` Kyrylo Tkachov
  9 siblings, 0 replies; 35+ messages in thread
From: Christophe Lyon @ 2021-05-10 11:21 UTC (permalink / raw)
  To: gcc Patches

Ping for the series?

On Fri, 30 Apr 2021 at 16:09, Christophe Lyon
<christophe.lyon@linaro.org> wrote:
>
> There is no need to have a signed and an unsigned version of these
> builtins. This is similar to what we do for Neon in arm_neon.h.
> This mechanical patch enables later cleanup patches.
>
> 2021-03-01  Christophe Lyon  <christophe.lyon@linaro.org>
>
>         gcc/
>         * config/arm/arm_mve.h (__arm_vcmpeq*u*, __arm_vcmpne*u*): Call
>         the 's' version of the builtin.
> ---
>  gcc/config/arm/arm_mve.h | 24 ++++++++++++------------
>  1 file changed, 12 insertions(+), 12 deletions(-)
>
> diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
> index 3a40c6e..e4dfe91 100644
> --- a/gcc/config/arm/arm_mve.h
> +++ b/gcc/config/arm/arm_mve.h
> @@ -3695,21 +3695,21 @@ __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpneq_u8 (uint8x16_t __a, uint8x16_t __b)
>  {
> -  return __builtin_mve_vcmpneq_uv16qi (__a, __b);
> +  return __builtin_mve_vcmpneq_sv16qi ((int8x16_t)__a, (int8x16_t)__b);
>  }
>
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpneq_u16 (uint16x8_t __a, uint16x8_t __b)
>  {
> -  return __builtin_mve_vcmpneq_uv8hi (__a, __b);
> +  return __builtin_mve_vcmpneq_sv8hi ((int16x8_t)__a, (int16x8_t)__b);
>  }
>
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpneq_u32 (uint32x4_t __a, uint32x4_t __b)
>  {
> -  return __builtin_mve_vcmpneq_uv4si (__a, __b);
> +  return __builtin_mve_vcmpneq_sv4si ((int32x4_t)__a, (int32x4_t)__b);
>  }
>
>  __extension__ extern __inline int8x16_t
> @@ -3932,7 +3932,7 @@ __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpneq_n_u8 (uint8x16_t __a, uint8_t __b)
>  {
> -  return __builtin_mve_vcmpneq_n_uv16qi (__a, __b);
> +  return __builtin_mve_vcmpneq_n_sv16qi ((int8x16_t)__a, (int8_t)__b);
>  }
>
>  __extension__ extern __inline mve_pred16_t
> @@ -3953,14 +3953,14 @@ __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpeqq_u8 (uint8x16_t __a, uint8x16_t __b)
>  {
> -  return __builtin_mve_vcmpeqq_uv16qi (__a, __b);
> +  return __builtin_mve_vcmpeqq_sv16qi ((int8x16_t)__a, (int8x16_t)__b);
>  }
>
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpeqq_n_u8 (uint8x16_t __a, uint8_t __b)
>  {
> -  return __builtin_mve_vcmpeqq_n_uv16qi (__a, __b);
> +  return __builtin_mve_vcmpeqq_n_sv16qi ((int8x16_t)__a, (int8_t)__b);
>  }
>
>  __extension__ extern __inline mve_pred16_t
> @@ -4774,7 +4774,7 @@ __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpneq_n_u16 (uint16x8_t __a, uint16_t __b)
>  {
> -  return __builtin_mve_vcmpneq_n_uv8hi (__a, __b);
> +  return __builtin_mve_vcmpneq_n_sv8hi ((int16x8_t)__a, (int16_t)__b);
>  }
>
>  __extension__ extern __inline mve_pred16_t
> @@ -4795,14 +4795,14 @@ __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpeqq_u16 (uint16x8_t __a, uint16x8_t __b)
>  {
> -  return __builtin_mve_vcmpeqq_uv8hi (__a, __b);
> +  return __builtin_mve_vcmpeqq_sv8hi ((int16x8_t)__a, (int16x8_t)__b);
>  }
>
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpeqq_n_u16 (uint16x8_t __a, uint16_t __b)
>  {
> -  return __builtin_mve_vcmpeqq_n_uv8hi (__a, __b);
> +  return __builtin_mve_vcmpeqq_n_sv8hi ((int16x8_t)__a, (int16_t)__b);
>  }
>
>  __extension__ extern __inline mve_pred16_t
> @@ -5616,7 +5616,7 @@ __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpneq_n_u32 (uint32x4_t __a, uint32_t __b)
>  {
> -  return __builtin_mve_vcmpneq_n_uv4si (__a, __b);
> +  return __builtin_mve_vcmpneq_n_sv4si ((int32x4_t)__a, (int32_t)__b);
>  }
>
>  __extension__ extern __inline mve_pred16_t
> @@ -5637,14 +5637,14 @@ __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpeqq_u32 (uint32x4_t __a, uint32x4_t __b)
>  {
> -  return __builtin_mve_vcmpeqq_uv4si (__a, __b);
> +  return __builtin_mve_vcmpeqq_sv4si ((int32x4_t)__a, (int32x4_t)__b);
>  }
>
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpeqq_n_u32 (uint32x4_t __a, uint32_t __b)
>  {
> -  return __builtin_mve_vcmpeqq_n_uv4si (__a, __b);
> +  return __builtin_mve_vcmpeqq_n_sv4si ((int32x4_t)__a, (int32_t)__b);
>  }
>
>  __extension__ extern __inline mve_pred16_t
> --
> 2.7.4
>

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH 1/9] arm: MVE: Convert vcmp[eq|ne]* in arm_mve.h to use only 's' builtin version
  2021-04-30 14:09 [PATCH 1/9] arm: MVE: Convert vcmp[eq|ne]* in arm_mve.h to use only 's' builtin version Christophe Lyon
                   ` (8 preceding siblings ...)
  2021-05-10 11:21 ` [PATCH 1/9] arm: MVE: Convert vcmp[eq|ne]* in arm_mve.h to use only 's' builtin version Christophe Lyon
@ 2021-05-10 11:54 ` Kyrylo Tkachov
  9 siblings, 0 replies; 35+ messages in thread
From: Kyrylo Tkachov @ 2021-05-10 11:54 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: gcc-patches



> -----Original Message-----
> From: Gcc-patches <gcc-patches-bounces@gcc.gnu.org> On Behalf Of
> Christophe Lyon via Gcc-patches
> Sent: 30 April 2021 15:10
> To: gcc-patches@gcc.gnu.org
> Subject: [PATCH 1/9] arm: MVE: Convert vcmp[eq|ne]* in arm_mve.h to use
> only 's' builtin version
> 
> There is no need to have a signed and an unsigned version of these
> builtins. This is similar to what we do for Neon in arm_neon.h.
> This mechanical patch enables later cleanup patches.

Ok.
Thanks, the patches up to 4/9 look like good mechanical cleanups; the code-gen changes come after 5/9. I'll get to them soon...
Kyrill

> 
> 2021-03-01  Christophe Lyon  <christophe.lyon@linaro.org>
> 
> 	gcc/
> 	* config/arm/arm_mve.h (__arm_vcmpeq*u*, __arm_vcmpne*u*):
> Call
> 	the 's' version of the builtin.
> ---
>  gcc/config/arm/arm_mve.h | 24 ++++++++++++------------
>  1 file changed, 12 insertions(+), 12 deletions(-)
> 
> diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
> index 3a40c6e..e4dfe91 100644
> --- a/gcc/config/arm/arm_mve.h
> +++ b/gcc/config/arm/arm_mve.h
> @@ -3695,21 +3695,21 @@ __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpneq_u8 (uint8x16_t __a, uint8x16_t __b)
>  {
> -  return __builtin_mve_vcmpneq_uv16qi (__a, __b);
> +  return __builtin_mve_vcmpneq_sv16qi ((int8x16_t)__a, (int8x16_t)__b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpneq_u16 (uint16x8_t __a, uint16x8_t __b)
>  {
> -  return __builtin_mve_vcmpneq_uv8hi (__a, __b);
> +  return __builtin_mve_vcmpneq_sv8hi ((int16x8_t)__a, (int16x8_t)__b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpneq_u32 (uint32x4_t __a, uint32x4_t __b)
>  {
> -  return __builtin_mve_vcmpneq_uv4si (__a, __b);
> +  return __builtin_mve_vcmpneq_sv4si ((int32x4_t)__a, (int32x4_t)__b);
>  }
> 
>  __extension__ extern __inline int8x16_t
> @@ -3932,7 +3932,7 @@ __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpneq_n_u8 (uint8x16_t __a, uint8_t __b)
>  {
> -  return __builtin_mve_vcmpneq_n_uv16qi (__a, __b);
> +  return __builtin_mve_vcmpneq_n_sv16qi ((int8x16_t)__a, (int8_t)__b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
> @@ -3953,14 +3953,14 @@ __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpeqq_u8 (uint8x16_t __a, uint8x16_t __b)
>  {
> -  return __builtin_mve_vcmpeqq_uv16qi (__a, __b);
> +  return __builtin_mve_vcmpeqq_sv16qi ((int8x16_t)__a, (int8x16_t)__b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpeqq_n_u8 (uint8x16_t __a, uint8_t __b)
>  {
> -  return __builtin_mve_vcmpeqq_n_uv16qi (__a, __b);
> +  return __builtin_mve_vcmpeqq_n_sv16qi ((int8x16_t)__a, (int8_t)__b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
> @@ -4774,7 +4774,7 @@ __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpneq_n_u16 (uint16x8_t __a, uint16_t __b)
>  {
> -  return __builtin_mve_vcmpneq_n_uv8hi (__a, __b);
> +  return __builtin_mve_vcmpneq_n_sv8hi ((int16x8_t)__a, (int16_t)__b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
> @@ -4795,14 +4795,14 @@ __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpeqq_u16 (uint16x8_t __a, uint16x8_t __b)
>  {
> -  return __builtin_mve_vcmpeqq_uv8hi (__a, __b);
> +  return __builtin_mve_vcmpeqq_sv8hi ((int16x8_t)__a, (int16x8_t)__b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpeqq_n_u16 (uint16x8_t __a, uint16_t __b)
>  {
> -  return __builtin_mve_vcmpeqq_n_uv8hi (__a, __b);
> +  return __builtin_mve_vcmpeqq_n_sv8hi ((int16x8_t)__a, (int16_t)__b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
> @@ -5616,7 +5616,7 @@ __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpneq_n_u32 (uint32x4_t __a, uint32_t __b)
>  {
> -  return __builtin_mve_vcmpneq_n_uv4si (__a, __b);
> +  return __builtin_mve_vcmpneq_n_sv4si ((int32x4_t)__a, (int32_t)__b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
> @@ -5637,14 +5637,14 @@ __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpeqq_u32 (uint32x4_t __a, uint32x4_t __b)
>  {
> -  return __builtin_mve_vcmpeqq_uv4si (__a, __b);
> +  return __builtin_mve_vcmpeqq_sv4si ((int32x4_t)__a, (int32x4_t)__b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpeqq_n_u32 (uint32x4_t __a, uint32_t __b)
>  {
> -  return __builtin_mve_vcmpeqq_n_uv4si (__a, __b);
> +  return __builtin_mve_vcmpeqq_n_sv4si ((int32x4_t)__a, (int32_t)__b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
> --
> 2.7.4


^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH 2/9] arm: MVE: Cleanup vcmpne/vcmpeq builtins
  2021-04-30 14:09 ` [PATCH 2/9] arm: MVE: Cleanup vcmpne/vcmpeq builtins Christophe Lyon
@ 2021-05-10 11:57   ` Kyrylo Tkachov
  0 siblings, 0 replies; 35+ messages in thread
From: Kyrylo Tkachov @ 2021-05-10 11:57 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: gcc-patches



> -----Original Message-----
> From: Gcc-patches <gcc-patches-bounces@gcc.gnu.org> On Behalf Of
> Christophe Lyon via Gcc-patches
> Sent: 30 April 2021 15:10
> To: gcc-patches@gcc.gnu.org
> Subject: [PATCH 2/9] arm: MVE: Cleanup vcmpne/vcmpeq builtins
> 
> After the previous patch, we no longer need to emit the unsigned
> variants of vcmpneq/vcmpeqq. This patch removes them as well as the
> corresponding iterator entries.

Ok.
Thanks,
Kyrill

> 
> 2021-03-01  Christophe Lyon  <christophe.lyon@linaro.org>
> 
> 	gcc/
> 	* config/arm/arm_mve_builtins.def (vcmpneq_u): Remove.
> 	(vcmpneq_n_u): Likewise.
> 	(vcmpeqq_u): Likewise.
> 	(vcmpeqq_n_u): Likewise.
> 	* config/arm/iterators.md (supf): Remove VCMPNEQ_U,
> VCMPEQQ_U,
> 	VCMPEQQ_N_U and VCMPNEQ_N_U.
> 	* config/arm/mve.md (mve_vcmpneq): Remove <supf> iteration.
> 	(mve_vcmpeqq_n): Likewise.
> 	(mve_vcmpeqq): Likewise.
> 	(mve_vcmpneq_n): Likewise.
> 
> arm_mve_builtins.def: Remove vcmpneq_u, vcmpneq_n_u, vcmpeqq_u,
> vcmpeqq_n_u.
> iterators.md: Update VCMPNEQ, VCMPEQQ, VCMPEQQ_N, VCMPNEQ_N.
> mve.md: Remove vcmpneq_u, vcmpeqq_n_u, vcmpeqq_u, vcmpneq_n_u.
> ---
>  gcc/config/arm/arm_mve_builtins.def |  4 ----
>  gcc/config/arm/iterators.md         | 15 +++++++--------
>  gcc/config/arm/mve.md               | 16 ++++++++--------
>  3 files changed, 15 insertions(+), 20 deletions(-)
> 
> diff --git a/gcc/config/arm/arm_mve_builtins.def
> b/gcc/config/arm/arm_mve_builtins.def
> index 460f6ba..ee34fd1 100644
> --- a/gcc/config/arm/arm_mve_builtins.def
> +++ b/gcc/config/arm/arm_mve_builtins.def
> @@ -90,7 +90,6 @@ VAR3 (BINOP_NONE_NONE_IMM, vshrq_n_s, v16qi,
> v8hi, v4si)
>  VAR1 (BINOP_NONE_NONE_UNONE, vaddlvq_p_s, v4si)
>  VAR1 (BINOP_UNONE_UNONE_UNONE, vaddlvq_p_u, v4si)
>  VAR3 (BINOP_UNONE_NONE_NONE, vcmpneq_s, v16qi, v8hi, v4si)
> -VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpneq_u, v16qi, v8hi, v4si)
>  VAR3 (BINOP_NONE_NONE_NONE, vshlq_s, v16qi, v8hi, v4si)
>  VAR3 (BINOP_UNONE_UNONE_NONE, vshlq_u, v16qi, v8hi, v4si)
>  VAR3 (BINOP_UNONE_UNONE_UNONE, vsubq_u, v16qi, v8hi, v4si)
> @@ -118,11 +117,8 @@ VAR3 (BINOP_UNONE_UNONE_UNONE,
> vhsubq_n_u, v16qi, v8hi, v4si)
>  VAR3 (BINOP_UNONE_UNONE_UNONE, vhaddq_u, v16qi, v8hi, v4si)
>  VAR3 (BINOP_UNONE_UNONE_UNONE, vhaddq_n_u, v16qi, v8hi, v4si)
>  VAR3 (BINOP_UNONE_UNONE_UNONE, veorq_u, v16qi, v8hi, v4si)
> -VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpneq_n_u, v16qi, v8hi, v4si)
>  VAR3 (BINOP_UNONE_UNONE_UNONE, vcmphiq_u, v16qi, v8hi, v4si)
>  VAR3 (BINOP_UNONE_UNONE_UNONE, vcmphiq_n_u, v16qi, v8hi, v4si)
> -VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpeqq_u, v16qi, v8hi, v4si)
> -VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpeqq_n_u, v16qi, v8hi, v4si)
>  VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_u, v16qi, v8hi, v4si)
>  VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_n_u, v16qi, v8hi, v4si)
>  VAR3 (BINOP_UNONE_UNONE_UNONE, vbicq_u, v16qi, v8hi, v4si)
> diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
> index 8fb723e..0aba93f 100644
> --- a/gcc/config/arm/iterators.md
> +++ b/gcc/config/arm/iterators.md
> @@ -1279,13 +1279,12 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s")
> (VCVTQ_TO_F_U "u") (VREV16Q_S "s")
>  		       (VCREATEQ_U "u") (VCREATEQ_S "s") (VSHRQ_N_S "s")
>  		       (VSHRQ_N_U "u") (VCVTQ_N_FROM_F_S "s") (VSHLQ_U
> "u")
>  		       (VCVTQ_N_FROM_F_U "u") (VADDLVQ_P_S "s")
> (VSHLQ_S "s")
> -		       (VADDLVQ_P_U "u") (VCMPNEQ_U "u") (VCMPNEQ_S "s")
> +		       (VADDLVQ_P_U "u") (VCMPNEQ_S "s")
>  		       (VABDQ_M_S "s") (VABDQ_M_U "u") (VABDQ_S "s")
>  		       (VABDQ_U "u") (VADDQ_N_S "s") (VADDQ_N_U "u")
>  		       (VADDVQ_P_S "s")	(VADDVQ_P_U "u") (VBRSRQ_N_S "s")
> -		       (VBRSRQ_N_U "u") (VCMPEQQ_S "s") (VCMPEQQ_U "u")
> -		       (VCMPEQQ_N_S "s") (VCMPEQQ_N_U "u")
> (VCMPNEQ_N_S "s")
> -		       (VCMPNEQ_N_U "u")
> +		       (VBRSRQ_N_U "u") (VCMPEQQ_S "s")
> +		       (VCMPEQQ_N_S "s") (VCMPNEQ_N_S "s")
>  		       (VHADDQ_N_S "s") (VHADDQ_N_U "u") (VHADDQ_S "s")
>  		       (VHADDQ_U "u") (VHSUBQ_N_S "s")
> 	(VHSUBQ_N_U "u")
>  		       (VHSUBQ_S "s") (VMAXQ_S "s") (VMAXQ_U "u")
> (VHSUBQ_U "u")
> @@ -1541,16 +1540,16 @@ (define_int_iterator VCREATEQ [VCREATEQ_U
> VCREATEQ_S])
>  (define_int_iterator VSHRQ_N [VSHRQ_N_S VSHRQ_N_U])
>  (define_int_iterator VCVTQ_N_FROM_F [VCVTQ_N_FROM_F_S
> VCVTQ_N_FROM_F_U])
>  (define_int_iterator VADDLVQ_P [VADDLVQ_P_S VADDLVQ_P_U])
> -(define_int_iterator VCMPNEQ [VCMPNEQ_U VCMPNEQ_S])
> +(define_int_iterator VCMPNEQ [VCMPNEQ_S])
>  (define_int_iterator VSHLQ [VSHLQ_S VSHLQ_U])
>  (define_int_iterator VABDQ [VABDQ_S VABDQ_U])
>  (define_int_iterator VADDQ_N [VADDQ_N_S VADDQ_N_U])
>  (define_int_iterator VADDVAQ [VADDVAQ_S VADDVAQ_U])
>  (define_int_iterator VADDVQ_P [VADDVQ_P_U VADDVQ_P_S])
>  (define_int_iterator VBRSRQ_N [VBRSRQ_N_U VBRSRQ_N_S])
> -(define_int_iterator VCMPEQQ [VCMPEQQ_U VCMPEQQ_S])
> -(define_int_iterator VCMPEQQ_N [VCMPEQQ_N_S VCMPEQQ_N_U])
> -(define_int_iterator VCMPNEQ_N [VCMPNEQ_N_U VCMPNEQ_N_S])
> +(define_int_iterator VCMPEQQ [VCMPEQQ_S])
> +(define_int_iterator VCMPEQQ_N [VCMPEQQ_N_S])
> +(define_int_iterator VCMPNEQ_N [VCMPNEQ_N_S])
>  (define_int_iterator VHADDQ [VHADDQ_S VHADDQ_U])
>  (define_int_iterator VHADDQ_N [VHADDQ_N_U VHADDQ_N_S])
>  (define_int_iterator VHSUBQ [VHSUBQ_S VHSUBQ_U])
> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> index 7467d5f..b04c22b 100644
> --- a/gcc/config/arm/mve.md
> +++ b/gcc/config/arm/mve.md
> @@ -836,9 +836,9 @@ (define_insn "mve_vaddlvq_p_<supf>v4si"
>     (set_attr "length""8")])
> 
>  ;;
> -;; [vcmpneq_u, vcmpneq_s])
> +;; [vcmpneq_s])
>  ;;
> -(define_insn "mve_vcmpneq_<supf><mode>"
> +(define_insn "mve_vcmpneq_s<mode>"
>    [
>     (set (match_operand:HI 0 "vpr_register_operand" "=Up")
>  	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> @@ -1035,9 +1035,9 @@ (define_insn "mve_vcmpcsq_u<mode>"
>  ])
> 
>  ;;
> -;; [vcmpeqq_n_s, vcmpeqq_n_u])
> +;; [vcmpeqq_n_s])
>  ;;
> -(define_insn "mve_vcmpeqq_n_<supf><mode>"
> +(define_insn "mve_vcmpeqq_n_s<mode>"
>    [
>     (set (match_operand:HI 0 "vpr_register_operand" "=Up")
>  	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> @@ -1050,9 +1050,9 @@ (define_insn "mve_vcmpeqq_n_<supf><mode>"
>  ])
> 
>  ;;
> -;; [vcmpeqq_u, vcmpeqq_s])
> +;; [vcmpeqq_s])
>  ;;
> -(define_insn "mve_vcmpeqq_<supf><mode>"
> +(define_insn "mve_vcmpeqq_s<mode>"
>    [
>     (set (match_operand:HI 0 "vpr_register_operand" "=Up")
>  	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> @@ -1215,9 +1215,9 @@ (define_insn "mve_vcmpltq_s<mode>"
>  ])
> 
>  ;;
> -;; [vcmpneq_n_u, vcmpneq_n_s])
> +;; [vcmpneq_n_s])
>  ;;
> -(define_insn "mve_vcmpneq_n_<supf><mode>"
> +(define_insn "mve_vcmpneq_n_s<mode>"
>    [
>     (set (match_operand:HI 0 "vpr_register_operand" "=Up")
>  	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> --
> 2.7.4


^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH 3/9] arm: MVE: Remove _s and _u suffixes from vcmp* builtins.
  2021-04-30 14:09 ` [PATCH 3/9] arm: MVE: Remove _s and _u suffixes from vcmp* builtins Christophe Lyon
@ 2021-05-10 11:58   ` Kyrylo Tkachov
  0 siblings, 0 replies; 35+ messages in thread
From: Kyrylo Tkachov @ 2021-05-10 11:58 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: gcc-patches



> -----Original Message-----
> From: Gcc-patches <gcc-patches-bounces@gcc.gnu.org> On Behalf Of
> Christophe Lyon via Gcc-patches
> Sent: 30 April 2021 15:10
> To: gcc-patches@gcc.gnu.org
> Subject: [PATCH 3/9] arm: MVE: Remove _s and _u suffixes from vcmp*
> builtins.
> 
> This patch brings more unification in the vector comparison builtins,
> by removing the useless 's' (signed) and 'u' (unsigned) suffixes, since
> no comparison needs both a signed and an unsigned version anymore.
> 

Ok.
Thanks,
Kyrill

> 2021-03-01  Christophe Lyon  <christophe.lyon@linaro.org>
> 
> 	gcc/
> 	* config/arm/arm_mve.h (__arm_vcmp*): Remove 's' suffix.
> 	* config/arm/arm_mve_builtins.def (vcmp*): Remove 's' suffix.
> 	* config/arm/mve.md (mve_vcmp*): Remove 's' suffix in pattern
> 	names.
> ---
>  gcc/config/arm/arm_mve.h            | 120 ++++++++++++++++++-----------------
> -
>  gcc/config/arm/arm_mve_builtins.def |  32 +++++-----
>  gcc/config/arm/mve.md               |  64 +++++++++----------
>  3 files changed, 108 insertions(+), 108 deletions(-)
> 
> diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
> index e4dfe91..5d78269 100644
> --- a/gcc/config/arm/arm_mve.h
> +++ b/gcc/config/arm/arm_mve.h
> @@ -3674,42 +3674,42 @@ __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpneq_s8 (int8x16_t __a, int8x16_t __b)
>  {
> -  return __builtin_mve_vcmpneq_sv16qi (__a, __b);
> +  return __builtin_mve_vcmpneq_v16qi (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpneq_s16 (int16x8_t __a, int16x8_t __b)
>  {
> -  return __builtin_mve_vcmpneq_sv8hi (__a, __b);
> +  return __builtin_mve_vcmpneq_v8hi (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpneq_s32 (int32x4_t __a, int32x4_t __b)
>  {
> -  return __builtin_mve_vcmpneq_sv4si (__a, __b);
> +  return __builtin_mve_vcmpneq_v4si (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpneq_u8 (uint8x16_t __a, uint8x16_t __b)
>  {
> -  return __builtin_mve_vcmpneq_sv16qi ((int8x16_t)__a, (int8x16_t)__b);
> +  return __builtin_mve_vcmpneq_v16qi ((int8x16_t)__a, (int8x16_t)__b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpneq_u16 (uint16x8_t __a, uint16x8_t __b)
>  {
> -  return __builtin_mve_vcmpneq_sv8hi ((int16x8_t)__a, (int16x8_t)__b);
> +  return __builtin_mve_vcmpneq_v8hi ((int16x8_t)__a, (int16x8_t)__b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpneq_u32 (uint32x4_t __a, uint32x4_t __b)
>  {
> -  return __builtin_mve_vcmpneq_sv4si ((int32x4_t)__a, (int32x4_t)__b);
> +  return __builtin_mve_vcmpneq_v4si ((int32x4_t)__a, (int32x4_t)__b);
>  }
> 
>  __extension__ extern __inline int8x16_t
> @@ -3932,49 +3932,49 @@ __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpneq_n_u8 (uint8x16_t __a, uint8_t __b)
>  {
> -  return __builtin_mve_vcmpneq_n_sv16qi ((int8x16_t)__a, (int8_t)__b);
> +  return __builtin_mve_vcmpneq_n_v16qi ((int8x16_t)__a, (int8_t)__b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmphiq_u8 (uint8x16_t __a, uint8x16_t __b)
>  {
> -  return __builtin_mve_vcmphiq_uv16qi (__a, __b);
> +  return __builtin_mve_vcmphiq_v16qi (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmphiq_n_u8 (uint8x16_t __a, uint8_t __b)
>  {
> -  return __builtin_mve_vcmphiq_n_uv16qi (__a, __b);
> +  return __builtin_mve_vcmphiq_n_v16qi (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpeqq_u8 (uint8x16_t __a, uint8x16_t __b)
>  {
> -  return __builtin_mve_vcmpeqq_sv16qi ((int8x16_t)__a, (int8x16_t)__b);
> +  return __builtin_mve_vcmpeqq_v16qi ((int8x16_t)__a, (int8x16_t)__b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpeqq_n_u8 (uint8x16_t __a, uint8_t __b)
>  {
> -  return __builtin_mve_vcmpeqq_n_sv16qi ((int8x16_t)__a, (int8_t)__b);
> +  return __builtin_mve_vcmpeqq_n_v16qi ((int8x16_t)__a, (int8_t)__b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpcsq_u8 (uint8x16_t __a, uint8x16_t __b)
>  {
> -  return __builtin_mve_vcmpcsq_uv16qi (__a, __b);
> +  return __builtin_mve_vcmpcsq_v16qi (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpcsq_n_u8 (uint8x16_t __a, uint8_t __b)
>  {
> -  return __builtin_mve_vcmpcsq_n_uv16qi (__a, __b);
> +  return __builtin_mve_vcmpcsq_n_v16qi (__a, __b);
>  }
> 
>  __extension__ extern __inline uint8x16_t
> @@ -4144,77 +4144,77 @@ __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpneq_n_s8 (int8x16_t __a, int8_t __b)
>  {
> -  return __builtin_mve_vcmpneq_n_sv16qi (__a, __b);
> +  return __builtin_mve_vcmpneq_n_v16qi (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpltq_s8 (int8x16_t __a, int8x16_t __b)
>  {
> -  return __builtin_mve_vcmpltq_sv16qi (__a, __b);
> +  return __builtin_mve_vcmpltq_v16qi (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpltq_n_s8 (int8x16_t __a, int8_t __b)
>  {
> -  return __builtin_mve_vcmpltq_n_sv16qi (__a, __b);
> +  return __builtin_mve_vcmpltq_n_v16qi (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpleq_s8 (int8x16_t __a, int8x16_t __b)
>  {
> -  return __builtin_mve_vcmpleq_sv16qi (__a, __b);
> +  return __builtin_mve_vcmpleq_v16qi (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpleq_n_s8 (int8x16_t __a, int8_t __b)
>  {
> -  return __builtin_mve_vcmpleq_n_sv16qi (__a, __b);
> +  return __builtin_mve_vcmpleq_n_v16qi (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpgtq_s8 (int8x16_t __a, int8x16_t __b)
>  {
> -  return __builtin_mve_vcmpgtq_sv16qi (__a, __b);
> +  return __builtin_mve_vcmpgtq_v16qi (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpgtq_n_s8 (int8x16_t __a, int8_t __b)
>  {
> -  return __builtin_mve_vcmpgtq_n_sv16qi (__a, __b);
> +  return __builtin_mve_vcmpgtq_n_v16qi (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpgeq_s8 (int8x16_t __a, int8x16_t __b)
>  {
> -  return __builtin_mve_vcmpgeq_sv16qi (__a, __b);
> +  return __builtin_mve_vcmpgeq_v16qi (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpgeq_n_s8 (int8x16_t __a, int8_t __b)
>  {
> -  return __builtin_mve_vcmpgeq_n_sv16qi (__a, __b);
> +  return __builtin_mve_vcmpgeq_n_v16qi (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpeqq_s8 (int8x16_t __a, int8x16_t __b)
>  {
> -  return __builtin_mve_vcmpeqq_sv16qi (__a, __b);
> +  return __builtin_mve_vcmpeqq_v16qi (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpeqq_n_s8 (int8x16_t __a, int8_t __b)
>  {
> -  return __builtin_mve_vcmpeqq_n_sv16qi (__a, __b);
> +  return __builtin_mve_vcmpeqq_n_v16qi (__a, __b);
>  }
> 
>  __extension__ extern __inline uint8x16_t
> @@ -4774,49 +4774,49 @@ __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpneq_n_u16 (uint16x8_t __a, uint16_t __b)
>  {
> -  return __builtin_mve_vcmpneq_n_sv8hi ((int16x8_t)__a, (int16_t)__b);
> +  return __builtin_mve_vcmpneq_n_v8hi ((int16x8_t)__a, (int16_t)__b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmphiq_u16 (uint16x8_t __a, uint16x8_t __b)
>  {
> -  return __builtin_mve_vcmphiq_uv8hi (__a, __b);
> +  return __builtin_mve_vcmphiq_v8hi (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmphiq_n_u16 (uint16x8_t __a, uint16_t __b)
>  {
> -  return __builtin_mve_vcmphiq_n_uv8hi (__a, __b);
> +  return __builtin_mve_vcmphiq_n_v8hi (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpeqq_u16 (uint16x8_t __a, uint16x8_t __b)
>  {
> -  return __builtin_mve_vcmpeqq_sv8hi ((int16x8_t)__a, (int16x8_t)__b);
> +  return __builtin_mve_vcmpeqq_v8hi ((int16x8_t)__a, (int16x8_t)__b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpeqq_n_u16 (uint16x8_t __a, uint16_t __b)
>  {
> -  return __builtin_mve_vcmpeqq_n_sv8hi ((int16x8_t)__a, (int16_t)__b);
> +  return __builtin_mve_vcmpeqq_n_v8hi ((int16x8_t)__a, (int16_t)__b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpcsq_u16 (uint16x8_t __a, uint16x8_t __b)
>  {
> -  return __builtin_mve_vcmpcsq_uv8hi (__a, __b);
> +  return __builtin_mve_vcmpcsq_v8hi (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpcsq_n_u16 (uint16x8_t __a, uint16_t __b)
>  {
> -  return __builtin_mve_vcmpcsq_n_uv8hi (__a, __b);
> +  return __builtin_mve_vcmpcsq_n_v8hi (__a, __b);
>  }
> 
>  __extension__ extern __inline uint16x8_t
> @@ -4986,77 +4986,77 @@ __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpneq_n_s16 (int16x8_t __a, int16_t __b)
>  {
> -  return __builtin_mve_vcmpneq_n_sv8hi (__a, __b);
> +  return __builtin_mve_vcmpneq_n_v8hi (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpltq_s16 (int16x8_t __a, int16x8_t __b)
>  {
> -  return __builtin_mve_vcmpltq_sv8hi (__a, __b);
> +  return __builtin_mve_vcmpltq_v8hi (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpltq_n_s16 (int16x8_t __a, int16_t __b)
>  {
> -  return __builtin_mve_vcmpltq_n_sv8hi (__a, __b);
> +  return __builtin_mve_vcmpltq_n_v8hi (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpleq_s16 (int16x8_t __a, int16x8_t __b)
>  {
> -  return __builtin_mve_vcmpleq_sv8hi (__a, __b);
> +  return __builtin_mve_vcmpleq_v8hi (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpleq_n_s16 (int16x8_t __a, int16_t __b)
>  {
> -  return __builtin_mve_vcmpleq_n_sv8hi (__a, __b);
> +  return __builtin_mve_vcmpleq_n_v8hi (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpgtq_s16 (int16x8_t __a, int16x8_t __b)
>  {
> -  return __builtin_mve_vcmpgtq_sv8hi (__a, __b);
> +  return __builtin_mve_vcmpgtq_v8hi (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpgtq_n_s16 (int16x8_t __a, int16_t __b)
>  {
> -  return __builtin_mve_vcmpgtq_n_sv8hi (__a, __b);
> +  return __builtin_mve_vcmpgtq_n_v8hi (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpgeq_s16 (int16x8_t __a, int16x8_t __b)
>  {
> -  return __builtin_mve_vcmpgeq_sv8hi (__a, __b);
> +  return __builtin_mve_vcmpgeq_v8hi (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpgeq_n_s16 (int16x8_t __a, int16_t __b)
>  {
> -  return __builtin_mve_vcmpgeq_n_sv8hi (__a, __b);
> +  return __builtin_mve_vcmpgeq_n_v8hi (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpeqq_s16 (int16x8_t __a, int16x8_t __b)
>  {
> -  return __builtin_mve_vcmpeqq_sv8hi (__a, __b);
> +  return __builtin_mve_vcmpeqq_v8hi (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpeqq_n_s16 (int16x8_t __a, int16_t __b)
>  {
> -  return __builtin_mve_vcmpeqq_n_sv8hi (__a, __b);
> +  return __builtin_mve_vcmpeqq_n_v8hi (__a, __b);
>  }
> 
>  __extension__ extern __inline uint16x8_t
> @@ -5616,49 +5616,49 @@ __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpneq_n_u32 (uint32x4_t __a, uint32_t __b)
>  {
> -  return __builtin_mve_vcmpneq_n_sv4si ((int32x4_t)__a, (int32_t)__b);
> +  return __builtin_mve_vcmpneq_n_v4si ((int32x4_t)__a, (int32_t)__b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmphiq_u32 (uint32x4_t __a, uint32x4_t __b)
>  {
> -  return __builtin_mve_vcmphiq_uv4si (__a, __b);
> +  return __builtin_mve_vcmphiq_v4si (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmphiq_n_u32 (uint32x4_t __a, uint32_t __b)
>  {
> -  return __builtin_mve_vcmphiq_n_uv4si (__a, __b);
> +  return __builtin_mve_vcmphiq_n_v4si (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpeqq_u32 (uint32x4_t __a, uint32x4_t __b)
>  {
> -  return __builtin_mve_vcmpeqq_sv4si ((int32x4_t)__a, (int32x4_t)__b);
> +  return __builtin_mve_vcmpeqq_v4si ((int32x4_t)__a, (int32x4_t)__b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpeqq_n_u32 (uint32x4_t __a, uint32_t __b)
>  {
> -  return __builtin_mve_vcmpeqq_n_sv4si ((int32x4_t)__a, (int32_t)__b);
> +  return __builtin_mve_vcmpeqq_n_v4si ((int32x4_t)__a, (int32_t)__b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpcsq_u32 (uint32x4_t __a, uint32x4_t __b)
>  {
> -  return __builtin_mve_vcmpcsq_uv4si (__a, __b);
> +  return __builtin_mve_vcmpcsq_v4si (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpcsq_n_u32 (uint32x4_t __a, uint32_t __b)
>  {
> -  return __builtin_mve_vcmpcsq_n_uv4si (__a, __b);
> +  return __builtin_mve_vcmpcsq_n_v4si (__a, __b);
>  }
> 
>  __extension__ extern __inline uint32x4_t
> @@ -5828,77 +5828,77 @@ __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpneq_n_s32 (int32x4_t __a, int32_t __b)
>  {
> -  return __builtin_mve_vcmpneq_n_sv4si (__a, __b);
> +  return __builtin_mve_vcmpneq_n_v4si (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpltq_s32 (int32x4_t __a, int32x4_t __b)
>  {
> -  return __builtin_mve_vcmpltq_sv4si (__a, __b);
> +  return __builtin_mve_vcmpltq_v4si (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpltq_n_s32 (int32x4_t __a, int32_t __b)
>  {
> -  return __builtin_mve_vcmpltq_n_sv4si (__a, __b);
> +  return __builtin_mve_vcmpltq_n_v4si (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpleq_s32 (int32x4_t __a, int32x4_t __b)
>  {
> -  return __builtin_mve_vcmpleq_sv4si (__a, __b);
> +  return __builtin_mve_vcmpleq_v4si (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpleq_n_s32 (int32x4_t __a, int32_t __b)
>  {
> -  return __builtin_mve_vcmpleq_n_sv4si (__a, __b);
> +  return __builtin_mve_vcmpleq_n_v4si (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpgtq_s32 (int32x4_t __a, int32x4_t __b)
>  {
> -  return __builtin_mve_vcmpgtq_sv4si (__a, __b);
> +  return __builtin_mve_vcmpgtq_v4si (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpgtq_n_s32 (int32x4_t __a, int32_t __b)
>  {
> -  return __builtin_mve_vcmpgtq_n_sv4si (__a, __b);
> +  return __builtin_mve_vcmpgtq_n_v4si (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpgeq_s32 (int32x4_t __a, int32x4_t __b)
>  {
> -  return __builtin_mve_vcmpgeq_sv4si (__a, __b);
> +  return __builtin_mve_vcmpgeq_v4si (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpgeq_n_s32 (int32x4_t __a, int32_t __b)
>  {
> -  return __builtin_mve_vcmpgeq_n_sv4si (__a, __b);
> +  return __builtin_mve_vcmpgeq_n_v4si (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpeqq_s32 (int32x4_t __a, int32x4_t __b)
>  {
> -  return __builtin_mve_vcmpeqq_sv4si (__a, __b);
> +  return __builtin_mve_vcmpeqq_v4si (__a, __b);
>  }
> 
>  __extension__ extern __inline mve_pred16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vcmpeqq_n_s32 (int32x4_t __a, int32_t __b)
>  {
> -  return __builtin_mve_vcmpeqq_n_sv4si (__a, __b);
> +  return __builtin_mve_vcmpeqq_n_v4si (__a, __b);
>  }
> 
>  __extension__ extern __inline uint32x4_t
> diff --git a/gcc/config/arm/arm_mve_builtins.def
> b/gcc/config/arm/arm_mve_builtins.def
> index ee34fd1..e9b5b28 100644
> --- a/gcc/config/arm/arm_mve_builtins.def
> +++ b/gcc/config/arm/arm_mve_builtins.def
> @@ -89,7 +89,7 @@ VAR3 (BINOP_UNONE_UNONE_IMM, vshrq_n_u, v16qi,
> v8hi, v4si)
>  VAR3 (BINOP_NONE_NONE_IMM, vshrq_n_s, v16qi, v8hi, v4si)
>  VAR1 (BINOP_NONE_NONE_UNONE, vaddlvq_p_s, v4si)
>  VAR1 (BINOP_UNONE_UNONE_UNONE, vaddlvq_p_u, v4si)
> -VAR3 (BINOP_UNONE_NONE_NONE, vcmpneq_s, v16qi, v8hi, v4si)
> +VAR3 (BINOP_UNONE_NONE_NONE, vcmpneq_, v16qi, v8hi, v4si)
>  VAR3 (BINOP_NONE_NONE_NONE, vshlq_s, v16qi, v8hi, v4si)
>  VAR3 (BINOP_UNONE_UNONE_NONE, vshlq_u, v16qi, v8hi, v4si)
>  VAR3 (BINOP_UNONE_UNONE_UNONE, vsubq_u, v16qi, v8hi, v4si)
> @@ -117,10 +117,10 @@ VAR3 (BINOP_UNONE_UNONE_UNONE,
> vhsubq_n_u, v16qi, v8hi, v4si)
>  VAR3 (BINOP_UNONE_UNONE_UNONE, vhaddq_u, v16qi, v8hi, v4si)
>  VAR3 (BINOP_UNONE_UNONE_UNONE, vhaddq_n_u, v16qi, v8hi, v4si)
>  VAR3 (BINOP_UNONE_UNONE_UNONE, veorq_u, v16qi, v8hi, v4si)
> -VAR3 (BINOP_UNONE_UNONE_UNONE, vcmphiq_u, v16qi, v8hi, v4si)
> -VAR3 (BINOP_UNONE_UNONE_UNONE, vcmphiq_n_u, v16qi, v8hi, v4si)
> -VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_u, v16qi, v8hi, v4si)
> -VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_n_u, v16qi, v8hi, v4si)
> +VAR3 (BINOP_UNONE_UNONE_UNONE, vcmphiq_, v16qi, v8hi, v4si)
> +VAR3 (BINOP_UNONE_UNONE_UNONE, vcmphiq_n_, v16qi, v8hi, v4si)
> +VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_, v16qi, v8hi, v4si)
> +VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_n_, v16qi, v8hi, v4si)
>  VAR3 (BINOP_UNONE_UNONE_UNONE, vbicq_u, v16qi, v8hi, v4si)
>  VAR3 (BINOP_UNONE_UNONE_UNONE, vandq_u, v16qi, v8hi, v4si)
>  VAR3 (BINOP_UNONE_UNONE_UNONE, vaddvq_p_u, v16qi, v8hi, v4si)
> @@ -142,17 +142,17 @@ VAR3 (BINOP_UNONE_UNONE_NONE,
> vbrsrq_n_u, v16qi, v8hi, v4si)
>  VAR3 (BINOP_UNONE_UNONE_IMM, vshlq_n_u, v16qi, v8hi, v4si)
>  VAR3 (BINOP_UNONE_UNONE_IMM, vrshrq_n_u, v16qi, v8hi, v4si)
>  VAR3 (BINOP_UNONE_UNONE_IMM, vqshlq_n_u, v16qi, v8hi, v4si)
> -VAR3 (BINOP_UNONE_NONE_NONE, vcmpneq_n_s, v16qi, v8hi, v4si)
> -VAR3 (BINOP_UNONE_NONE_NONE, vcmpltq_s, v16qi, v8hi, v4si)
> -VAR3 (BINOP_UNONE_NONE_NONE, vcmpltq_n_s, v16qi, v8hi, v4si)
> -VAR3 (BINOP_UNONE_NONE_NONE, vcmpleq_s, v16qi, v8hi, v4si)
> -VAR3 (BINOP_UNONE_NONE_NONE, vcmpleq_n_s, v16qi, v8hi, v4si)
> -VAR3 (BINOP_UNONE_NONE_NONE, vcmpgtq_s, v16qi, v8hi, v4si)
> -VAR3 (BINOP_UNONE_NONE_NONE, vcmpgtq_n_s, v16qi, v8hi, v4si)
> -VAR3 (BINOP_UNONE_NONE_NONE, vcmpgeq_s, v16qi, v8hi, v4si)
> -VAR3 (BINOP_UNONE_NONE_NONE, vcmpgeq_n_s, v16qi, v8hi, v4si)
> -VAR3 (BINOP_UNONE_NONE_NONE, vcmpeqq_s, v16qi, v8hi, v4si)
> -VAR3 (BINOP_UNONE_NONE_NONE, vcmpeqq_n_s, v16qi, v8hi, v4si)
> +VAR3 (BINOP_UNONE_NONE_NONE, vcmpneq_n_, v16qi, v8hi, v4si)
> +VAR3 (BINOP_UNONE_NONE_NONE, vcmpltq_, v16qi, v8hi, v4si)
> +VAR3 (BINOP_UNONE_NONE_NONE, vcmpltq_n_, v16qi, v8hi, v4si)
> +VAR3 (BINOP_UNONE_NONE_NONE, vcmpleq_, v16qi, v8hi, v4si)
> +VAR3 (BINOP_UNONE_NONE_NONE, vcmpleq_n_, v16qi, v8hi, v4si)
> +VAR3 (BINOP_UNONE_NONE_NONE, vcmpgtq_, v16qi, v8hi, v4si)
> +VAR3 (BINOP_UNONE_NONE_NONE, vcmpgtq_n_, v16qi, v8hi, v4si)
> +VAR3 (BINOP_UNONE_NONE_NONE, vcmpgeq_, v16qi, v8hi, v4si)
> +VAR3 (BINOP_UNONE_NONE_NONE, vcmpgeq_n_, v16qi, v8hi, v4si)
> +VAR3 (BINOP_UNONE_NONE_NONE, vcmpeqq_, v16qi, v8hi, v4si)
> +VAR3 (BINOP_UNONE_NONE_NONE, vcmpeqq_n_, v16qi, v8hi, v4si)
>  VAR3 (BINOP_UNONE_NONE_IMM, vqshluq_n_s, v16qi, v8hi, v4si)
>  VAR3 (BINOP_NONE_NONE_UNONE, vaddvq_p_s, v16qi, v8hi, v4si)
>  VAR3 (BINOP_NONE_NONE_NONE, vsubq_s, v16qi, v8hi, v4si)
> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> index b04c22b..e9f095d 100644
> --- a/gcc/config/arm/mve.md
> +++ b/gcc/config/arm/mve.md
> @@ -836,9 +836,9 @@ (define_insn "mve_vaddlvq_p_<supf>v4si"
>     (set_attr "length""8")])
> 
>  ;;
> -;; [vcmpneq_s])
> +;; [vcmpneq_])
>  ;;
> -(define_insn "mve_vcmpneq_s<mode>"
> +(define_insn "mve_vcmpneq_<mode>"
>    [
>     (set (match_operand:HI 0 "vpr_register_operand" "=Up")
>  	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> @@ -1005,9 +1005,9 @@ (define_expand "cadd<rot><mode>3"
>  )
> 
>  ;;
> -;; [vcmpcsq_n_u])
> +;; [vcmpcsq_n_])
>  ;;
> -(define_insn "mve_vcmpcsq_n_u<mode>"
> +(define_insn "mve_vcmpcsq_n_<mode>"
>    [
>     (set (match_operand:HI 0 "vpr_register_operand" "=Up")
>  	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> @@ -1020,9 +1020,9 @@ (define_insn "mve_vcmpcsq_n_u<mode>"
>  ])
> 
>  ;;
> -;; [vcmpcsq_u])
> +;; [vcmpcsq_])
>  ;;
> -(define_insn "mve_vcmpcsq_u<mode>"
> +(define_insn "mve_vcmpcsq_<mode>"
>    [
>     (set (match_operand:HI 0 "vpr_register_operand" "=Up")
>  	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> @@ -1035,9 +1035,9 @@ (define_insn "mve_vcmpcsq_u<mode>"
>  ])
> 
>  ;;
> -;; [vcmpeqq_n_s])
> +;; [vcmpeqq_n_])
>  ;;
> -(define_insn "mve_vcmpeqq_n_s<mode>"
> +(define_insn "mve_vcmpeqq_n_<mode>"
>    [
>     (set (match_operand:HI 0 "vpr_register_operand" "=Up")
>  	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> @@ -1050,9 +1050,9 @@ (define_insn "mve_vcmpeqq_n_s<mode>"
>  ])
> 
>  ;;
> -;; [vcmpeqq_s])
> +;; [vcmpeqq_])
>  ;;
> -(define_insn "mve_vcmpeqq_s<mode>"
> +(define_insn "mve_vcmpeqq_<mode>"
>    [
>     (set (match_operand:HI 0 "vpr_register_operand" "=Up")
>  	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> @@ -1065,9 +1065,9 @@ (define_insn "mve_vcmpeqq_s<mode>"
>  ])
> 
>  ;;
> -;; [vcmpgeq_n_s])
> +;; [vcmpgeq_n_])
>  ;;
> -(define_insn "mve_vcmpgeq_n_s<mode>"
> +(define_insn "mve_vcmpgeq_n_<mode>"
>    [
>     (set (match_operand:HI 0 "vpr_register_operand" "=Up")
>  	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> @@ -1080,9 +1080,9 @@ (define_insn "mve_vcmpgeq_n_s<mode>"
>  ])
> 
>  ;;
> -;; [vcmpgeq_s])
> +;; [vcmpgeq_])
>  ;;
> -(define_insn "mve_vcmpgeq_s<mode>"
> +(define_insn "mve_vcmpgeq_<mode>"
>    [
>     (set (match_operand:HI 0 "vpr_register_operand" "=Up")
>  	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> @@ -1095,9 +1095,9 @@ (define_insn "mve_vcmpgeq_s<mode>"
>  ])
> 
>  ;;
> -;; [vcmpgtq_n_s])
> +;; [vcmpgtq_n_])
>  ;;
> -(define_insn "mve_vcmpgtq_n_s<mode>"
> +(define_insn "mve_vcmpgtq_n_<mode>"
>    [
>     (set (match_operand:HI 0 "vpr_register_operand" "=Up")
>  	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> @@ -1110,9 +1110,9 @@ (define_insn "mve_vcmpgtq_n_s<mode>"
>  ])
> 
>  ;;
> -;; [vcmpgtq_s])
> +;; [vcmpgtq_])
>  ;;
> -(define_insn "mve_vcmpgtq_s<mode>"
> +(define_insn "mve_vcmpgtq_<mode>"
>    [
>     (set (match_operand:HI 0 "vpr_register_operand" "=Up")
>  	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> @@ -1125,9 +1125,9 @@ (define_insn "mve_vcmpgtq_s<mode>"
>  ])
> 
>  ;;
> -;; [vcmphiq_n_u])
> +;; [vcmphiq_n_])
>  ;;
> -(define_insn "mve_vcmphiq_n_u<mode>"
> +(define_insn "mve_vcmphiq_n_<mode>"
>    [
>     (set (match_operand:HI 0 "vpr_register_operand" "=Up")
>  	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> @@ -1140,9 +1140,9 @@ (define_insn "mve_vcmphiq_n_u<mode>"
>  ])
> 
>  ;;
> -;; [vcmphiq_u])
> +;; [vcmphiq_])
>  ;;
> -(define_insn "mve_vcmphiq_u<mode>"
> +(define_insn "mve_vcmphiq_<mode>"
>    [
>     (set (match_operand:HI 0 "vpr_register_operand" "=Up")
>  	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> @@ -1155,9 +1155,9 @@ (define_insn "mve_vcmphiq_u<mode>"
>  ])
> 
>  ;;
> -;; [vcmpleq_n_s])
> +;; [vcmpleq_n_])
>  ;;
> -(define_insn "mve_vcmpleq_n_s<mode>"
> +(define_insn "mve_vcmpleq_n_<mode>"
>    [
>     (set (match_operand:HI 0 "vpr_register_operand" "=Up")
>  	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> @@ -1170,9 +1170,9 @@ (define_insn "mve_vcmpleq_n_s<mode>"
>  ])
> 
>  ;;
> -;; [vcmpleq_s])
> +;; [vcmpleq_])
>  ;;
> -(define_insn "mve_vcmpleq_s<mode>"
> +(define_insn "mve_vcmpleq_<mode>"
>    [
>     (set (match_operand:HI 0 "vpr_register_operand" "=Up")
>  	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> @@ -1185,9 +1185,9 @@ (define_insn "mve_vcmpleq_s<mode>"
>  ])
> 
>  ;;
> -;; [vcmpltq_n_s])
> +;; [vcmpltq_n_])
>  ;;
> -(define_insn "mve_vcmpltq_n_s<mode>"
> +(define_insn "mve_vcmpltq_n_<mode>"
>    [
>     (set (match_operand:HI 0 "vpr_register_operand" "=Up")
>  	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> @@ -1200,9 +1200,9 @@ (define_insn "mve_vcmpltq_n_s<mode>"
>  ])
> 
>  ;;
> -;; [vcmpltq_s])
> +;; [vcmpltq_])
>  ;;
> -(define_insn "mve_vcmpltq_s<mode>"
> +(define_insn "mve_vcmpltq_<mode>"
>    [
>     (set (match_operand:HI 0 "vpr_register_operand" "=Up")
>  	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> @@ -1215,9 +1215,9 @@ (define_insn "mve_vcmpltq_s<mode>"
>  ])
> 
>  ;;
> -;; [vcmpneq_n_s])
> +;; [vcmpneq_n_])
>  ;;
> -(define_insn "mve_vcmpneq_n_s<mode>"
> +(define_insn "mve_vcmpneq_n_<mode>"
>    [
>     (set (match_operand:HI 0 "vpr_register_operand" "=Up")
>  	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> --
> 2.7.4


^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH 4/9] arm: MVE: Factorize all vcmp* integer patterns
  2021-04-30 14:09 ` [PATCH 4/9] arm: MVE: Factorize all vcmp* integer patterns Christophe Lyon
@ 2021-05-10 11:59   ` Kyrylo Tkachov
  0 siblings, 0 replies; 35+ messages in thread
From: Kyrylo Tkachov @ 2021-05-10 11:59 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: gcc-patches



> -----Original Message-----
> From: Gcc-patches <gcc-patches-bounces@gcc.gnu.org> On Behalf Of
> Christophe Lyon via Gcc-patches
> Sent: 30 April 2021 15:10
> To: gcc-patches@gcc.gnu.org
> Subject: [PATCH 4/9] arm: MVE: Factorize all vcmp* integer patterns
> 
> After removing the signed and unsigned suffixes in the previous
> patches, we can now factorize the vcmp* patterns: there is no longer
> an asymmetry where operators do not have the same set of signed and
> unsigned variants.
> 
> This will make maintenance easier.

Ok.
Thanks,
Kyrill

> 
> MVE has a different set of vector comparison operators than Neon,
> so we have to introduce dedicated iterators.
> 
> 2021-03-01  Christophe Lyon  <christophe.lyon@linaro.org>
> 
> 	gcc/
> 	* config/arm/iterators.md (MVE_COMPARISONS): New.
> 	(mve_cmp_op): New.
> 	(mve_cmp_type): New.
> 	* config/arm/mve.md (mve_vcmp<mve_cmp_op>q_<mode>): New,
> merge all
> 	mve_vcmp patterns.
> 	(mve_vcmpneq_<mode>, mve_vcmpcsq_n_<mode>,
> mve_vcmpcsq_<mode>)
> 	(mve_vcmpeqq_n_<mode>, mve_vcmpeqq_<mode>,
> mve_vcmpgeq_n_<mode>)
> 	(mve_vcmpgeq_<mode>, mve_vcmpgtq_n_<mode>,
> mve_vcmpgtq_<mode>)
> 	(mve_vcmphiq_n_<mode>, mve_vcmphiq_<mode>,
> mve_vcmpleq_n_<mode>)
> 	(mve_vcmpleq_<mode>, mve_vcmpltq_n_<mode>,
> mve_vcmpltq_<mode>)
> 	(mve_vcmpneq_n_<mode>): Remove.
> ---
>  gcc/config/arm/iterators.md |   8 ++
>  gcc/config/arm/mve.md       | 250 ++++----------------------------------------
>  2 files changed, 27 insertions(+), 231 deletions(-)
> 
> diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
> index 0aba93f..29347f7 100644
> --- a/gcc/config/arm/iterators.md
> +++ b/gcc/config/arm/iterators.md
> @@ -285,6 +285,8 @@ (define_code_iterator GTUGEU [gtu geu])
> 
>  ;; Comparisons for vc<cmp>
>  (define_code_iterator COMPARISONS [eq gt ge le lt])
> +;; Comparisons for MVE
> +(define_code_iterator MVE_COMPARISONS [eq ge geu gt gtu le lt ne])
> 
>  ;; A list of ...
>  (define_code_iterator IOR_XOR [ior xor])
> @@ -336,8 +338,14 @@ (define_code_attr arith_shift_insn
>  (define_code_attr cmp_op [(eq "eq") (gt "gt") (ge "ge") (lt "lt") (le "le")
>                            (gtu "gt") (geu "ge")])
> 
> +(define_code_attr mve_cmp_op [(eq "eq") (gt "gt") (ge "ge") (lt "lt") (le "le")
> +                              (gtu "hi") (geu "cs") (ne "ne")])
> +
>  (define_code_attr cmp_type [(eq "i") (gt "s") (ge "s") (lt "s") (le "s")])
> 
> +(define_code_attr mve_cmp_type [(eq "i") (gt "s") (ge "s") (lt "s") (le "s")
> +                                (gtu "u") (geu "u") (ne "i")])
> +
>  (define_code_attr vfml_op [(plus "a") (minus "s")])
> 
>  (define_code_attr ss_op [(ss_plus "qadd") (ss_minus "qsub")])
> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> index e9f095d..40baff7 100644
> --- a/gcc/config/arm/mve.md
> +++ b/gcc/config/arm/mve.md
> @@ -836,17 +836,30 @@ (define_insn "mve_vaddlvq_p_<supf>v4si"
>     (set_attr "length""8")])
> 
>  ;;
> -;; [vcmpneq_])
> +;; [vcmpneq_, vcmpcsq_, vcmpeqq_, vcmpgeq_, vcmpgtq_, vcmphiq_,
> vcmpleq_, vcmpltq_])
>  ;;
> -(define_insn "mve_vcmpneq_<mode>"
> +(define_insn "mve_vcmp<mve_cmp_op>q_<mode>"
>    [
>     (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> -	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> -		    (match_operand:MVE_2 2 "s_register_operand" "w")]
> -	 VCMPNEQ))
> +	(MVE_COMPARISONS:HI (match_operand:MVE_2 1
> "s_register_operand" "w")
> +		    (match_operand:MVE_2 2 "s_register_operand" "w")))
> +  ]
> +  "TARGET_HAVE_MVE"
> +  "vcmp.<mve_cmp_type>%#<V_sz_elem>  <mve_cmp_op>, %q1, %q2"
> +  [(set_attr "type" "mve_move")
> +])
> +
> +;;
> +;; [vcmpcsq_n_, vcmpeqq_n_, vcmpgeq_n_, vcmpgtq_n_, vcmphiq_n_,
> vcmpleq_n_, vcmpltq_n_, vcmpneq_n_])
> +;;
> +(define_insn "mve_vcmp<mve_cmp_op>q_n_<mode>"
> +  [
> +   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> +	(MVE_COMPARISONS:HI (match_operand:MVE_2 1
> "s_register_operand" "w")
> +		    (match_operand:<V_elem> 2 "s_register_operand" "r")))
>    ]
>    "TARGET_HAVE_MVE"
> -  "vcmp.i%#<V_sz_elem>  ne, %q1, %q2"
> +  "vcmp.<mve_cmp_type>%#<V_sz_elem>	<mve_cmp_op>, %q1, %2"
>    [(set_attr "type" "mve_move")
>  ])
> 
> @@ -1005,231 +1018,6 @@ (define_expand "cadd<rot><mode>3"
>  )
> 
>  ;;
> -;; [vcmpcsq_n_])
> -;;
> -(define_insn "mve_vcmpcsq_n_<mode>"
> -  [
> -   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> -	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> -		    (match_operand:<V_elem> 2 "s_register_operand" "r")]
> -	 VCMPCSQ_N_U))
> -  ]
> -  "TARGET_HAVE_MVE"
> -  "vcmp.u%#<V_sz_elem>	cs, %q1, %2"
> -  [(set_attr "type" "mve_move")
> -])
> -
> -;;
> -;; [vcmpcsq_])
> -;;
> -(define_insn "mve_vcmpcsq_<mode>"
> -  [
> -   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> -	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> -		    (match_operand:MVE_2 2 "s_register_operand" "w")]
> -	 VCMPCSQ_U))
> -  ]
> -  "TARGET_HAVE_MVE"
> -  "vcmp.u%#<V_sz_elem>	cs, %q1, %q2"
> -  [(set_attr "type" "mve_move")
> -])
> -
> -;;
> -;; [vcmpeqq_n_])
> -;;
> -(define_insn "mve_vcmpeqq_n_<mode>"
> -  [
> -   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> -	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> -		    (match_operand:<V_elem> 2 "s_register_operand" "r")]
> -	 VCMPEQQ_N))
> -  ]
> -  "TARGET_HAVE_MVE"
> -  "vcmp.i%#<V_sz_elem>	eq, %q1, %2"
> -  [(set_attr "type" "mve_move")
> -])
> -
> -;;
> -;; [vcmpeqq_])
> -;;
> -(define_insn "mve_vcmpeqq_<mode>"
> -  [
> -   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> -	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> -		    (match_operand:MVE_2 2 "s_register_operand" "w")]
> -	 VCMPEQQ))
> -  ]
> -  "TARGET_HAVE_MVE"
> -  "vcmp.i%#<V_sz_elem>	eq, %q1, %q2"
> -  [(set_attr "type" "mve_move")
> -])
> -
> -;;
> -;; [vcmpgeq_n_])
> -;;
> -(define_insn "mve_vcmpgeq_n_<mode>"
> -  [
> -   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> -	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> -		    (match_operand:<V_elem> 2 "s_register_operand" "r")]
> -	 VCMPGEQ_N_S))
> -  ]
> -  "TARGET_HAVE_MVE"
> -  "vcmp.s%#<V_sz_elem>	ge, %q1, %2"
> -  [(set_attr "type" "mve_move")
> -])
> -
> -;;
> -;; [vcmpgeq_])
> -;;
> -(define_insn "mve_vcmpgeq_<mode>"
> -  [
> -   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> -	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> -		    (match_operand:MVE_2 2 "s_register_operand" "w")]
> -	 VCMPGEQ_S))
> -  ]
> -  "TARGET_HAVE_MVE"
> -  "vcmp.s%#<V_sz_elem>	ge, %q1, %q2"
> -  [(set_attr "type" "mve_move")
> -])
> -
> -;;
> -;; [vcmpgtq_n_])
> -;;
> -(define_insn "mve_vcmpgtq_n_<mode>"
> -  [
> -   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> -	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> -		    (match_operand:<V_elem> 2 "s_register_operand" "r")]
> -	 VCMPGTQ_N_S))
> -  ]
> -  "TARGET_HAVE_MVE"
> -  "vcmp.s%#<V_sz_elem>	gt, %q1, %2"
> -  [(set_attr "type" "mve_move")
> -])
> -
> -;;
> -;; [vcmpgtq_])
> -;;
> -(define_insn "mve_vcmpgtq_<mode>"
> -  [
> -   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> -	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> -		    (match_operand:MVE_2 2 "s_register_operand" "w")]
> -	 VCMPGTQ_S))
> -  ]
> -  "TARGET_HAVE_MVE"
> -  "vcmp.s%#<V_sz_elem>	gt, %q1, %q2"
> -  [(set_attr "type" "mve_move")
> -])
> -
> -;;
> -;; [vcmphiq_n_])
> -;;
> -(define_insn "mve_vcmphiq_n_<mode>"
> -  [
> -   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> -	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> -		    (match_operand:<V_elem> 2 "s_register_operand" "r")]
> -	 VCMPHIQ_N_U))
> -  ]
> -  "TARGET_HAVE_MVE"
> -  "vcmp.u%#<V_sz_elem>	hi, %q1, %2"
> -  [(set_attr "type" "mve_move")
> -])
> -
> -;;
> -;; [vcmphiq_])
> -;;
> -(define_insn "mve_vcmphiq_<mode>"
> -  [
> -   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> -	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> -		    (match_operand:MVE_2 2 "s_register_operand" "w")]
> -	 VCMPHIQ_U))
> -  ]
> -  "TARGET_HAVE_MVE"
> -  "vcmp.u%#<V_sz_elem>	hi, %q1, %q2"
> -  [(set_attr "type" "mve_move")
> -])
> -
> -;;
> -;; [vcmpleq_n_])
> -;;
> -(define_insn "mve_vcmpleq_n_<mode>"
> -  [
> -   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> -	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> -		    (match_operand:<V_elem> 2 "s_register_operand" "r")]
> -	 VCMPLEQ_N_S))
> -  ]
> -  "TARGET_HAVE_MVE"
> -  "vcmp.s%#<V_sz_elem>	le, %q1, %2"
> -  [(set_attr "type" "mve_move")
> -])
> -
> -;;
> -;; [vcmpleq_])
> -;;
> -(define_insn "mve_vcmpleq_<mode>"
> -  [
> -   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> -	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> -		    (match_operand:MVE_2 2 "s_register_operand" "w")]
> -	 VCMPLEQ_S))
> -  ]
> -  "TARGET_HAVE_MVE"
> -  "vcmp.s%#<V_sz_elem>	le, %q1, %q2"
> -  [(set_attr "type" "mve_move")
> -])
> -
> -;;
> -;; [vcmpltq_n_])
> -;;
> -(define_insn "mve_vcmpltq_n_<mode>"
> -  [
> -   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> -	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> -		    (match_operand:<V_elem> 2 "s_register_operand" "r")]
> -	 VCMPLTQ_N_S))
> -  ]
> -  "TARGET_HAVE_MVE"
> -  "vcmp.s%#<V_sz_elem>	lt, %q1, %2"
> -  [(set_attr "type" "mve_move")
> -])
> -
> -;;
> -;; [vcmpltq_])
> -;;
> -(define_insn "mve_vcmpltq_<mode>"
> -  [
> -   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> -	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> -		    (match_operand:MVE_2 2 "s_register_operand" "w")]
> -	 VCMPLTQ_S))
> -  ]
> -  "TARGET_HAVE_MVE"
> -  "vcmp.s%#<V_sz_elem>	lt, %q1, %q2"
> -  [(set_attr "type" "mve_move")
> -])
> -
> -;;
> -;; [vcmpneq_n_])
> -;;
> -(define_insn "mve_vcmpneq_n_<mode>"
> -  [
> -   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> -	(unspec:HI [(match_operand:MVE_2 1 "s_register_operand" "w")
> -		    (match_operand:<V_elem> 2 "s_register_operand" "r")]
> -	 VCMPNEQ_N))
> -  ]
> -  "TARGET_HAVE_MVE"
> -  "vcmp.i%#<V_sz_elem>	ne, %q1, %2"
> -  [(set_attr "type" "mve_move")
> -])
> -
> -;;
>  ;; [veorq_u, veorq_s])
>  ;;
>  (define_insn "mve_veorq_u<mode>"
> --
> 2.7.4


^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH 5/9] arm: MVE: Factorize vcmp_*f*
  2021-04-30 14:09 ` [PATCH 5/9] arm: MVE: Factorize vcmp_*f* Christophe Lyon
@ 2021-05-10 11:59   ` Kyrylo Tkachov
  0 siblings, 0 replies; 35+ messages in thread
From: Kyrylo Tkachov @ 2021-05-10 11:59 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: gcc-patches



> -----Original Message-----
> From: Gcc-patches <gcc-patches-bounces@gcc.gnu.org> On Behalf Of
> Christophe Lyon via Gcc-patches
> Sent: 30 April 2021 15:10
> To: gcc-patches@gcc.gnu.org
> Subject: [PATCH 5/9] arm: MVE: Factorize vcmp_*f*
> 
> Like in the previous patch, we factorize the vcmp_*f* patterns to make
> maintenance easier.

Ok.
Thanks,
Kyrill

> 
> 2021-03-12  Christophe Lyon  <christophe.lyon@linaro.org>
> 
> 	gcc/
> 	* config/arm/iterators.md (MVE_FP_COMPARISONS): New.
> 	* config/arm/mve.md (mve_vcmp<mve_cmp_op>q_f<mode>)
> 	(mve_vcmp<mve_cmp_op>q_n_f<mode>): New, merge all vcmp_*f*
> 	patterns.
> 	(mve_vcmpeqq_f<mode>, mve_vcmpeqq_n_f<mode>,
> mve_vcmpgeq_f<mode>)
> 	(mve_vcmpgeq_n_f<mode>, mve_vcmpgtq_f<mode>)
> 	(mve_vcmpgtq_n_f<mode>, mve_vcmpleq_f<mode>)
> 	(mve_vcmpleq_n_f<mode>, mve_vcmpltq_f<mode>)
> 	(mve_vcmpltq_n_f<mode>, mve_vcmpneq_f<mode>)
> 	(mve_vcmpneq_n_f<mode>): Remove.
> 	* config/arm/unspecs.md (VCMPEQQ_F, VCMPEQQ_N_F,
> VCMPGEQ_F)
> 	(VCMPGEQ_N_F, VCMPGTQ_F, VCMPGTQ_N_F, VCMPLEQ_F,
> VCMPLEQ_N_F)
> 	(VCMPLTQ_F, VCMPLTQ_N_F, VCMPNEQ_F, VCMPNEQ_N_F):
> Remove.
> ---
>  gcc/config/arm/iterators.md |   1 +
>  gcc/config/arm/mve.md       | 172 +++-----------------------------------------
>  gcc/config/arm/unspecs.md   |  12 ----
>  3 files changed, 11 insertions(+), 174 deletions(-)
> 
> diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
> index 29347f7..95df8bd 100644
> --- a/gcc/config/arm/iterators.md
> +++ b/gcc/config/arm/iterators.md
> @@ -287,6 +287,7 @@ (define_code_iterator GTUGEU [gtu geu])
>  (define_code_iterator COMPARISONS [eq gt ge le lt])
>  ;; Comparisons for MVE
>  (define_code_iterator MVE_COMPARISONS [eq ge geu gt gtu le lt ne])
> +(define_code_iterator MVE_FP_COMPARISONS [eq ge gt le lt ne])
> 
>  ;; A list of ...
>  (define_code_iterator IOR_XOR [ior xor])
> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> index 40baff7..7c846a4 100644
> --- a/gcc/config/arm/mve.md
> +++ b/gcc/config/arm/mve.md
> @@ -1926,182 +1926,30 @@ (define_insn "mve_vcaddq<mve_rot><mode>"
>  ])
> 
>  ;;
> -;; [vcmpeqq_f])
> +;; [vcmpeqq_f, vcmpgeq_f, vcmpgtq_f, vcmpleq_f, vcmpltq_f, vcmpneq_f])
>  ;;
> -(define_insn "mve_vcmpeqq_f<mode>"
> +(define_insn "mve_vcmp<mve_cmp_op>q_f<mode>"
>    [
>     (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> -	(unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w")
> -		    (match_operand:MVE_0 2 "s_register_operand" "w")]
> -	 VCMPEQQ_F))
> -  ]
> -  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
> -  "vcmp.f%#<V_sz_elem>	eq, %q1, %q2"
> -  [(set_attr "type" "mve_move")
> -])
> -
> -;;
> -;; [vcmpeqq_n_f])
> -;;
> -(define_insn "mve_vcmpeqq_n_f<mode>"
> -  [
> -   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> -	(unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w")
> -		    (match_operand:<V_elem> 2 "s_register_operand" "r")]
> -	 VCMPEQQ_N_F))
> -  ]
> -  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
> -  "vcmp.f%#<V_sz_elem>	eq, %q1, %2"
> -  [(set_attr "type" "mve_move")
> -])
> -
> -;;
> -;; [vcmpgeq_f])
> -;;
> -(define_insn "mve_vcmpgeq_f<mode>"
> -  [
> -   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> -	(unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w")
> -		    (match_operand:MVE_0 2 "s_register_operand" "w")]
> -	 VCMPGEQ_F))
> -  ]
> -  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
> -  "vcmp.f%#<V_sz_elem>	ge, %q1, %q2"
> -  [(set_attr "type" "mve_move")
> -])
> -
> -;;
> -;; [vcmpgeq_n_f])
> -;;
> -(define_insn "mve_vcmpgeq_n_f<mode>"
> -  [
> -   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> -	(unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w")
> -		    (match_operand:<V_elem> 2 "s_register_operand" "r")]
> -	 VCMPGEQ_N_F))
> -  ]
> -  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
> -  "vcmp.f%#<V_sz_elem>	ge, %q1, %2"
> -  [(set_attr "type" "mve_move")
> -])
> -
> -;;
> -;; [vcmpgtq_f])
> -;;
> -(define_insn "mve_vcmpgtq_f<mode>"
> -  [
> -   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> -	(unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w")
> -		    (match_operand:MVE_0 2 "s_register_operand" "w")]
> -	 VCMPGTQ_F))
> -  ]
> -  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
> -  "vcmp.f%#<V_sz_elem>	gt, %q1, %q2"
> -  [(set_attr "type" "mve_move")
> -])
> -
> -;;
> -;; [vcmpgtq_n_f])
> -;;
> -(define_insn "mve_vcmpgtq_n_f<mode>"
> -  [
> -   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> -	(unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w")
> -		    (match_operand:<V_elem> 2 "s_register_operand" "r")]
> -	 VCMPGTQ_N_F))
> +	(MVE_FP_COMPARISONS:HI (match_operand:MVE_0 1
> "s_register_operand" "w")
> +			       (match_operand:MVE_0 2 "s_register_operand"
> "w")))
>    ]
>    "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
> -  "vcmp.f%#<V_sz_elem>	gt, %q1, %2"
> +  "vcmp.f%#<V_sz_elem>	<mve_cmp_op>, %q1, %q2"
>    [(set_attr "type" "mve_move")
>  ])
> 
>  ;;
> -;; [vcmpleq_f])
> +;; [vcmpeqq_n_f, vcmpgeq_n_f, vcmpgtq_n_f, vcmpleq_n_f, vcmpltq_n_f,
> vcmpneq_n_f])
>  ;;
> -(define_insn "mve_vcmpleq_f<mode>"
> +(define_insn "mve_vcmp<mve_cmp_op>q_n_f<mode>"
>    [
>     (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> -	(unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w")
> -		    (match_operand:MVE_0 2 "s_register_operand" "w")]
> -	 VCMPLEQ_F))
> -  ]
> -  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
> -  "vcmp.f%#<V_sz_elem>	le, %q1, %q2"
> -  [(set_attr "type" "mve_move")
> -])
> -
> -;;
> -;; [vcmpleq_n_f])
> -;;
> -(define_insn "mve_vcmpleq_n_f<mode>"
> -  [
> -   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> -	(unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w")
> -		    (match_operand:<V_elem> 2 "s_register_operand" "r")]
> -	 VCMPLEQ_N_F))
> -  ]
> -  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
> -  "vcmp.f%#<V_sz_elem>	le, %q1, %2"
> -  [(set_attr "type" "mve_move")
> -])
> -
> -;;
> -;; [vcmpltq_f])
> -;;
> -(define_insn "mve_vcmpltq_f<mode>"
> -  [
> -   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> -	(unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w")
> -		    (match_operand:MVE_0 2 "s_register_operand" "w")]
> -	 VCMPLTQ_F))
> -  ]
> -  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
> -  "vcmp.f%#<V_sz_elem>	lt, %q1, %q2"
> -  [(set_attr "type" "mve_move")
> -])
> -
> -;;
> -;; [vcmpltq_n_f])
> -;;
> -(define_insn "mve_vcmpltq_n_f<mode>"
> -  [
> -   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> -	(unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w")
> -		    (match_operand:<V_elem> 2 "s_register_operand" "r")]
> -	 VCMPLTQ_N_F))
> -  ]
> -  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
> -  "vcmp.f%#<V_sz_elem>	lt, %q1, %2"
> -  [(set_attr "type" "mve_move")
> -])
> -
> -;;
> -;; [vcmpneq_f])
> -;;
> -(define_insn "mve_vcmpneq_f<mode>"
> -  [
> -   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> -	(unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w")
> -		    (match_operand:MVE_0 2 "s_register_operand" "w")]
> -	 VCMPNEQ_F))
> -  ]
> -  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
> -  "vcmp.f%#<V_sz_elem>	ne, %q1, %q2"
> -  [(set_attr "type" "mve_move")
> -])
> -
> -;;
> -;; [vcmpneq_n_f])
> -;;
> -(define_insn "mve_vcmpneq_n_f<mode>"
> -  [
> -   (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> -	(unspec:HI [(match_operand:MVE_0 1 "s_register_operand" "w")
> -		    (match_operand:<V_elem> 2 "s_register_operand" "r")]
> -	 VCMPNEQ_N_F))
> +	(MVE_FP_COMPARISONS:HI (match_operand:MVE_0 1
> "s_register_operand" "w")
> +			       (match_operand:<V_elem> 2
> "s_register_operand" "r")))
>    ]
>    "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
> -  "vcmp.f%#<V_sz_elem>	ne, %q1, %2"
> +  "vcmp.f%#<V_sz_elem>	<mve_cmp_op>, %q1, %2"
>    [(set_attr "type" "mve_move")
>  ])
> 
> diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
> index 4d47ab7..07ca53b 100644
> --- a/gcc/config/arm/unspecs.md
> +++ b/gcc/config/arm/unspecs.md
> @@ -710,18 +710,6 @@ (define_c_enum "unspec" [
>    VABDQ_M_U
>    VABDQ_F
>    VADDQ_N_F
> -  VCMPEQQ_F
> -  VCMPEQQ_N_F
> -  VCMPGEQ_F
> -  VCMPGEQ_N_F
> -  VCMPGTQ_F
> -  VCMPGTQ_N_F
> -  VCMPLEQ_F
> -  VCMPLEQ_N_F
> -  VCMPLTQ_F
> -  VCMPLTQ_N_F
> -  VCMPNEQ_F
> -  VCMPNEQ_N_F
>    VMAXNMAQ_F
>    VMAXNMAVQ_F
>    VMAXNMQ_F
> --
> 2.7.4


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 6/9] arm: Auto-vectorization for MVE: vcmp
  2021-05-05 14:08       ` Christophe Lyon
@ 2021-05-17  9:54         ` Christophe Lyon
  2021-05-17 10:35         ` Kyrylo Tkachov
  1 sibling, 0 replies; 35+ messages in thread
From: Christophe Lyon @ 2021-05-17  9:54 UTC (permalink / raw)
  To: gcc Patches

ping?

On Wed, 5 May 2021 at 16:08, Christophe Lyon <christophe.lyon@linaro.org> wrote:
>
> On Tue, 4 May 2021 at 15:41, Christophe Lyon <christophe.lyon@linaro.org> wrote:
> >
> > On Tue, 4 May 2021 at 13:29, Andre Vieira (lists)
> > <andre.simoesdiasvieira@arm.com> wrote:
> > >
> > > Hi Christophe,
> > >
> > > On 30/04/2021 15:09, Christophe Lyon via Gcc-patches wrote:
> > > > > Since MVE has a different set of vector comparison operators from
> > > > > Neon, we have to update the expansion to take into account the new
> > > > > ones, for instance 'NE', for which MVE does not require using 'EQ'
> > > > > with the inverted condition.
> > > >
> > > > Conversely, Neon supports comparisons with #0, MVE does not.
> > > >
> > > > For:
> > > > typedef long int vs32 __attribute__((vector_size(16)));
> > > > vs32 cmp_eq_vs32_reg (vs32 a, vs32 b) { return a == b; }
> > > >
> > > > we now generate:
> > > > cmp_eq_vs32_reg:
> > > >       vldr.64 d4, .L123       @ 8     [c=8 l=4]  *mve_movv4si/8
> > > >       vldr.64 d5, .L123+8
> > > >       vldr.64 d6, .L123+16    @ 9     [c=8 l=4]  *mve_movv4si/8
> > > >       vldr.64 d7, .L123+24
> > > >       vcmp.i32  eq, q0, q1    @ 7     [c=16 l=4]  mve_vcmpeqq_v4si
> > > >       vpsel q0, q3, q2        @ 15    [c=8 l=4]  mve_vpselq_sv4si
> > > >       bx      lr      @ 26    [c=8 l=4]  *thumb2_return
> > > > .L124:
> > > >       .align  3
> > > > .L123:
> > > >       .word   0
> > > >       .word   0
> > > >       .word   0
> > > >       .word   0
> > > >       .word   1
> > > >       .word   1
> > > >       .word   1
> > > >       .word   1
> > > >
> > > > For some reason emit_move_insn (zero, CONST0_RTX (cmp_mode)) produces
> > > > > a pair of vldr instead of vmov.i32 qX, #0
> > > I think ideally we would even want:
> > > vpte  eq, q0, q1
> > > vmovt.i32 q0, #0
> > > vmove.i32 q0, #1
> > >
> > > But we don't have a way to generate VPT blocks with multiple
> > > instructions yet unfortunately so I guess VPSEL will have to do for now.
> >
> > TBH,  I looked at what LLVM generates currently ;-)
> >
>
> Here is an updated version, which adds
> && (!<Is_float_mode> || flag_unsafe_math_optimizations)
> to vcond_mask_
>
> This condition was not present in the neon.md version I moved to vec-common.md,
> but since the VDQW iterator includes V2SF and V4SF, it should take
> floating-point flags into account.
>
> Christophe
>
> > >
> > > >
> > > > 2021-03-01  Christophe Lyon  <christophe.lyon@linaro.org>
> > > >
> > > >       gcc/
> > > >       * config/arm/arm-protos.h (arm_expand_vector_compare): Update
> > > >       prototype.
> > > >       * config/arm/arm.c (arm_expand_vector_compare): Add support for
> > > >       MVE.
> > > >       (arm_expand_vcond): Likewise.
> > > >       * config/arm/iterators.md (supf): Remove VCMPNEQ_S, VCMPEQQ_S,
> > > >       VCMPEQQ_N_S, VCMPNEQ_N_S.
> > > >       (VCMPNEQ, VCMPEQQ, VCMPEQQ_N, VCMPNEQ_N): Remove.
> > > >       * config/arm/mve.md (@mve_vcmp<mve_cmp_op>q_<mode>): Add '@' prefix.
> > > >       (@mve_vcmp<mve_cmp_op>q_f<mode>): Likewise.
> > > >       (@mve_vcmp<mve_cmp_op>q_n_f<mode>): Likewise.
> > > >       (@mve_vpselq_<supf><mode>): Likewise.
> > > > >       (@mve_vpselq_f<mode>): Likewise.
> > > > >       * config/arm/neon.md (vec_cmp<mode><v_cmp_result>): Enable for MVE
> > > >       and move to vec-common.md.
> > > >       (vec_cmpu<mode><mode>): Likewise.
> > > >       (vcond<mode><mode>): Likewise.
> > > >       (vcond<V_cvtto><mode>): Likewise.
> > > >       (vcondu<mode><v_cmp_result>): Likewise.
> > > >       (vcond_mask_<mode><v_cmp_result>): Likewise.
> > > >       * config/arm/unspecs.md (VCMPNEQ_U, VCMPNEQ_S, VCMPEQQ_S)
> > > > >       (VCMPEQQ_N_S, VCMPNEQ_N_S, VCMPEQQ_U, VCMPEQQ_N_U, VCMPNEQ_N_U)
> > > >       (VCMPGEQ_N_S, VCMPGEQ_S, VCMPGTQ_N_S, VCMPGTQ_S, VCMPLEQ_N_S)
> > > >       (VCMPLEQ_S, VCMPLTQ_N_S, VCMPLTQ_S, VCMPCSQ_N_U, VCMPCSQ_U)
> > > >       (VCMPHIQ_N_U, VCMPHIQ_U): Remove.
> > > > >       * config/arm/vec-common.md (vec_cmp<mode><v_cmp_result>): Moved
> > > >       from neon.md.
> > > >       (vec_cmpu<mode><mode>): Likewise.
> > > >       (vcond<mode><mode>): Likewise.
> > > >       (vcond<V_cvtto><mode>): Likewise.
> > > >       (vcondu<mode><v_cmp_result>): Likewise.
> > > >       (vcond_mask_<mode><v_cmp_result>): Likewise.
> > > >
> > > >       gcc/testsuite
> > > >       * gcc.target/arm/simd/mve-compare-1.c: New test with GCC vectors.
> > > >       * gcc.target/arm/simd/mve-compare-2.c: New test with GCC vectors.
> > > >       * gcc.target/arm/simd/mve-compare-scalar-1.c: New test with GCC
> > > >       vectors.
> > > >       * gcc.target/arm/simd/mve-vcmp-f32.c: New test for
> > > >       auto-vectorization.
> > > >       * gcc.target/arm/simd/mve-vcmp.c: New test for auto-vectorization.
> > > >
> > > > add gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c
> > > > ---
> > > >   gcc/config/arm/arm-protos.h                        |   2 +-
> > > >   gcc/config/arm/arm.c                               | 211 ++++++++++++++++-----
> > > >   gcc/config/arm/iterators.md                        |   9 +-
> > > >   gcc/config/arm/mve.md                              |  10 +-
> > > >   gcc/config/arm/neon.md                             |  87 ---------
> > > >   gcc/config/arm/unspecs.md                          |  20 --
> > > >   gcc/config/arm/vec-common.md                       | 107 +++++++++++
> > > >   gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c  |  80 ++++++++
> > > >   gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c  |  38 ++++
> > > >   .../gcc.target/arm/simd/mve-compare-scalar-1.c     |  69 +++++++
> > > >   gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c   |  30 +++
> > > >   gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c       |  50 +++++
> > > >   12 files changed, 547 insertions(+), 166 deletions(-)
> > > >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c
> > > >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c
> > > >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c
> > > >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c
> > > >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c
> > > >
> > > > diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
> > > > index 2521541..ffccaa7 100644
> > > > --- a/gcc/config/arm/arm-protos.h
> > > > +++ b/gcc/config/arm/arm-protos.h
> > > > @@ -373,7 +373,7 @@ extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
> > > >   extern bool arm_fusion_enabled_p (tune_params::fuse_ops);
> > > >   extern bool arm_valid_symbolic_address_p (rtx);
> > > >   extern bool arm_validize_comparison (rtx *, rtx *, rtx *);
> > > > -extern bool arm_expand_vector_compare (rtx, rtx_code, rtx, rtx, bool);
> > > > +extern bool arm_expand_vector_compare (rtx, rtx_code, rtx, rtx, bool, bool);
> > > >   #endif /* RTX_CODE */
> > > >
> > > >   extern bool arm_gen_setmem (rtx *);
> > > > diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
> > > > index 0371d98..80e28ef 100644
> > > > --- a/gcc/config/arm/arm.c
> > > > +++ b/gcc/config/arm/arm.c
> > > > @@ -30933,66 +30933,114 @@ arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
> > > >      and return true if TARGET contains the inverse.  If !CAN_INVERT,
> > > >      always store the result in TARGET, never its inverse.
> > > >
> > > > +   If VCOND_MVE, do not emit the vpsel instruction here, let arm_expand_vcond do
> > > > > +   it with the right destination type to avoid emitting two vpsel, one here and
> > > > +   one in arm_expand_vcond.
> > > > +
> > > >      Note that the handling of floating-point comparisons is not
> > > >      IEEE compliant.  */
> > > >
> > > >   bool
> > > >   arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
> > > > -                        bool can_invert)
> > > > +                        bool can_invert, bool vcond_mve)
> > > >   {
> > > >     machine_mode cmp_result_mode = GET_MODE (target);
> > > >     machine_mode cmp_mode = GET_MODE (op0);
> > > >
> > > >     bool inverted;
> > > > -  switch (code)
> > > > -    {
> > > > -    /* For these we need to compute the inverse of the requested
> > > > -       comparison.  */
> > > > -    case UNORDERED:
> > > > -    case UNLT:
> > > > -    case UNLE:
> > > > -    case UNGT:
> > > > -    case UNGE:
> > > > -    case UNEQ:
> > > > -    case NE:
> > > > -      code = reverse_condition_maybe_unordered (code);
> > > > -      if (!can_invert)
> > > > -     {
> > > > -       /* Recursively emit the inverted comparison into a temporary
> > > > -          and then store its inverse in TARGET.  This avoids reusing
> > > > -          TARGET (which for integer NE could be one of the inputs).  */
> > > > -       rtx tmp = gen_reg_rtx (cmp_result_mode);
> > > > -       if (arm_expand_vector_compare (tmp, code, op0, op1, true))
> > > > -         gcc_unreachable ();
> > > > -       emit_insn (gen_rtx_SET (target, gen_rtx_NOT (cmp_result_mode, tmp)));
> > > > -       return false;
> > > > -     }
> > > > -      inverted = true;
> > > > -      break;
> > > >
> > > > -    default:
> > > > +  /* MVE supports more comparisons than Neon.  */
> > > > +  if (TARGET_HAVE_MVE)
> > > >         inverted = false;
> > > > -      break;
> > > > -    }
> > > > +  else
> > > > +    switch (code)
> > > > +      {
> > > > +     /* For these we need to compute the inverse of the requested
> > > > +        comparison.  */
> > > > +      case UNORDERED:
> > > > +      case UNLT:
> > > > +      case UNLE:
> > > > +      case UNGT:
> > > > +      case UNGE:
> > > > +      case UNEQ:
> > > > +      case NE:
> > > > +     code = reverse_condition_maybe_unordered (code);
> > > > +     if (!can_invert)
> > > > +       {
> > > > +         /* Recursively emit the inverted comparison into a temporary
> > > > +            and then store its inverse in TARGET.  This avoids reusing
> > > > +            TARGET (which for integer NE could be one of the inputs).  */
> > > > +         rtx tmp = gen_reg_rtx (cmp_result_mode);
> > > > +         if (arm_expand_vector_compare (tmp, code, op0, op1, true, vcond_mve))
> > > > +           gcc_unreachable ();
> > > > +         emit_insn (gen_rtx_SET (target, gen_rtx_NOT (cmp_result_mode, tmp)));
> > > > +         return false;
> > > > +       }
> > > > +     inverted = true;
> > > > +     break;
> > > > +
> > > > +      default:
> > > > +     inverted = false;
> > > > +     break;
> > > > +      }
> > > >
> > > >     switch (code)
> > > >       {
> > > > -    /* These are natively supported for zero comparisons, but otherwise
> > > > -       require the operands to be swapped.  */
> > > > +    /* These are natively supported by Neon for zero comparisons, but otherwise
> > > > +       require the operands to be swapped. For MVE, we can only compare
> > > > +       registers.  */
> > > >       case LE:
> > > >       case LT:
> > > > -      if (op1 != CONST0_RTX (cmp_mode))
> > > > -     {
> > > > -       code = swap_condition (code);
> > > > -       std::swap (op0, op1);
> > > > -     }
> > > > +      if (!TARGET_HAVE_MVE)
> > > > +     if (op1 != CONST0_RTX (cmp_mode))
> > > > +       {
> > > > +         code = swap_condition (code);
> > > > +         std::swap (op0, op1);
> > > > +       }
> > > >         /* Fall through.  */
> > > >
> > > > -    /* These are natively supported for both register and zero operands.  */
> > > > +    /* These are natively supported by Neon for both register and zero
> > > > +       operands. MVE supports registers only.  */
> > > >       case EQ:
> > > >       case GE:
> > > >       case GT:
> > > > -      emit_insn (gen_neon_vc (code, cmp_mode, target, op0, op1));
> > > > +    case NE:
> > > > +      if (TARGET_HAVE_MVE) {
> > > > +     rtx vpr_p0;
> > > > +     if (vcond_mve)
> > > > +       vpr_p0 = target;
> > > > +     else
> > > > +       vpr_p0 = gen_reg_rtx (HImode);
> > > > +
> > > > +     switch (cmp_mode)
> > > > +       {
> > > > +       case E_V16QImode:
> > > > +       case E_V8HImode:
> > > > +       case E_V4SImode:
> > > > +         emit_insn (gen_mve_vcmpq (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1)));
> > > > +         break;
> > > > +       case E_V8HFmode:
> > > > +       case E_V4SFmode:
> > > > +         if (TARGET_HAVE_MVE_FLOAT)
> > > > +           emit_insn (gen_mve_vcmpq_f (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1)));
> > > > +         else
> > > > +           gcc_unreachable ();
> > > > +         break;
> > > > +       default:
> > > > +         gcc_unreachable ();
> > > > +       }
> > > > +
> > > > +     /* If we are not expanding a vcond, build the result here.  */
> > > > +     if (!vcond_mve) {
> > > > +       rtx zero = gen_reg_rtx (cmp_result_mode);
> > > > +       rtx one = gen_reg_rtx (cmp_result_mode);
> > > > +       emit_move_insn (zero, CONST0_RTX (cmp_result_mode));
> > > > +       emit_move_insn (one, CONST1_RTX (cmp_result_mode));
> > > > +       emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0));
> > > > +     }
> > > > +      }
> > > > +      else
> > > > +     emit_insn (gen_neon_vc (code, cmp_mode, target, op0, op1));
> > > >         return inverted;
> > > >
> > > >       /* These are natively supported for register operands only.
> > > > @@ -31000,16 +31048,50 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
> > > >          or canonicalized by target-independent code.  */
> > > >       case GEU:
> > > >       case GTU:
> > > > -      emit_insn (gen_neon_vc (code, cmp_mode, target,
> > > > -                           op0, force_reg (cmp_mode, op1)));
> > > > +      if (TARGET_HAVE_MVE) {
> > > > +     rtx vpr_p0;
> > > > +     if (vcond_mve)
> > > > +       vpr_p0 = target;
> > > > +     else
> > > > +       vpr_p0 = gen_reg_rtx (HImode);
> > > > +
> > > > +     emit_insn (gen_mve_vcmpq (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1)));
> > > > +     if (!vcond_mve) {
> > > > +       rtx zero = gen_reg_rtx (cmp_result_mode);
> > > > +       rtx one = gen_reg_rtx (cmp_result_mode);
> > > > +       emit_move_insn (zero, CONST0_RTX (cmp_result_mode));
> > > > +       emit_move_insn (one, CONST1_RTX (cmp_result_mode));
> > > > +       emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0));
> > > > +     }
> > > > +      }
> > > > +       else
> > > > +     emit_insn (gen_neon_vc (code, cmp_mode, target,
> > > > +                             op0, force_reg (cmp_mode, op1)));
> > > >         return inverted;
> > > >
> > > >       /* These require the operands to be swapped and likewise do not
> > > >          support comparisons with zero.  */
> > > >       case LEU:
> > > >       case LTU:
> > > > -      emit_insn (gen_neon_vc (swap_condition (code), cmp_mode,
> > > > -                           target, force_reg (cmp_mode, op1), op0));
> > > > +      if (TARGET_HAVE_MVE) {
> > > > +     rtx vpr_p0;
> > > > +     if (vcond_mve)
> > > > +       vpr_p0 = target;
> > > > +     else
> > > > +       vpr_p0 = gen_reg_rtx (HImode);
> > > > +
> > > > +     emit_insn (gen_mve_vcmpq (swap_condition (code), cmp_mode, vpr_p0, force_reg (cmp_mode, op1), op0));
> > > > +     if (!vcond_mve) {
> > > > +       rtx zero = gen_reg_rtx (cmp_result_mode);
> > > > +       rtx one = gen_reg_rtx (cmp_result_mode);
> > > > +       emit_move_insn (zero, CONST0_RTX (cmp_result_mode));
> > > > +       emit_move_insn (one, CONST1_RTX (cmp_result_mode));
> > > > +       emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0));
> > > > +     }
> > > > +      }
> > > > +      else
> > > > +     emit_insn (gen_neon_vc (swap_condition (code), cmp_mode,
> > > > +                             target, force_reg (cmp_mode, op1), op0));
> > > >         return inverted;
> > > >
> > > >       /* These need a combination of two comparisons.  */
> > > > @@ -31021,8 +31103,8 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
> > > >       rtx gt_res = gen_reg_rtx (cmp_result_mode);
> > > >       rtx alt_res = gen_reg_rtx (cmp_result_mode);
> > > >       rtx_code alt_code = (code == LTGT ? LT : LE);
> > > > -     if (arm_expand_vector_compare (gt_res, GT, op0, op1, true)
> > > > -         || arm_expand_vector_compare (alt_res, alt_code, op0, op1, true))
> > > > +     if (arm_expand_vector_compare (gt_res, GT, op0, op1, true, vcond_mve)
> > > > +         || arm_expand_vector_compare (alt_res, alt_code, op0, op1, true, vcond_mve))
> > > >         gcc_unreachable ();
> > > >       emit_insn (gen_rtx_SET (target, gen_rtx_IOR (cmp_result_mode,
> > > >                                                    gt_res, alt_res)));
> > > > @@ -31040,13 +31122,50 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
> > > >   void
> > > >   arm_expand_vcond (rtx *operands, machine_mode cmp_result_mode)
> > > >   {
> > > > -  rtx mask = gen_reg_rtx (cmp_result_mode);
> > > > +  /* When expanding for MVE, we do not want to emit a (useless) vpsel in
> > > > +     arm_expand_vector_compare, and another one here.  */
> > > > +  bool vcond_mve=false;
> > > > +  rtx mask;
> > > > +
> > > > +  if (TARGET_HAVE_MVE)
> > > > +    {
> > > > +      vcond_mve=true;
> > > > +      mask = gen_reg_rtx (HImode);
> > > > +    }
> > > > +  else
> > > > +    mask = gen_reg_rtx (cmp_result_mode);
> > > > +
> > > >     bool inverted = arm_expand_vector_compare (mask, GET_CODE (operands[3]),
> > > > -                                          operands[4], operands[5], true);
> > > > +                                          operands[4], operands[5], true, vcond_mve);
> > > >     if (inverted)
> > > >       std::swap (operands[1], operands[2]);
> > > > +  if (TARGET_NEON)
> > > >     emit_insn (gen_neon_vbsl (GET_MODE (operands[0]), operands[0],
> > > >                           mask, operands[1], operands[2]));
> > > > +  else
> > > > +    {
> > > > +      machine_mode cmp_mode = GET_MODE (operands[4]);
> > > > +      rtx vpr_p0 = mask;
> > > > +      rtx zero = gen_reg_rtx (cmp_mode);
> > > > +      rtx one = gen_reg_rtx (cmp_mode);
> > > > +      emit_move_insn (zero, CONST0_RTX (cmp_mode));
> > > > +      emit_move_insn (one, CONST1_RTX (cmp_mode));
> > > > +      switch (cmp_mode)
> > > > +     {
> > > > +     case E_V16QImode:
> > > > +     case E_V8HImode:
> > > > +     case E_V4SImode:
> > > > +       emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, operands[0], one, zero, vpr_p0));
> > > > +       break;
> > > > +     case E_V8HFmode:
> > > > +     case E_V4SFmode:
> > > > +       if (TARGET_HAVE_MVE_FLOAT)
> > > > +         emit_insn (gen_mve_vpselq_f (cmp_mode, operands[0], one, zero, vpr_p0));
> > > > +       break;
> > > > +     default:
> > > > +       gcc_unreachable ();
> > > > +     }
> > > > +    }
> > > >   }
> > > >
> > > >   #define MAX_VECT_LEN 16
> > > > diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
> > > > index 95df8bd..a128465 100644
> > > > --- a/gcc/config/arm/iterators.md
> > > > +++ b/gcc/config/arm/iterators.md
> > > > @@ -1288,12 +1288,11 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U "u") (VREV16Q_S "s")
> > > >                      (VCREATEQ_U "u") (VCREATEQ_S "s") (VSHRQ_N_S "s")
> > > >                      (VSHRQ_N_U "u") (VCVTQ_N_FROM_F_S "s") (VSHLQ_U "u")
> > > >                      (VCVTQ_N_FROM_F_U "u") (VADDLVQ_P_S "s") (VSHLQ_S "s")
> > > > -                    (VADDLVQ_P_U "u") (VCMPNEQ_S "s")
> > > > +                    (VADDLVQ_P_U "u")
> > > >                      (VABDQ_M_S "s") (VABDQ_M_U "u") (VABDQ_S "s")
> > > >                      (VABDQ_U "u") (VADDQ_N_S "s") (VADDQ_N_U "u")
> > > >                      (VADDVQ_P_S "s") (VADDVQ_P_U "u") (VBRSRQ_N_S "s")
> > > > -                    (VBRSRQ_N_U "u") (VCMPEQQ_S "s")
> > > > -                    (VCMPEQQ_N_S "s") (VCMPNEQ_N_S "s")
> > > > +                    (VBRSRQ_N_U "u")
> > > >                      (VHADDQ_N_S "s") (VHADDQ_N_U "u") (VHADDQ_S "s")
> > > >                      (VHADDQ_U "u") (VHSUBQ_N_S "s")  (VHSUBQ_N_U "u")
> > > >                      (VHSUBQ_S "s") (VMAXQ_S "s") (VMAXQ_U "u") (VHSUBQ_U "u")
> > > > @@ -1549,16 +1548,12 @@ (define_int_iterator VCREATEQ [VCREATEQ_U VCREATEQ_S])
> > > >   (define_int_iterator VSHRQ_N [VSHRQ_N_S VSHRQ_N_U])
> > > >   (define_int_iterator VCVTQ_N_FROM_F [VCVTQ_N_FROM_F_S VCVTQ_N_FROM_F_U])
> > > >   (define_int_iterator VADDLVQ_P [VADDLVQ_P_S VADDLVQ_P_U])
> > > > -(define_int_iterator VCMPNEQ [VCMPNEQ_S])
> > > >   (define_int_iterator VSHLQ [VSHLQ_S VSHLQ_U])
> > > >   (define_int_iterator VABDQ [VABDQ_S VABDQ_U])
> > > >   (define_int_iterator VADDQ_N [VADDQ_N_S VADDQ_N_U])
> > > >   (define_int_iterator VADDVAQ [VADDVAQ_S VADDVAQ_U])
> > > >   (define_int_iterator VADDVQ_P [VADDVQ_P_U VADDVQ_P_S])
> > > >   (define_int_iterator VBRSRQ_N [VBRSRQ_N_U VBRSRQ_N_S])
> > > > -(define_int_iterator VCMPEQQ [VCMPEQQ_S])
> > > > -(define_int_iterator VCMPEQQ_N [VCMPEQQ_N_S])
> > > > -(define_int_iterator VCMPNEQ_N [VCMPNEQ_N_S])
> > > >   (define_int_iterator VHADDQ [VHADDQ_S VHADDQ_U])
> > > >   (define_int_iterator VHADDQ_N [VHADDQ_N_U VHADDQ_N_S])
> > > >   (define_int_iterator VHSUBQ [VHSUBQ_S VHSUBQ_U])
> > > > diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> > > > index 7c846a4..97f0a87 100644
> > > > --- a/gcc/config/arm/mve.md
> > > > +++ b/gcc/config/arm/mve.md
> > > > @@ -838,7 +838,7 @@ (define_insn "mve_vaddlvq_p_<supf>v4si"
> > > >   ;;
> > > >   ;; [vcmpneq_, vcmpcsq_, vcmpeqq_, vcmpgeq_, vcmpgtq_, vcmphiq_, vcmpleq_, vcmpltq_])
> > > >   ;;
> > > > -(define_insn "mve_vcmp<mve_cmp_op>q_<mode>"
> > > > +(define_insn "@mve_vcmp<mve_cmp_op>q_<mode>"
> > > >     [
> > > >      (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> > > >       (MVE_COMPARISONS:HI (match_operand:MVE_2 1 "s_register_operand" "w")
> > > > @@ -1928,7 +1928,7 @@ (define_insn "mve_vcaddq<mve_rot><mode>"
> > > >   ;;
> > > >   ;; [vcmpeqq_f, vcmpgeq_f, vcmpgtq_f, vcmpleq_f, vcmpltq_f, vcmpneq_f])
> > > >   ;;
> > > > -(define_insn "mve_vcmp<mve_cmp_op>q_f<mode>"
> > > > +(define_insn "@mve_vcmp<mve_cmp_op>q_f<mode>"
> > > >     [
> > > >      (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> > > >       (MVE_FP_COMPARISONS:HI (match_operand:MVE_0 1 "s_register_operand" "w")
> > > > @@ -1942,7 +1942,7 @@ (define_insn "mve_vcmp<mve_cmp_op>q_f<mode>"
> > > >   ;;
> > > >   ;; [vcmpeqq_n_f, vcmpgeq_n_f, vcmpgtq_n_f, vcmpleq_n_f, vcmpltq_n_f, vcmpneq_n_f])
> > > >   ;;
> > > > -(define_insn "mve_vcmp<mve_cmp_op>q_n_f<mode>"
> > > > +(define_insn "@mve_vcmp<mve_cmp_op>q_n_f<mode>"
> > > >     [
> > > >      (set (match_operand:HI 0 "vpr_register_operand" "=Up")
> > > >       (MVE_FP_COMPARISONS:HI (match_operand:MVE_0 1 "s_register_operand" "w")
> > > > @@ -3307,7 +3307,7 @@ (define_insn "mve_vnegq_m_s<mode>"
> > > >   ;;
> > > >   ;; [vpselq_u, vpselq_s])
> > > >   ;;
> > > > -(define_insn "mve_vpselq_<supf><mode>"
> > > > +(define_insn "@mve_vpselq_<supf><mode>"
> > > >     [
> > > >      (set (match_operand:MVE_1 0 "s_register_operand" "=w")
> > > >       (unspec:MVE_1 [(match_operand:MVE_1 1 "s_register_operand" "w")
> > > > @@ -4402,7 +4402,7 @@ (define_insn "mve_vorrq_m_n_<supf><mode>"
> > > >   ;;
> > > >   ;; [vpselq_f])
> > > >   ;;
> > > > -(define_insn "mve_vpselq_f<mode>"
> > > > +(define_insn "@mve_vpselq_f<mode>"
> > > >     [
> > > >      (set (match_operand:MVE_0 0 "s_register_operand" "=w")
> > > >       (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w")
> > > > diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> > > > index fec2cc9..6660846 100644
> > > > --- a/gcc/config/arm/neon.md
> > > > +++ b/gcc/config/arm/neon.md
> > > > @@ -1416,93 +1416,6 @@ (define_insn "*us_sub<mode>_neon"
> > > >     [(set_attr "type" "neon_qsub<q>")]
> > > >   )
> > > >
> > > > -(define_expand "vec_cmp<mode><v_cmp_result>"
> > > > -  [(set (match_operand:<V_cmp_result> 0 "s_register_operand")
> > > > -     (match_operator:<V_cmp_result> 1 "comparison_operator"
> > > > -       [(match_operand:VDQW 2 "s_register_operand")
> > > > -        (match_operand:VDQW 3 "reg_or_zero_operand")]))]
> > > > -  "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > > > -{
> > > > -  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
> > > > -                          operands[2], operands[3], false);
> > > > -  DONE;
> > > > -})
> > > > -
> > > > -(define_expand "vec_cmpu<mode><mode>"
> > > > -  [(set (match_operand:VDQIW 0 "s_register_operand")
> > > > -     (match_operator:VDQIW 1 "comparison_operator"
> > > > -       [(match_operand:VDQIW 2 "s_register_operand")
> > > > -        (match_operand:VDQIW 3 "reg_or_zero_operand")]))]
> > > > -  "TARGET_NEON"
> > > > -{
> > > > -  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
> > > > -                          operands[2], operands[3], false);
> > > > -  DONE;
> > > > -})
> > > > -
> > > > -;; Conditional instructions.  These are comparisons with conditional moves for
> > > > -;; vectors.  They perform the assignment:
> > > > -;;
> > > > -;;     Vop0 = (Vop4 <op3> Vop5) ? Vop1 : Vop2;
> > > > -;;
> > > > -;; where op3 is <, <=, ==, !=, >= or >.  Operations are performed
> > > > -;; element-wise.
> > > > -
> > > > -(define_expand "vcond<mode><mode>"
> > > > -  [(set (match_operand:VDQW 0 "s_register_operand")
> > > > -     (if_then_else:VDQW
> > > > -       (match_operator 3 "comparison_operator"
> > > > -         [(match_operand:VDQW 4 "s_register_operand")
> > > > -          (match_operand:VDQW 5 "reg_or_zero_operand")])
> > > > -       (match_operand:VDQW 1 "s_register_operand")
> > > > -       (match_operand:VDQW 2 "s_register_operand")))]
> > > > -  "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > > > -{
> > > > -  arm_expand_vcond (operands, <V_cmp_result>mode);
> > > > -  DONE;
> > > > -})
> > > > -
> > > > -(define_expand "vcond<V_cvtto><mode>"
> > > > -  [(set (match_operand:<V_CVTTO> 0 "s_register_operand")
> > > > -     (if_then_else:<V_CVTTO>
> > > > -       (match_operator 3 "comparison_operator"
> > > > -         [(match_operand:V32 4 "s_register_operand")
> > > > -          (match_operand:V32 5 "reg_or_zero_operand")])
> > > > -       (match_operand:<V_CVTTO> 1 "s_register_operand")
> > > > -       (match_operand:<V_CVTTO> 2 "s_register_operand")))]
> > > > -  "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > > > -{
> > > > -  arm_expand_vcond (operands, <V_cmp_result>mode);
> > > > -  DONE;
> > > > -})
> > > > -
> > > > -(define_expand "vcondu<mode><v_cmp_result>"
> > > > -  [(set (match_operand:VDQW 0 "s_register_operand")
> > > > -     (if_then_else:VDQW
> > > > -       (match_operator 3 "arm_comparison_operator"
> > > > -         [(match_operand:<V_cmp_result> 4 "s_register_operand")
> > > > -          (match_operand:<V_cmp_result> 5 "reg_or_zero_operand")])
> > > > -       (match_operand:VDQW 1 "s_register_operand")
> > > > -       (match_operand:VDQW 2 "s_register_operand")))]
> > > > -  "TARGET_NEON"
> > > > -{
> > > > -  arm_expand_vcond (operands, <V_cmp_result>mode);
> > > > -  DONE;
> > > > -})
> > > > -
> > > > -(define_expand "vcond_mask_<mode><v_cmp_result>"
> > > > -  [(set (match_operand:VDQW 0 "s_register_operand")
> > > > -     (if_then_else:VDQW
> > > > -       (match_operand:<V_cmp_result> 3 "s_register_operand")
> > > > -       (match_operand:VDQW 1 "s_register_operand")
> > > > -       (match_operand:VDQW 2 "s_register_operand")))]
> > > > -  "TARGET_NEON"
> > > > -{
> > > > -  emit_insn (gen_neon_vbsl<mode> (operands[0], operands[3], operands[1],
> > > > -                               operands[2]));
> > > > -  DONE;
> > > > -})
> > > > -
> > > >   ;; Patterns for builtins.
> > > >
> > > >   ; good for plain vadd, vaddq.
> > > > diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
> > > > index 07ca53b..0778db1 100644
> > > > --- a/gcc/config/arm/unspecs.md
> > > > +++ b/gcc/config/arm/unspecs.md
> > > > @@ -596,8 +596,6 @@ (define_c_enum "unspec" [
> > > >     VCVTQ_N_FROM_F_U
> > > >     VADDLVQ_P_S
> > > >     VADDLVQ_P_U
> > > > -  VCMPNEQ_U
> > > > -  VCMPNEQ_S
> > > >     VSHLQ_S
> > > >     VSHLQ_U
> > > >     VABDQ_S
> > > > @@ -605,9 +603,6 @@ (define_c_enum "unspec" [
> > > >     VADDVAQ_S
> > > >     VADDVQ_P_S
> > > >     VBRSRQ_N_S
> > > > -  VCMPEQQ_S
> > > > -  VCMPEQQ_N_S
> > > > -  VCMPNEQ_N_S
> > > >     VHADDQ_S
> > > >     VHADDQ_N_S
> > > >     VHSUBQ_S
> > > > @@ -645,9 +640,6 @@ (define_c_enum "unspec" [
> > > >     VADDVAQ_U
> > > >     VADDVQ_P_U
> > > >     VBRSRQ_N_U
> > > > -  VCMPEQQ_U
> > > > -  VCMPEQQ_N_U
> > > > -  VCMPNEQ_N_U
> > > >     VHADDQ_U
> > > >     VHADDQ_N_U
> > > >     VHSUBQ_U
> > > > @@ -680,14 +672,6 @@ (define_c_enum "unspec" [
> > > >     VSHLQ_R_U
> > > >     VSUBQ_U
> > > >     VSUBQ_N_U
> > > > -  VCMPGEQ_N_S
> > > > -  VCMPGEQ_S
> > > > -  VCMPGTQ_N_S
> > > > -  VCMPGTQ_S
> > > > -  VCMPLEQ_N_S
> > > > -  VCMPLEQ_S
> > > > -  VCMPLTQ_N_S
> > > > -  VCMPLTQ_S
> > > >     VHCADDQ_ROT270_S
> > > >     VHCADDQ_ROT90_S
> > > >     VMAXAQ_S
> > > > @@ -702,10 +686,6 @@ (define_c_enum "unspec" [
> > > >     VQRDMULHQ_N_S
> > > >     VQRDMULHQ_S
> > > >     VQSHLUQ_N_S
> > > > -  VCMPCSQ_N_U
> > > > -  VCMPCSQ_U
> > > > -  VCMPHIQ_N_U
> > > > -  VCMPHIQ_U
> > > >     VABDQ_M_S
> > > >     VABDQ_M_U
> > > >     VABDQ_F
> > > > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> > > > index 0b2b3b1..034b48b 100644
> > > > --- a/gcc/config/arm/vec-common.md
> > > > +++ b/gcc/config/arm/vec-common.md
> > > > @@ -362,3 +362,110 @@ (define_expand "vlshr<mode>3"
> > > >         DONE;
> > > >       }
> > > >   })
> > > > +
> > > > +(define_expand "vec_cmp<mode><v_cmp_result>"
> > > > +  [(set (match_operand:<V_cmp_result> 0 "s_register_operand")
> > > > +     (match_operator:<V_cmp_result> 1 "comparison_operator"
> > > > +       [(match_operand:VDQW 2 "s_register_operand")
> > > > +        (match_operand:VDQW 3 "reg_or_zero_operand")]))]
> > > > +  "ARM_HAVE_<MODE>_ARITH
> > > > +   && !TARGET_REALLY_IWMMXT
> > > > +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > > > +{
> > > > +  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
> > > > +                          operands[2], operands[3], false, false);
> > > > +  DONE;
> > > > +})
> > > > +
> > > > +(define_expand "vec_cmpu<mode><mode>"
> > > > +  [(set (match_operand:VDQIW 0 "s_register_operand")
> > > > +     (match_operator:VDQIW 1 "comparison_operator"
> > > > +       [(match_operand:VDQIW 2 "s_register_operand")
> > > > +        (match_operand:VDQIW 3 "reg_or_zero_operand")]))]
> > > > +  "ARM_HAVE_<MODE>_ARITH
> > > > +   && !TARGET_REALLY_IWMMXT"
> > > > +{
> > > > +  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
> > > > +                          operands[2], operands[3], false, false);
> > > > +  DONE;
> > > > +})
> > > > +
> > > > +;; Conditional instructions.  These are comparisons with conditional moves for
> > > > +;; vectors.  They perform the assignment:
> > > > +;;
> > > > +;;     Vop0 = (Vop4 <op3> Vop5) ? Vop1 : Vop2;
> > > > +;;
> > > > +;; where op3 is <, <=, ==, !=, >= or >.  Operations are performed
> > > > +;; element-wise.
> > > > +
> > > > +(define_expand "vcond<mode><mode>"
> > > > +  [(set (match_operand:VDQW 0 "s_register_operand")
> > > > +     (if_then_else:VDQW
> > > > +       (match_operator 3 "comparison_operator"
> > > > +         [(match_operand:VDQW 4 "s_register_operand")
> > > > +          (match_operand:VDQW 5 "reg_or_zero_operand")])
> > > > +       (match_operand:VDQW 1 "s_register_operand")
> > > > +       (match_operand:VDQW 2 "s_register_operand")))]
> > > > +  "ARM_HAVE_<MODE>_ARITH
> > > > +   && !TARGET_REALLY_IWMMXT
> > > > +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > > > +{
> > > > +  arm_expand_vcond (operands, <V_cmp_result>mode);
> > > > +  DONE;
> > > > +})
> > > > +
> > > > +(define_expand "vcond<V_cvtto><mode>"
> > > > +  [(set (match_operand:<V_CVTTO> 0 "s_register_operand")
> > > > +     (if_then_else:<V_CVTTO>
> > > > +       (match_operator 3 "comparison_operator"
> > > > +         [(match_operand:V32 4 "s_register_operand")
> > > > +          (match_operand:V32 5 "reg_or_zero_operand")])
> > > > +       (match_operand:<V_CVTTO> 1 "s_register_operand")
> > > > +       (match_operand:<V_CVTTO> 2 "s_register_operand")))]
> > > > +  "ARM_HAVE_<MODE>_ARITH
> > > > +   && !TARGET_REALLY_IWMMXT
> > > > +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > > > +{
> > > > +  arm_expand_vcond (operands, <V_cmp_result>mode);
> > > > +  DONE;
> > > > +})
> > > > +
> > > > +(define_expand "vcondu<mode><v_cmp_result>"
> > > > +  [(set (match_operand:VDQW 0 "s_register_operand")
> > > > +     (if_then_else:VDQW
> > > > +       (match_operator 3 "arm_comparison_operator"
> > > > +         [(match_operand:<V_cmp_result> 4 "s_register_operand")
> > > > +          (match_operand:<V_cmp_result> 5 "reg_or_zero_operand")])
> > > > +       (match_operand:VDQW 1 "s_register_operand")
> > > > +       (match_operand:VDQW 2 "s_register_operand")))]
> > > > +  "ARM_HAVE_<MODE>_ARITH
> > > > +   && !TARGET_REALLY_IWMMXT"
> > > > +{
> > > > +  arm_expand_vcond (operands, <V_cmp_result>mode);
> > > > +  DONE;
> > > > +})
> > > > +
> > > > +(define_expand "vcond_mask_<mode><v_cmp_result>"
> > > > +  [(set (match_operand:VDQW 0 "s_register_operand")
> > > > +        (if_then_else:VDQW
> > > > +          (match_operand:<V_cmp_result> 3 "s_register_operand")
> > > > +          (match_operand:VDQW 1 "s_register_operand")
> > > > +          (match_operand:VDQW 2 "s_register_operand")))]
> > > > +  "ARM_HAVE_<MODE>_ARITH
> > > > +   && !TARGET_REALLY_IWMMXT"
> > > > +{
> > > > +  if (TARGET_NEON)
> > > > +    {
> > > > +      emit_insn (gen_neon_vbsl (<MODE>mode, operands[0], operands[3],
> > > > +                                operands[1], operands[2]));
> > > > +    }
> > > > +  else if (TARGET_HAVE_MVE)
> > > > +    {
> > > > +      emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0],
> > > > +                                 operands[1], operands[2], operands[3]));
> > > > +    }
> > > > +  else
> > > > +    gcc_unreachable ();
> > > > +
> > > > +  DONE;
> > > > +})
> > > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c
> > > > new file mode 100644
> > > > index 0000000..029c931
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c
> > > > @@ -0,0 +1,80 @@
> > > > +/* { dg-do assemble } */
> > > > +/* { dg-require-effective-target arm_v8_1m_mve_ok } */
> > > > +/* { dg-add-options arm_v8_1m_mve } */
> > > > +/* { dg-additional-options "-O3" } */
> > > > +
> > > > +/* Integer tests.  */
> > > > +
> > > > +#define COMPARE_REG(NAME, OP, TYPE) \
> > > > +  TYPE \
> > > > +  cmp_##NAME##_##TYPE##_reg (TYPE a, TYPE b) \
> > > > +  { \
> > > > +    return a OP b; \
> > > > +  }
> > > > +
> > > > +#define COMPARE_REG_AND_ZERO(NAME, OP, TYPE) \
> > > > +  COMPARE_REG (NAME, OP, TYPE) \
> > > > +  \
> > > > +  TYPE \
> > > > +  cmp_##NAME##_##TYPE##_zero (TYPE a) \
> > > > +  { \
> > > > +    return a OP (TYPE) {}; \
> > > > +  }
> > > > +
> > > > +#define COMPARE_TYPE(TYPE, COMPARE_ORDERED) \
> > > > +  COMPARE_REG_AND_ZERO (eq, ==, TYPE) \
> > > > +  COMPARE_REG_AND_ZERO (ne, !=, TYPE) \
> > > > +  COMPARE_ORDERED (lt, <, TYPE) \
> > > > +  COMPARE_ORDERED (le, <=, TYPE) \
> > > > +  COMPARE_ORDERED (gt, >, TYPE) \
> > > > +  COMPARE_ORDERED (ge, >=, TYPE)
> > > > +
> > > > +#define TEST_TYPE(NAME, ELEM, COMPARE_ORDERED, SIZE)  \
> > > > +  typedef ELEM NAME##SIZE __attribute__((vector_size(SIZE))); \
> > > > +  COMPARE_TYPE (NAME##SIZE, COMPARE_ORDERED)
> > > > +
> > > > +/* 64-bits vectors, not vectorized.  */
> > > > +TEST_TYPE (vs8, __INT8_TYPE__, COMPARE_REG_AND_ZERO, 8)
> > > > +TEST_TYPE (vu8, __UINT8_TYPE__, COMPARE_REG, 8)
> > > > +TEST_TYPE (vs16, __INT16_TYPE__, COMPARE_REG_AND_ZERO, 8)
> > > > +TEST_TYPE (vu16, __UINT16_TYPE__, COMPARE_REG, 8)
> > > > +TEST_TYPE (vs32, __INT32_TYPE__, COMPARE_REG_AND_ZERO, 8)
> > > > +TEST_TYPE (vu32, __UINT32_TYPE__, COMPARE_REG, 8)
> > > > +
> > > > +/* 128-bits vectors.  */
> > > > +TEST_TYPE (vs8, __INT8_TYPE__, COMPARE_REG_AND_ZERO, 16)
> > > > +TEST_TYPE (vu8, __UINT8_TYPE__, COMPARE_REG, 16)
> > > > +TEST_TYPE (vs16, __INT16_TYPE__, COMPARE_REG_AND_ZERO, 16)
> > > > +TEST_TYPE (vu16, __UINT16_TYPE__, COMPARE_REG, 16)
> > > > +TEST_TYPE (vs32, __INT32_TYPE__, COMPARE_REG_AND_ZERO, 16)
> > > > +TEST_TYPE (vu32, __UINT32_TYPE__, COMPARE_REG, 16)
> > > > +
> > > > +/* { 8 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.i8  eq, q[0-9]+, q[0-9]+\n} 4 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.i8  ne, q[0-9]+, q[0-9]+\n} 4 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.s8  lt, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.s8  le, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.s8  gt, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.s8  ge, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.u8  hi, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.u8  cs, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > +
> > > > +/* { 16 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.i16  eq, q[0-9]+, q[0-9]+\n} 4 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.i16  ne, q[0-9]+, q[0-9]+\n} 4 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.s16  lt, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.s16  le, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.s16  gt, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.s16  ge, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.u16  hi, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.u16  cs, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > +
> > > > +/* { 32 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.i32  eq, q[0-9]+, q[0-9]+\n} 4 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.i32  ne, q[0-9]+, q[0-9]+\n} 4 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.s32  lt, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.s32  le, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.s32  gt, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.s32  ge, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.u32  hi, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.u32  cs, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c
> > > > new file mode 100644
> > > > index 0000000..8515195
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c
> > > > @@ -0,0 +1,38 @@
> > > > +/* { dg-do assemble } */
> > > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> > > > +/* { dg-add-options arm_v8_1m_mve_fp } */
> > > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
> > > > +
> > > > +/* float 32 tests.  */
> > > > +
> > > > +#ifndef ELEM_TYPE
> > > > +#define ELEM_TYPE float
> > > > +#endif
> > > > +#ifndef INT_ELEM_TYPE
> > > > +#define INT_ELEM_TYPE __INT32_TYPE__
> > > > +#endif
> > > > +
> > > > +#define COMPARE(NAME, OP)                    \
> > > > +  int_vec                                    \
> > > > +  cmp_##NAME##_reg (vec a, vec b)            \
> > > > +  {                                          \
> > > > +    return a OP b;                           \
> > > > +  }
> > > > +
> > > > +typedef INT_ELEM_TYPE int_vec __attribute__((vector_size(16)));
> > > > +typedef ELEM_TYPE vec __attribute__((vector_size(16)));
> > > > +
> > > > +COMPARE (eq, ==)
> > > > +COMPARE (ne, !=)
> > > > +COMPARE (lt, <)
> > > > +COMPARE (le, <=)
> > > > +COMPARE (gt, >)
> > > > +COMPARE (ge, >=)
> > > > +
> > > > +/* eq, ne, lt, le, gt, ge.  */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f32\teq, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tne, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tle, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tge, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c
> > > > new file mode 100644
> > > > index 0000000..7774972
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c
> > > > @@ -0,0 +1,69 @@
> > > > +/* { dg-do assemble } */
> > > > +/* { dg-require-effective-target arm_v8_1m_mve_ok } */
> > > > +/* { dg-add-options arm_v8_1m_mve } */
> > > > +/* { dg-additional-options "-O3" } */
> > > > +
> > > > +#define COMPARE_REG(NAME, OP, TYPE, SCALAR)    \
> > > > +  TYPE                                                 \
> > > > +  cmp_##NAME##_##TYPE##_scalar (TYPE a, SCALAR b) \
> > > > +  {                                            \
> > > > +    return a OP b;                             \
> > > > +  }
> > > > +
> > > > +#define COMPARE_TYPE(SCALAR, TYPE)                           \
> > > > +  COMPARE_REG (eq, ==, TYPE, SCALAR)                         \
> > > > +  COMPARE_REG (ne, !=, TYPE, SCALAR)                         \
> > > > +  COMPARE_REG (lt, <, TYPE, SCALAR)                          \
> > > > +  COMPARE_REG (le, <=, TYPE, SCALAR)                         \
> > > > +  COMPARE_REG (gt, >, TYPE, SCALAR)                          \
> > > > +  COMPARE_REG (ge, >=, TYPE, SCALAR)
> > > > +
> > > > +#define TEST_TYPE(NAME, ELEM, SIZE)                        \
> > > > +  typedef ELEM NAME##SIZE __attribute__((vector_size(SIZE))); \
> > > > +  COMPARE_TYPE (ELEM, NAME##SIZE)
> > > > +
> > > > +/* 64-bits vectors, not vectorized.  */
> > > > +TEST_TYPE (vs8, __INT8_TYPE__, 8)
> > > > +TEST_TYPE (vu8, __UINT8_TYPE__, 8)
> > > > +TEST_TYPE (vs16, __INT16_TYPE__, 8)
> > > > +TEST_TYPE (vu16, __UINT16_TYPE__, 8)
> > > > +TEST_TYPE (vs32, __INT32_TYPE__, 8)
> > > > +TEST_TYPE (vu32, __UINT32_TYPE__, 8)
> > > > +
> > > > +/* 128-bits vectors.  */
> > > > +TEST_TYPE (vs8, __INT8_TYPE__, 16)
> > > > +TEST_TYPE (vu8, __UINT8_TYPE__, 16)
> > > > +TEST_TYPE (vs16, __INT16_TYPE__, 16)
> > > > +TEST_TYPE (vu16, __UINT16_TYPE__, 16)
> > > > +TEST_TYPE (vs32, __INT32_TYPE__, 16)
> > > > +TEST_TYPE (vu32, __UINT32_TYPE__, 16)
> > > > +
> > > > +/* { 8 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.i8  eq, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.i8  ne, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.s8  lt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.s8  le, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.s8  gt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.s8  ge, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.u8  hi, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.u8  cs, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > +
> > > > +/* { 16 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.i16  eq, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.i16  ne, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.s16  lt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.s16  le, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.s16  gt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.s16  ge, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.u16  hi, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.u16  cs, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > +
> > > > +/* { 32 bits } x { eq, ne, lt, le, gt, ge, hi, cs }.  */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.i32  eq, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.i32  ne, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.s32  lt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.s32  le, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.s32  gt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.s32  ge, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.u32  hi, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.u32  cs, q[0-9]+, q[0-9]+\n} 2 } } */
> > > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c
> > > > new file mode 100644
> > > > index 0000000..4ed449e
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c
> > > > @@ -0,0 +1,30 @@
> > > > +/* { dg-do assemble } */
> > > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> > > > +/* { dg-add-options arm_v8_1m_mve_fp } */
> > > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
> > > > +
> > > > +#include <stdint.h>
> > > > +
> > > > +#define NB 4
> > > > +
> > > > +#define FUNC(OP, NAME)                                                       \
> > > > +  void test_ ## NAME ##_f (float * __restrict__ dest, float *a, float *b) { \
> > > > +    int i;                                                           \
> > > > +    for (i=0; i<NB; i++) {                                           \
> > > > +      dest[i] = a[i] OP b[i];                                                \
> > > > +    }                                                                        \
> > > > +  }
> > > > +
> > > > +FUNC(==, vcmpeq)
> > > > +FUNC(!=, vcmpne)
> > > > +FUNC(<, vcmplt)
> > > > +FUNC(<=, vcmple)
> > > > +FUNC(>, vcmpgt)
> > > > +FUNC(>=, vcmpge)
> > > > +
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f32\teq, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tne, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tle, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.f32\tge, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c
> > > > new file mode 100644
> > > > index 0000000..8da15e7
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c
> > > > @@ -0,0 +1,50 @@
> > > > +/* { dg-do assemble } */
> > > > +/* { dg-require-effective-target arm_v8_1m_mve_ok } */
> > > > +/* { dg-add-options arm_v8_1m_mve } */
> > > > +/* { dg-additional-options "-O3" } */
> > > > +
> > > > +#include <stdint.h>
> > > > +
> > > > +#define FUNC(SIGN, TYPE, BITS, NB, OP, NAME)                         \
> > > > +  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t * __restrict__ dest, TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \
> > > > +    int i;                                                           \
> > > > +    for (i=0; i<NB; i++) {                                           \
> > > > +      dest[i] = a[i] OP b[i];                                                \
> > > > +    }                                                                        \
> > > > +}
> > > > +
> > > > +#define ALL_FUNCS(OP, NAME) \
> > > > +  FUNC(s, int, 32, 2, OP, NAME)                      \
> > > > +  FUNC(u, uint, 32, 2, OP, NAME)             \
> > > > +  FUNC(s, int, 16, 4, OP, NAME)                      \
> > > > +  FUNC(u, uint, 16, 4, OP, NAME)             \
> > > > +  FUNC(s, int, 8, 8, OP, NAME)                       \
> > > > +  FUNC(u, uint, 8, 8, OP, NAME)                      \
> > > > +  FUNC(s, int, 32, 4, OP, NAME)                      \
> > > > +  FUNC(u, uint, 32, 4, OP, NAME)             \
> > > > +  FUNC(s, int, 16, 8, OP, NAME)                      \
> > > > +  FUNC(u, uint, 16, 8, OP, NAME)             \
> > > > +  FUNC(s, int, 8, 16, OP, NAME)                      \
> > > > +  FUNC(u, uint, 8, 16, OP, NAME)
> > > > +
> > > > +ALL_FUNCS(==, vcmpeq)
> > > > +ALL_FUNCS(!=, vcmpne)
> > > > +ALL_FUNCS(<, vcmplt)
> > > > +ALL_FUNCS(<=, vcmple)
> > > > +ALL_FUNCS(>, vcmpgt)
> > > > +ALL_FUNCS(>=, vcmpge)
> > > > +
> > > > +/* MVE has only 128-bit vectors, so we can vectorize only half of the
> > > > +   functions above.  */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.i[0-9]+  eq, q[0-9]+, q[0-9]+\n} 6 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.i[0-9]+  ne, q[0-9]+, q[0-9]+\n} 6 } } */
> > > > +
> > > > +/* lt, le, gt, ge apply to signed types, cs and hi to unsigned types.  */
> > > > +/* lt and le with unsigned types are replaced with the opposite condition, hence
> > > > +   the double number of matches for cs and hi.  */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.s[0-9]+  lt, q[0-9]+, q[0-9]+\n} 3 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.s[0-9]+  le, q[0-9]+, q[0-9]+\n} 3 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.s[0-9]+  gt, q[0-9]+, q[0-9]+\n} 3 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.s[0-9]+  ge, q[0-9]+, q[0-9]+\n} 3 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.u[0-9]+  cs, q[0-9]+, q[0-9]+\n} 6 } } */
> > > > +/* { dg-final { scan-assembler-times {\tvcmp.u[0-9]+  hi, q[0-9]+, q[0-9]+\n} 6 } } */

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 7/9] arm: Auto-vectorization for MVE: add __fp16 support to VCMP
  2021-05-05 14:09         ` Christophe Lyon
@ 2021-05-17  9:54           ` Christophe Lyon
  2021-05-17 10:49           ` Kyrylo Tkachov
  1 sibling, 0 replies; 35+ messages in thread
From: Christophe Lyon @ 2021-05-17  9:54 UTC (permalink / raw)
  To: gcc-patches

ping?

On Wed, 5 May 2021 at 16:09, Christophe Lyon <christophe.lyon@linaro.org> wrote:
>
> On Tue, 4 May 2021 at 19:03, Christophe Lyon <christophe.lyon@linaro.org> wrote:
> >
> > On Tue, 4 May 2021 at 15:43, Christophe Lyon <christophe.lyon@linaro.org> wrote:
> > >
> > > On Tue, 4 May 2021 at 13:48, Andre Vieira (lists)
> > > <andre.simoesdiasvieira@arm.com> wrote:
> > > >
> > > > It would be good to also add tests for NEON as you also enable auto-vec
> > > > for it. I checked and I do think the necessary 'neon_vc' patterns exist
> > > > for 'VH', so we should be OK there.
> > > >
> > >
> > > Actually since I posted the patch series, I've noticed a regression in
> > > armv8_2-fp16-arith-1.c, because we now vectorize all the float16x[48]_t loops,
> > > but we lose the fact that some FP comparisons can throw exceptions.
> > >
> > > I'll have to revisit this patch.
> >
> > Actually it looks like my patch does the right thing: we now vectorize
> > appropriately, given that the testcase is compiled with -ffast-math.
> > I need to update the testcase, though.
> >
>
> Here is a new version, with armv8_2-fp16-arith-1.c updated to take
> into account the new vectorization.
>
> Christophe
>
>
> > >
> > > Thanks,
> > >
> > > Christophe
> > >
> > > > On 30/04/2021 15:09, Christophe Lyon via Gcc-patches wrote:
> > > > > This patch adds __fp16 support to the previous patch that added vcmp
> > > > > support with MVE. For this we update existing expanders to use VDQWH
> > > > > iterator, and add a new expander vcond<VH_cvtto><mode>.  In the
> > > > > process we need to create suitable iterators, and update v_cmp_result
> > > > > as needed.
> > > > >
> > > > > 2021-04-26  Christophe Lyon  <christophe.lyon@linaro.org>
> > > > >
> > > > >       gcc/
> > > > >       * config/arm/iterators.md (V16): New iterator.
> > > > >       (VH_cvtto): New iterator.
> > > > >       (v_cmp_result): Added V4HF and V8HF support.
> > > > >       * config/arm/vec-common.md (vec_cmp<mode><v_cmp_result>): Use VDQWH.
> > > > >       (vcond<mode><mode>): Likewise.
> > > > >       (vcond_mask_<mode><v_cmp_result>): Likewise.
> > > > >       (vcond<VH_cvtto><mode>): New expander.
> > > > >
> > > > >       gcc/testsuite/
> > > > >       * gcc.target/arm/simd/mve-compare-3.c: New test with GCC vectors.
> > > > >       * gcc.target/arm/simd/mve-vcmp-f16.c: New test for
> > > > >       auto-vectorization.
> > > > > ---
> > > > >   gcc/config/arm/iterators.md                       |  6 ++++
> > > > >   gcc/config/arm/vec-common.md                      | 40 ++++++++++++++++-------
> > > > >   gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c | 38 +++++++++++++++++++++
> > > > >   gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c  | 30 +++++++++++++++++
> > > > >   4 files changed, 102 insertions(+), 12 deletions(-)
> > > > >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
> > > > >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c
> > > > >
> > > > > diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
> > > > > index a128465..3042baf 100644
> > > > > --- a/gcc/config/arm/iterators.md
> > > > > +++ b/gcc/config/arm/iterators.md
> > > > > @@ -231,6 +231,9 @@ (define_mode_iterator VU [V16QI V8HI V4SI])
> > > > >   ;; Vector modes for 16-bit floating-point support.
> > > > >   (define_mode_iterator VH [V8HF V4HF])
> > > > >
> > > > > +;; Modes with 16-bit elements only.
> > > > > +(define_mode_iterator V16 [V4HI V4HF V8HI V8HF])
> > > > > +
> > > > >   ;; 16-bit floating-point vector modes suitable for moving (includes BFmode).
> > > > >   (define_mode_iterator VHFBF [V8HF V4HF V4BF V8BF])
> > > > >
> > > > > @@ -571,6 +574,8 @@ (define_mode_attr V_cvtto [(V2SI "v2sf") (V2SF "v2si")
> > > > >   ;; (Opposite) mode to convert to/from for vector-half mode conversions.
> > > > >   (define_mode_attr VH_CVTTO [(V4HI "V4HF") (V4HF "V4HI")
> > > > >                           (V8HI "V8HF") (V8HF "V8HI")])
> > > > > +(define_mode_attr VH_cvtto [(V4HI "v4hf") (V4HF "v4hi")
> > > > > +                         (V8HI "v8hf") (V8HF "v8hi")])
> > > > >
> > > > >   ;; Define element mode for each vector mode.
> > > > >   (define_mode_attr V_elem [(V8QI "QI") (V16QI "QI")
> > > > > @@ -720,6 +725,7 @@ (define_mode_attr V_cmp_result [(V8QI "V8QI") (V16QI "V16QI")
> > > > >   (define_mode_attr v_cmp_result [(V8QI "v8qi") (V16QI "v16qi")
> > > > >                               (V4HI "v4hi") (V8HI  "v8hi")
> > > > >                               (V2SI "v2si") (V4SI  "v4si")
> > > > > +                             (V4HF "v4hi") (V8HF  "v8hi")
> > > > >                               (DI   "di")   (V2DI  "v2di")
> > > > >                               (V2SF "v2si") (V4SF  "v4si")])
> > > > >
> > > > > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> > > > > index 034b48b..3fd341c 100644
> > > > > --- a/gcc/config/arm/vec-common.md
> > > > > +++ b/gcc/config/arm/vec-common.md
> > > > > @@ -366,8 +366,8 @@ (define_expand "vlshr<mode>3"
> > > > >   (define_expand "vec_cmp<mode><v_cmp_result>"
> > > > >     [(set (match_operand:<V_cmp_result> 0 "s_register_operand")
> > > > >       (match_operator:<V_cmp_result> 1 "comparison_operator"
> > > > > -       [(match_operand:VDQW 2 "s_register_operand")
> > > > > -        (match_operand:VDQW 3 "reg_or_zero_operand")]))]
> > > > > +       [(match_operand:VDQWH 2 "s_register_operand")
> > > > > +        (match_operand:VDQWH 3 "reg_or_zero_operand")]))]
> > > > >     "ARM_HAVE_<MODE>_ARITH
> > > > >      && !TARGET_REALLY_IWMMXT
> > > > >      && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > > > > @@ -399,13 +399,13 @@ (define_expand "vec_cmpu<mode><mode>"
> > > > >   ;; element-wise.
> > > > >
> > > > >   (define_expand "vcond<mode><mode>"
> > > > > -  [(set (match_operand:VDQW 0 "s_register_operand")
> > > > > -     (if_then_else:VDQW
> > > > > +  [(set (match_operand:VDQWH 0 "s_register_operand")
> > > > > +     (if_then_else:VDQWH
> > > > >         (match_operator 3 "comparison_operator"
> > > > > -         [(match_operand:VDQW 4 "s_register_operand")
> > > > > -          (match_operand:VDQW 5 "reg_or_zero_operand")])
> > > > > -       (match_operand:VDQW 1 "s_register_operand")
> > > > > -       (match_operand:VDQW 2 "s_register_operand")))]
> > > > > +         [(match_operand:VDQWH 4 "s_register_operand")
> > > > > +          (match_operand:VDQWH 5 "reg_or_zero_operand")])
> > > > > +       (match_operand:VDQWH 1 "s_register_operand")
> > > > > +       (match_operand:VDQWH 2 "s_register_operand")))]
> > > > >     "ARM_HAVE_<MODE>_ARITH
> > > > >      && !TARGET_REALLY_IWMMXT
> > > > >      && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > > > > @@ -430,6 +430,22 @@ (define_expand "vcond<V_cvtto><mode>"
> > > > >     DONE;
> > > > >   })
> > > > >
> > > > > +(define_expand "vcond<VH_cvtto><mode>"
> > > > > +  [(set (match_operand:<VH_CVTTO> 0 "s_register_operand")
> > > > > +     (if_then_else:<VH_CVTTO>
> > > > > +       (match_operator 3 "comparison_operator"
> > > > > +         [(match_operand:V16 4 "s_register_operand")
> > > > > +          (match_operand:V16 5 "reg_or_zero_operand")])
> > > > > +       (match_operand:<VH_CVTTO> 1 "s_register_operand")
> > > > > +       (match_operand:<VH_CVTTO> 2 "s_register_operand")))]
> > > > > +  "ARM_HAVE_<MODE>_ARITH
> > > > > +   && !TARGET_REALLY_IWMMXT
> > > > > +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > > > > +{
> > > > > +  arm_expand_vcond (operands, <V_cmp_result>mode);
> > > > > +  DONE;
> > > > > +})
> > > > > +
> > > > >   (define_expand "vcondu<mode><v_cmp_result>"
> > > > >     [(set (match_operand:VDQW 0 "s_register_operand")
> > > > >       (if_then_else:VDQW
> > > > > @@ -446,11 +462,11 @@ (define_expand "vcondu<mode><v_cmp_result>"
> > > > >   })
> > > > >
> > > > >   (define_expand "vcond_mask_<mode><v_cmp_result>"
> > > > > -  [(set (match_operand:VDQW 0 "s_register_operand")
> > > > > -        (if_then_else:VDQW
> > > > > +  [(set (match_operand:VDQWH 0 "s_register_operand")
> > > > > +        (if_then_else:VDQWH
> > > > >             (match_operand:<V_cmp_result> 3 "s_register_operand")
> > > > > -          (match_operand:VDQW 1 "s_register_operand")
> > > > > -          (match_operand:VDQW 2 "s_register_operand")))]
> > > > > +          (match_operand:VDQWH 1 "s_register_operand")
> > > > > +          (match_operand:VDQWH 2 "s_register_operand")))]
> > > > >     "ARM_HAVE_<MODE>_ARITH
> > > > >      && !TARGET_REALLY_IWMMXT"
> > > > >   {
> > > > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
> > > > > new file mode 100644
> > > > > index 0000000..76f81e8
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
> > > > > @@ -0,0 +1,38 @@
> > > > > +/* { dg-do assemble } */
> > > > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> > > > > +/* { dg-add-options arm_v8_1m_mve_fp } */
> > > > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
> > > > > +
> > > > > +/* float 16 tests.  */
> > > > > +
> > > > > +#ifndef ELEM_TYPE
> > > > > +#define ELEM_TYPE __fp16
> > > > > +#endif
> > > > > +#ifndef INT_ELEM_TYPE
> > > > > +#define INT_ELEM_TYPE __INT16_TYPE__
> > > > > +#endif
> > > > > +
> > > > > +#define COMPARE(NAME, OP)                    \
> > > > > +  int_vec                                    \
> > > > > +  cmp_##NAME##_reg (vec a, vec b)            \
> > > > > +  {                                          \
> > > > > +    return a OP b;                           \
> > > > > +  }
> > > > > +
> > > > > +typedef INT_ELEM_TYPE int_vec __attribute__((vector_size(16)));
> > > > > +typedef ELEM_TYPE vec __attribute__((vector_size(16)));
> > > > > +
> > > > > +COMPARE (eq, ==)
> > > > > +COMPARE (ne, !=)
> > > > > +COMPARE (lt, <)
> > > > > +COMPARE (le, <=)
> > > > > +COMPARE (gt, >)
> > > > > +COMPARE (ge, >=)
> > > > > +
> > > > > > +/* eq, ne, lt, le, gt, ge.  */
> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c
> > > > > new file mode 100644
> > > > > index 0000000..dbae2d1
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c
> > > > > @@ -0,0 +1,30 @@
> > > > > +/* { dg-do assemble } */
> > > > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> > > > > +/* { dg-add-options arm_v8_1m_mve_fp } */
> > > > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
> > > > > +
> > > > > +#include <stdint.h>
> > > > > +
> > > > > +#define NB 8
> > > > > +
> > > > > +#define FUNC(OP, NAME)                                                       \
> > > > > +  void test_ ## NAME ##_f (__fp16 * __restrict__ dest, __fp16 *a, __fp16 *b) { \
> > > > > +    int i;                                                           \
> > > > > +    for (i=0; i<NB; i++) {                                           \
> > > > > +      dest[i] = a[i] OP b[i];                                                \
> > > > > +    }                                                                        \
> > > > > +  }
> > > > > +
> > > > > +FUNC(==, vcmpeq)
> > > > > +FUNC(!=, vcmpne)
> > > > > +FUNC(<, vcmplt)
> > > > > +FUNC(<=, vcmple)
> > > > > +FUNC(>, vcmpgt)
> > > > > +FUNC(>=, vcmpge)
> > > > > +
> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-9]+\n} 1 } } */

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 8/9] arm: Auto-vectorization for MVE: vld2/vst2
  2021-04-30 14:09 ` [PATCH 8/9] arm: Auto-vectorization for MVE: vld2/vst2 Christophe Lyon
@ 2021-05-17  9:55   ` Christophe Lyon
  2021-05-24  7:19     ` Christophe Lyon
  2021-05-24 12:15   ` Kyrylo Tkachov
  1 sibling, 1 reply; 35+ messages in thread
From: Christophe Lyon @ 2021-05-17  9:55 UTC (permalink / raw)
  To: gcc Patches

ping?

On Fri, 30 Apr 2021 at 16:09, Christophe Lyon
<christophe.lyon@linaro.org> wrote:
>
> This patch enables MVE vld2/vst2 instructions for auto-vectorization.
> We move the existing expanders from neon.md and enable them for MVE,
> calling the respective emitter.
>
> 2021-03-12  Christophe Lyon  <christophe.lyon@linaro.org>
>
>         gcc/
>         * config/arm/neon.md (vec_load_lanesoi<mode>)
>         (vec_store_lanesoi<mode>): Move ...
>         * config/arm/vec-common.md: here.
>
>         gcc/testsuite/
>         * gcc.target/arm/simd/mve-vld2.c: New test, derived from
>         slp-perm-2.c
> ---
>  gcc/config/arm/neon.md                       | 14 ----
>  gcc/config/arm/vec-common.md                 | 27 ++++++++
>  gcc/testsuite/gcc.target/arm/simd/mve-vld2.c | 96 ++++++++++++++++++++++++++++
>  3 files changed, 123 insertions(+), 14 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vld2.c
>
> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index 6660846..bc8775c 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -5063,13 +5063,6 @@ (define_insn "neon_vld2<mode>"
>                      (const_string "neon_load2_2reg<q>")))]
>  )
>
> -(define_expand "vec_load_lanesoi<mode>"
> -  [(set (match_operand:OI 0 "s_register_operand")
> -        (unspec:OI [(match_operand:OI 1 "neon_struct_operand")
> -                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> -                  UNSPEC_VLD2))]
> -  "TARGET_NEON")
> -
>  (define_insn "neon_vld2<mode>"
>    [(set (match_operand:OI 0 "s_register_operand" "=w")
>          (unspec:OI [(match_operand:OI 1 "neon_struct_operand" "Um")
> @@ -5197,13 +5190,6 @@ (define_insn "neon_vst2<mode>"
>                      (const_string "neon_store2_one_lane<q>")))]
>  )
>
> -(define_expand "vec_store_lanesoi<mode>"
> -  [(set (match_operand:OI 0 "neon_struct_operand")
> -       (unspec:OI [(match_operand:OI 1 "s_register_operand")
> -                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> -                   UNSPEC_VST2))]
> -  "TARGET_NEON")
> -
>  (define_insn "neon_vst2<mode>"
>    [(set (match_operand:OI 0 "neon_struct_operand" "=Um")
>         (unspec:OI [(match_operand:OI 1 "s_register_operand" "w")
> diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> index 3fd341c..7abefea 100644
> --- a/gcc/config/arm/vec-common.md
> +++ b/gcc/config/arm/vec-common.md
> @@ -482,6 +482,33 @@ (define_expand "vcond_mask_<mode><v_cmp_result>"
>      }
>    else
>      gcc_unreachable ();
> +  DONE;
> +})
>
> +(define_expand "vec_load_lanesoi<mode>"
> +  [(set (match_operand:OI 0 "s_register_operand")
> +        (unspec:OI [(match_operand:OI 1 "neon_struct_operand")
> +                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> +                  UNSPEC_VLD2))]
> +  "TARGET_NEON || TARGET_HAVE_MVE"
> +{
> +  if (TARGET_NEON)
> +    emit_insn (gen_neon_vld2<mode> (operands[0], operands[1]));
> +  else
> +    emit_insn (gen_mve_vld2q<mode> (operands[0], operands[1]));
> +  DONE;
> +})
> +
> +(define_expand "vec_store_lanesoi<mode>"
> +  [(set (match_operand:OI 0 "neon_struct_operand")
> +       (unspec:OI [(match_operand:OI 1 "s_register_operand")
> +                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> +                   UNSPEC_VST2))]
> +  "TARGET_NEON || TARGET_HAVE_MVE"
> +{
> +  if (TARGET_NEON)
> +    emit_insn (gen_neon_vst2<mode> (operands[0], operands[1]));
> +  else
> +    emit_insn (gen_mve_vst2q<mode> (operands[0], operands[1]));
>    DONE;
>  })
> diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vld2.c b/gcc/testsuite/gcc.target/arm/simd/mve-vld2.c
> new file mode 100644
> index 0000000..9c7c3f5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vld2.c
> @@ -0,0 +1,96 @@
> +/* { dg-do assemble } */
> +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> +/* { dg-add-options arm_v8_1m_mve_fp } */
> +/* { dg-additional-options "-O3" } */
> +
> +#include <stdint.h>
> +
> +#define M00 100
> +#define M10 216
> +#define M01 1322
> +#define M11 13
> +
> +#define N 128
> +
> +
> +/* Integer tests.  */
> +#define FUNC(SIGN, TYPE, BITS)                                         \
> +  void foo_##SIGN##BITS##x (TYPE##BITS##_t *__restrict__ pInput,       \
> +                           TYPE##BITS##_t *__restrict__ pOutput)       \
> +  {                                                                    \
> +    unsigned int i;                                                    \
> +    TYPE##BITS##_t  a, b;                                              \
> +                                                                       \
> +    for (i = 0; i < N / BITS; i++)                                     \
> +      {                                                                        \
> +       a = *pInput++;                                                  \
> +       b = *pInput++;                                                  \
> +                                                                       \
> +       *pOutput++ = M00 * a + M01 * b;                                 \
> +       *pOutput++ = M10 * a + M11 * b;                                 \
> +      }                                                                        \
> +  }
> +
> +FUNC(s, int, 8)
> +FUNC(u, uint, 8)
> +FUNC(s, int, 16)
> +FUNC(u, uint, 16)
> +FUNC(s, int, 32)
> +FUNC(u, uint, 32)
> +
> +/* float test, keep the macro because it's similar to the above, but does not
> +   need the ##BITS##_t.  */
> +#define FUNC_FLOAT(SIGN, TYPE, BITS)                                   \
> +  void foo_##SIGN##BITS##x (TYPE *__restrict__ pInput,                 \
> +                           TYPE *__restrict__ pOutput)                 \
> +  {                                                                    \
> +    unsigned int i;                                                    \
> +    TYPE a, b;                                                         \
> +                                                                       \
> +    for (i = 0; i < N / BITS; i++)                                     \
> +      {                                                                        \
> +       a = *pInput++;                                                  \
> +       b = *pInput++;                                                  \
> +                                                                       \
> +       *pOutput++ = M00 * a + M01 * b;                                 \
> +       *pOutput++ = M10 * a + M11 * b;                                 \
> +      }                                                                        \
> +  }
> +
> +FUNC_FLOAT(f, float, 32)
> +
> +/* __fp16 test, needs explicit casts to avoid conversions to floating-point and
> +   failure to vectorize.  */
> +__fp16 M00_fp16 = 100.0f16;
> +__fp16 M10_fp16 = 216.0f16;
> +__fp16 M01_fp16 = 1322.0f16;
> +__fp16 M11_fp16 = 13.0f16;
> +
> +#define FUNC_FLOAT_FP16(SIGN, TYPE, BITS)                              \
> +  void foo_##SIGN##BITS##x (TYPE *__restrict__ pInput,                 \
> +                           TYPE *__restrict__ pOutput)                 \
> +  {                                                                    \
> +    unsigned int i;                                                    \
> +    TYPE a, b;                                                         \
> +                                                                       \
> +    for (i = 0; i < N / BITS; i++)                                     \
> +      {                                                                        \
> +       a = *pInput++;                                                  \
> +       b = *pInput++;                                                  \
> +                                                                       \
> +       *pOutput++ = (__fp16)(M00_fp16 * a) + (__fp16)(M01_fp16 * b);   \
> +       *pOutput++ = (__fp16)(M10_fp16 * a) + (__fp16)(M11_fp16 * b);   \
> +      }                                                                        \
> +  }
> +
> +FUNC_FLOAT_FP16(f, __fp16, 16)
> +
> +/* vld2X.8 is used for signed and unsigned chars: 2 pairs.  */
> +/* vld2X.16 is used for signed and unsigned shorts and __fp16: 3 pairs.  */
> +/* vld2X.32 is used for signed and unsigned ints and float: 3 pairs.  */
> +/* { dg-final { scan-assembler-times {vld2[01].8\t.q[0-9]+, q[0-9]+., } 4 } } */
> +/* { dg-final { scan-assembler-times {vld2[01].16\t.q[0-9]+, q[0-9]+., } 6 } } */
> +/* { dg-final { scan-assembler-times {vld2[01].32\t.q[0-9]+, q[0-9]+., } 6 } } */
> +/* { dg-final { scan-assembler-times {vst2[01].8\t.q[0-9]+, q[0-9]+., } 4 } } */
> +/* { dg-final { scan-assembler-times {vst2[01].16\t.q[0-9]+, q[0-9]+., } 6 } } */
> +/* { dg-final { scan-assembler-times {vst2[01].32\t.q[0-9]+, q[0-9]+., } 6 } } */
> --
> 2.7.4
>

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 9/9] arm: Auto-vectorization for MVE: vld4/vst4
  2021-05-04 14:57     ` Christophe Lyon
@ 2021-05-17  9:55       ` Christophe Lyon
  2021-05-24  7:20         ` Christophe Lyon
  0 siblings, 1 reply; 35+ messages in thread
From: Christophe Lyon @ 2021-05-17  9:55 UTC (permalink / raw)
  To: gcc Patches

ping?

On Tue, 4 May 2021 at 16:57, Christophe Lyon <christophe.lyon@linaro.org> wrote:
>
> On Tue, 4 May 2021 at 14:03, Andre Vieira (lists)
> <andre.simoesdiasvieira@arm.com> wrote:
> >
> > Hi Christophe,
> >
> > The series LGTM but you'll need the approval of an arm port maintainer
> > before committing. I only did code-review, did not try to build/run tests.
> >
>
> Hi Andre,
>
> Thanks for the comments!
>
> > Kind regards,
> > Andre
> >
> > On 30/04/2021 15:09, Christophe Lyon via Gcc-patches wrote:
> > > This patch enables MVE vld4/vst4 instructions for auto-vectorization.
> > > We move the existing expanders from neon.md and enable them for MVE,
> > > calling the respective emitter.
> > >
> > > 2021-03-12  Christophe Lyon  <christophe.lyon@linaro.org>
> > >
> > >       gcc/
> > >       * config/arm/neon.md (vec_load_lanesxi<mode>)
> > >       (vec_store_lanesxi<mode>): Move ...
> > >       * config/arm/vec-common.md: here.
> > >
> > >       gcc/testsuite/
> > >       * gcc.target/arm/simd/mve-vld4.c: New test, derived from
> > >       slp-perm-3.c
> > > ---
> > >   gcc/config/arm/neon.md                       |  20 ----
> > >   gcc/config/arm/vec-common.md                 |  26 +++++
> > >   gcc/testsuite/gcc.target/arm/simd/mve-vld4.c | 140 +++++++++++++++++++++++++++
> > >   3 files changed, 166 insertions(+), 20 deletions(-)
> > >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vld4.c
> > >
> > > diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> > > index bc8775c..fb58baf 100644
> > > --- a/gcc/config/arm/neon.md
> > > +++ b/gcc/config/arm/neon.md
> > > @@ -5617,16 +5617,6 @@ (define_insn "neon_vld4<mode>"
> > >                       (const_string "neon_load4_4reg<q>")))]
> > >   )
> > >
> > > -(define_expand "vec_load_lanesxi<mode>"
> > > -  [(match_operand:XI 0 "s_register_operand")
> > > -   (match_operand:XI 1 "neon_struct_operand")
> > > -   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> > > -  "TARGET_NEON"
> > > -{
> > > -  emit_insn (gen_neon_vld4<mode> (operands[0], operands[1]));
> > > -  DONE;
> > > -})
> > > -
> > >   (define_expand "neon_vld4<mode>"
> > >     [(match_operand:XI 0 "s_register_operand")
> > >      (match_operand:XI 1 "neon_struct_operand")
> > > @@ -5818,16 +5808,6 @@ (define_insn "neon_vst4<mode>"
> > >                       (const_string "neon_store4_4reg<q>")))]
> > >   )
> > >
> > > -(define_expand "vec_store_lanesxi<mode>"
> > > -  [(match_operand:XI 0 "neon_struct_operand")
> > > -   (match_operand:XI 1 "s_register_operand")
> > > -   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> > > -  "TARGET_NEON"
> > > -{
> > > -  emit_insn (gen_neon_vst4<mode> (operands[0], operands[1]));
> > > -  DONE;
> > > -})
> > > -
> > >   (define_expand "neon_vst4<mode>"
> > >     [(match_operand:XI 0 "neon_struct_operand")
> > >      (match_operand:XI 1 "s_register_operand")
> > > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> > > index 7abefea..d46b78d 100644
> > > --- a/gcc/config/arm/vec-common.md
> > > +++ b/gcc/config/arm/vec-common.md
> > > @@ -512,3 +512,29 @@ (define_expand "vec_store_lanesoi<mode>"
> > >       emit_insn (gen_mve_vst2q<mode> (operands[0], operands[1]));
> > >     DONE;
> > >   })
> > > +
> > > +(define_expand "vec_load_lanesxi<mode>"
> > > +  [(match_operand:XI 0 "s_register_operand")
> > > +   (match_operand:XI 1 "neon_struct_operand")
> > > +   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> > > +  "TARGET_NEON || TARGET_HAVE_MVE"
> > > +{
> > > +  if (TARGET_NEON)
> > > +    emit_insn (gen_neon_vld4<mode> (operands[0], operands[1]));
> > > +  else
> > > +    emit_insn (gen_mve_vld4q<mode> (operands[0], operands[1]));
> > > +  DONE;
> > > +})
> > > +
> > > +(define_expand "vec_store_lanesxi<mode>"
> > > +  [(match_operand:XI 0 "neon_struct_operand")
> > > +   (match_operand:XI 1 "s_register_operand")
> > > +   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> > > +  "TARGET_NEON || TARGET_HAVE_MVE"
> > > +{
> > > +  if (TARGET_NEON)
> > > +    emit_insn (gen_neon_vst4<mode> (operands[0], operands[1]));
> > > +  else
> > > +    emit_insn (gen_mve_vst4q<mode> (operands[0], operands[1]));
> > > +  DONE;
> > > +})
> > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vld4.c b/gcc/testsuite/gcc.target/arm/simd/mve-vld4.c
> > > new file mode 100644
> > > index 0000000..ce3e755
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vld4.c
> > > @@ -0,0 +1,140 @@
> > > +/* { dg-do assemble } */
> > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> > > +/* { dg-add-options arm_v8_1m_mve_fp } */
> > > +/* { dg-additional-options "-O3" } */
> > > +
> > > +#include <stdint.h>
> > > +
> > > +#define M00 100
> > > +#define M10 216
> > > +#define M20 23
> > > +#define M30 237
> > > +#define M01 1322
> > > +#define M11 13
> > > +#define M21 27271
> > > +#define M31 2280
> > > +#define M02 74
> > > +#define M12 191
> > > +#define M22 500
> > > +#define M32 111
> > > +#define M03 134
> > > +#define M13 117
> > > +#define M23 11
> > > +#define M33 771
> > > +
> > > +#define N 128
> > > +
> > > +/* Integer tests.  */
> > > +#define FUNC(SIGN, TYPE, BITS)                                               \
> > > +  void foo_##SIGN##BITS##x (TYPE##BITS##_t *__restrict__ pInput,     \
> > > +                         TYPE##BITS##_t *__restrict__ pOutput)       \
> > > +  {                                                                  \
> > > +    unsigned int i;                                                  \
> > > +    TYPE##BITS##_t  a, b, c, d;                                              \
> > > +                                                                     \
> > > +    for (i = 0; i < N / BITS; i++)                                   \
> > > +      {                                                                      \
> > > +     a = *pInput++;                                                  \
> > > +     b = *pInput++;                                                  \
> > > +     c = *pInput++;                                                  \
> > > +     d = *pInput++;                                                  \
> > > +                                                                     \
> > > +     *pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d;             \
> > > +     *pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d;             \
> > > +     *pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d;             \
> > > +     *pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d;             \
> > > +      }                                                                      \
> > > +  }
> > > +
> > > +FUNC(s, int, 8)
> > > +FUNC(u, uint, 8)
> > > +FUNC(s, int, 16)
> > > +FUNC(u, uint, 16)
> > > +FUNC(s, int, 32)
> > > +FUNC(u, uint, 32)
> > > +
> > > +/* float test, keep the macro because it's similar to the above, but does not
> > > +   need the ##BITS##_t.  */
> > > +#define FUNC_FLOAT(SIGN, TYPE, BITS)                                         \
> > > +  void foo_##SIGN##BITS##x (TYPE *__restrict__ pInput,                       \
> > > +                         TYPE *__restrict__ pOutput)                 \
> > > +  {                                                                  \
> > > +    unsigned int i;                                                  \
> > > +    TYPE a, b, c, d;                                                 \
> > > +                                                                     \
> > > +    for (i = 0; i < N / BITS; i++)                                   \
> > > +      {                                                                      \
> > > +     a = *pInput++;                                                  \
> > > +     b = *pInput++;                                                  \
> > > +     c = *pInput++;                                                  \
> > > +     d = *pInput++;                                                  \
> > > +                                                                     \
> > > +     *pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d;             \
> > > +     *pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d;             \
> > > +     *pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d;             \
> > > +     *pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d;             \
> > > +      }                                                                      \
> > > +  }
> > > +
> > > +FUNC_FLOAT(f, float, 32)
> > > +
> > > +/* __fp16 test, needs explicit casts to avoid conversions to floating-point and
> > > +   failure to vectorize.  */
> > > +__fp16 M00_fp16 = 100.0f16;
> > > +__fp16 M10_fp16 = 216.0f16;
> > > +__fp16 M20_fp16 = 23.0f16;
> > > +__fp16 M30_fp16 = 237.0f16;
> > > +__fp16 M01_fp16 = 1322.0f16;
> > > +__fp16 M11_fp16 = 13.0f16;
> > > +__fp16 M21_fp16 = 27271.0f16;
> > > +__fp16 M31_fp16 = 2280.0f16;
> > > +__fp16 M02_fp16 = 74.0f16;
> > > +__fp16 M12_fp16 = 191.0f16;
> > > +__fp16 M22_fp16 = 500.0f16;
> > > +__fp16 M32_fp16 = 111.0f16;
> > > +__fp16 M03_fp16 = 134.0f16;
> > > +__fp16 M13_fp16 = 117.0f16;
> > > +__fp16 M23_fp16 = 11.0f16;
> > > +__fp16 M33_fp16 = 771.0f16;
> > > +
> > > +#define FUNC_FLOAT_FP16(SIGN, TYPE, BITS)                            \
> > > +  void foo_##SIGN##BITS##x (TYPE *__restrict__ pInput,                       \
> > > +                         TYPE *__restrict__ pOutput)                 \
> > > +  {                                                                  \
> > > +    unsigned int i;                                                  \
> > > +    TYPE a, b, c, d;                                                 \
> > > +                                                                     \
> > > +    for (i = 0; i < N / BITS; i++)                                   \
> > > +      {                                                                      \
> > > +     a = *pInput++;                                                  \
> > > +     b = *pInput++;                                                  \
> > > +     c = *pInput++;                                                  \
> > > +     d = *pInput++;                                                  \
> > > +                                                                     \
> > > +     TYPE ab, cd;                                                    \
> > > +     ab = (__fp16)(M00_fp16 * a) + (__fp16)(M01_fp16 * b);           \
> > > +     cd = (__fp16)(M02_fp16 * c) + (__fp16)(M03_fp16 * d);           \
> > > +     *pOutput++ = ab + cd;                                           \
> > > +     ab = (__fp16)(M10_fp16 * a) + (__fp16)(M11_fp16 * b);           \
> > > +     cd = (__fp16)(M12_fp16 * c) + (__fp16)(M13_fp16 * d);           \
> > > +     *pOutput++ = ab + cd;                                           \
> > > +     ab = (__fp16)(M20_fp16 * a) + (__fp16)(M21_fp16 * b);           \
> > > +     cd = (__fp16)(M22_fp16 * c) + (__fp16)(M23_fp16 * d);           \
> > > +     *pOutput++ = ab + cd;                                           \
> > > +     ab = (__fp16)(M30_fp16 * a) + (__fp16)(M31_fp16 * b);           \
> > > +     cd = (__fp16)(M32_fp16 * c) + (__fp16)(M33_fp16 * d);           \
> > > +     *pOutput++ = ab + cd;                                           \
> > > +      }                                                                      \
> > > +  }
> > > +
> > > +FUNC_FLOAT_FP16(f, __fp16, 16)
> > > +
> > > +/* vld4X.8 is used for signed and unsigned chars: 2 * 4.  */
> > > +/* vld4X.16 is used for signed and unsigned shorts and __fp16: 3 * 4.  */
> > > +/* vld4X.32 is used for signed and unsigned ints and float: 3 * 4.  */
> > > +/* { dg-final { scan-assembler-times {vld4[0123].8\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 8 } } */
> > > +/* { dg-final { scan-assembler-times {vld4[0123].16\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 12 } } */
> > > +/* { dg-final { scan-assembler-times {vld4[0123].32\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 12 } } */
> > > +/* { dg-final { scan-assembler-times {vst4[0123].8\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 8 } } */
> > > +/* { dg-final { scan-assembler-times {vst4[0123].16\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 12 } } */
> > > +/* { dg-final { scan-assembler-times {vst4[0123].32\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 12 } } */

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH 6/9] arm: Auto-vectorization for MVE: vcmp
  2021-05-05 14:08       ` Christophe Lyon
  2021-05-17  9:54         ` Christophe Lyon
@ 2021-05-17 10:35         ` Kyrylo Tkachov
  2021-05-17 12:31           ` Christophe Lyon
  1 sibling, 1 reply; 35+ messages in thread
From: Kyrylo Tkachov @ 2021-05-17 10:35 UTC (permalink / raw)
  To: Christophe Lyon, Andre Simoes Dias Vieira; +Cc: gcc-patches



> -----Original Message-----
> From: Gcc-patches <gcc-patches-bounces@gcc.gnu.org> On Behalf Of
> Christophe Lyon via Gcc-patches
> Sent: 05 May 2021 15:08
> To: Andre Simoes Dias Vieira <Andre.SimoesDiasVieira@arm.com>
> Cc: gcc Patches <gcc-patches@gcc.gnu.org>
> Subject: Re: [PATCH 6/9] arm: Auto-vectorization for MVE: vcmp
> 
> On Tue, 4 May 2021 at 15:41, Christophe Lyon <christophe.lyon@linaro.org>
> wrote:
> >
> > On Tue, 4 May 2021 at 13:29, Andre Vieira (lists)
> > <andre.simoesdiasvieira@arm.com> wrote:
> > >
> > > Hi Christophe,
> > >
> > > On 30/04/2021 15:09, Christophe Lyon via Gcc-patches wrote:
> > > > Since MVE has a different set of vector comparison operators from
> > > > Neon, we have to update the expansion to take into account the new
> > > > > ones, for instance 'NE', for which MVE does not need to use 'EQ'
> > > > > with the inverted condition.
> > > >
> > > > Conversely, Neon supports comparisons with #0, MVE does not.
> > > >
> > > > For:
> > > > typedef long int vs32 __attribute__((vector_size(16)));
> > > > vs32 cmp_eq_vs32_reg (vs32 a, vs32 b) { return a == b; }
> > > >
> > > > we now generate:
> > > > cmp_eq_vs32_reg:
> > > >       vldr.64 d4, .L123       @ 8     [c=8 l=4]  *mve_movv4si/8
> > > >       vldr.64 d5, .L123+8
> > > >       vldr.64 d6, .L123+16    @ 9     [c=8 l=4]  *mve_movv4si/8
> > > >       vldr.64 d7, .L123+24
> > > >       vcmp.i32  eq, q0, q1    @ 7     [c=16 l=4]  mve_vcmpeqq_v4si
> > > >       vpsel q0, q3, q2        @ 15    [c=8 l=4]  mve_vpselq_sv4si
> > > >       bx      lr      @ 26    [c=8 l=4]  *thumb2_return
> > > > .L124:
> > > >       .align  3
> > > > .L123:
> > > >       .word   0
> > > >       .word   0
> > > >       .word   0
> > > >       .word   0
> > > >       .word   1
> > > >       .word   1
> > > >       .word   1
> > > >       .word   1
> > > >
> > > > > For some reason emit_move_insn (zero, CONST0_RTX (cmp_mode)) produces
> > > > > a pair of vldr instead of vmov.i32 qX, #0
> > > I think ideally we would even want:
> > > vpte  eq, q0, q1
> > > vmovt.i32 q0, #0
> > > vmove.i32 q0, #1
> > >
> > > But we don't have a way to generate VPT blocks with multiple
> > > instructions yet unfortunately so I guess VPSEL will have to do for now.
> >
> > TBH,  I looked at what LLVM generates currently ;-)
> >
> 
> Here is an updated version, which adds
> && (!<Is_float_mode> || flag_unsafe_math_optimizations)
> to vcond_mask_
> 
> This condition was not present in the neon.md version I moved to
> vec-common.md, but since the VDQW iterator includes V2SF and V4SF, it
> should take floating-point flags into account.
> 

-      emit_insn (gen_neon_vc (code, cmp_mode, target, op0, op1));
+    case NE:
+      if (TARGET_HAVE_MVE) {
+	rtx vpr_p0;

GNU style wants the '{' on a new line. This appears a few other times in the patch.

+	if (vcond_mve)
+	  vpr_p0 = target;
+	else
+	  vpr_p0 = gen_reg_rtx (HImode);
+
+	switch (cmp_mode)
+	  {
+	  case E_V16QImode:
+	  case E_V8HImode:
+	  case E_V4SImode:
+	    emit_insn (gen_mve_vcmpq (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1)));
+	    break;
+	  case E_V8HFmode:
+	  case E_V4SFmode:
+	    if (TARGET_HAVE_MVE_FLOAT)
+	      emit_insn (gen_mve_vcmpq_f (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1)));
+	    else
+	      gcc_unreachable ();
+	    break;
+	  default:
+	    gcc_unreachable ();
+	  }

Hmm, I think we can just check GET_MODE_CLASS (cmp_mode) for MODE_VECTOR_INT or MODE_VECTOR_FLOAT here rather than have this switch statement.

+
+	/* If we are not expanding a vcond, build the result here.  */
+	if (!vcond_mve) {
+	  rtx zero = gen_reg_rtx (cmp_result_mode);
+	  rtx one = gen_reg_rtx (cmp_result_mode);
+	  emit_move_insn (zero, CONST0_RTX (cmp_result_mode));
+	  emit_move_insn (one, CONST1_RTX (cmp_result_mode));
+	  emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0));
+	}
+      }
+      else

...
   bool inverted = arm_expand_vector_compare (mask, GET_CODE (operands[3]),
-					     operands[4], operands[5], true);
+					     operands[4], operands[5], true, vcond_mve);
   if (inverted)
     std::swap (operands[1], operands[2]);
+  if (TARGET_NEON)
   emit_insn (gen_neon_vbsl (GET_MODE (operands[0]), operands[0],
 			    mask, operands[1], operands[2]));
+  else
+    {
+      machine_mode cmp_mode = GET_MODE (operands[4]);
+      rtx vpr_p0 = mask;
+      rtx zero = gen_reg_rtx (cmp_mode);
+      rtx one = gen_reg_rtx (cmp_mode);
+      emit_move_insn (zero, CONST0_RTX (cmp_mode));
+      emit_move_insn (one, CONST1_RTX (cmp_mode));
+      switch (cmp_mode)
+	{
+	case E_V16QImode:
+	case E_V8HImode:
+	case E_V4SImode:
+	  emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, operands[0], one, zero, vpr_p0));
+	  break;
+	case E_V8HFmode:
+	case E_V4SFmode:
+	  if (TARGET_HAVE_MVE_FLOAT)
+	    emit_insn (gen_mve_vpselq_f (cmp_mode, operands[0], one, zero, vpr_p0));
+	  break;
+	default:
+	  gcc_unreachable ();
+	}

Similarly here.
Ok with those changes.
Thanks,
Kyrill

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH 7/9] arm: Auto-vectorization for MVE: add __fp16 support to VCMP
  2021-05-05 14:09         ` Christophe Lyon
  2021-05-17  9:54           ` Christophe Lyon
@ 2021-05-17 10:49           ` Kyrylo Tkachov
  1 sibling, 0 replies; 35+ messages in thread
From: Kyrylo Tkachov @ 2021-05-17 10:49 UTC (permalink / raw)
  To: Christophe Lyon, Andre Simoes Dias Vieira; +Cc: gcc-patches



> -----Original Message-----
> From: Gcc-patches <gcc-patches-bounces@gcc.gnu.org> On Behalf Of
> Christophe Lyon via Gcc-patches
> Sent: 05 May 2021 15:09
> To: Andre Simoes Dias Vieira <Andre.SimoesDiasVieira@arm.com>
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH 7/9] arm: Auto-vectorization for MVE: add __fp16
> support to VCMP
> 
> On Tue, 4 May 2021 at 19:03, Christophe Lyon <christophe.lyon@linaro.org>
> wrote:
> >
> > On Tue, 4 May 2021 at 15:43, Christophe Lyon
> <christophe.lyon@linaro.org> wrote:
> > >
> > > On Tue, 4 May 2021 at 13:48, Andre Vieira (lists)
> > > <andre.simoesdiasvieira@arm.com> wrote:
> > > >
> > > > It would be good to also add tests for NEON as you also enable auto-
> vec
> > > > for it. I checked and I do think the necessary 'neon_vc' patterns exist
> > > > for 'VH', so we should be OK there.
> > > >
> > >
> > > Actually since I posted the patch series, I've noticed a regression in
> > > armv8_2-fp16-arith-1.c, because we now vectorize all the float16x[48]_t
> loops,
> > > but we lose the fact that some FP comparisons can throw exceptions.
> > >
> > > I'll have to revisit this patch.
> >
> > Actually it looks like my patch does the right thing: we now vectorize
> > appropriately, given that the testcase is compiled with -ffast-math.
> > I need to update the testcase, though.
> >
> 
> Here is a new version, with armv8_2-fp16-arith-1.c updated to take
> into account the new vectorization.

Ok.
Thanks,
Kyrill

> 
> Christophe
> 
> 
> > >
> > > Thanks,
> > >
> > > Christophe
> > >
> > > > On 30/04/2021 15:09, Christophe Lyon via Gcc-patches wrote:
> > > > > This patch adds __fp16 support to the previous patch that added
> vcmp
> > > > > support with MVE. For this we update existing expanders to use
> VDQWH
> > > > > iterator, and add a new expander vcond<VH_cvtto><mode>.  In the
> > > > > process we need to create suitable iterators, and update
> v_cmp_result
> > > > > as needed.
> > > > >
> > > > > 2021-04-26  Christophe Lyon  <christophe.lyon@linaro.org>
> > > > >
> > > > >       gcc/
> > > > >       * config/arm/iterators.md (V16): New iterator.
> > > > >       (VH_cvtto): New iterator.
> > > > >       (v_cmp_result): Added V4HF and V8HF support.
> > > > >       * config/arm/vec-common.md (vec_cmp<mode><v_cmp_result>):
> Use VDQWH.
> > > > >       (vcond<mode><mode>): Likewise.
> > > > >       (vcond_mask_<mode><v_cmp_result>): Likewise.
> > > > >       (vcond<VH_cvtto><mode>): New expander.
> > > > >
> > > > >       gcc/testsuite/
> > > > >       * gcc.target/arm/simd/mve-compare-3.c: New test with GCC
> vectors.
> > > > >       * gcc.target/arm/simd/mve-vcmp-f16.c: New test for
> > > > >       auto-vectorization.
> > > > > ---
> > > > >   gcc/config/arm/iterators.md                       |  6 ++++
> > > > >   gcc/config/arm/vec-common.md                      | 40
> ++++++++++++++++-------
> > > > >   gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c | 38
> +++++++++++++++++++++
> > > > >   gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c  | 30
> +++++++++++++++++
> > > > >   4 files changed, 102 insertions(+), 12 deletions(-)
> > > > >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-
> compare-3.c
> > > > >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-
> f16.c
> > > > >
> > > > > diff --git a/gcc/config/arm/iterators.md
> b/gcc/config/arm/iterators.md
> > > > > index a128465..3042baf 100644
> > > > > --- a/gcc/config/arm/iterators.md
> > > > > +++ b/gcc/config/arm/iterators.md
> > > > > @@ -231,6 +231,9 @@ (define_mode_iterator VU [V16QI V8HI V4SI])
> > > > >   ;; Vector modes for 16-bit floating-point support.
> > > > >   (define_mode_iterator VH [V8HF V4HF])
> > > > >
> > > > > +;; Modes with 16-bit elements only.
> > > > > +(define_mode_iterator V16 [V4HI V4HF V8HI V8HF])
> > > > > +
> > > > >   ;; 16-bit floating-point vector modes suitable for moving (includes
> BFmode).
> > > > >   (define_mode_iterator VHFBF [V8HF V4HF V4BF V8BF])
> > > > >
> > > > > @@ -571,6 +574,8 @@ (define_mode_attr V_cvtto [(V2SI "v2sf")
> (V2SF "v2si")
> > > > >   ;; (Opposite) mode to convert to/from for vector-half mode
> conversions.
> > > > >   (define_mode_attr VH_CVTTO [(V4HI "V4HF") (V4HF "V4HI")
> > > > >                           (V8HI "V8HF") (V8HF "V8HI")])
> > > > > +(define_mode_attr VH_cvtto [(V4HI "v4hf") (V4HF "v4hi")
> > > > > +                         (V8HI "v8hf") (V8HF "v8hi")])
> > > > >
> > > > >   ;; Define element mode for each vector mode.
> > > > >   (define_mode_attr V_elem [(V8QI "QI") (V16QI "QI")
> > > > > @@ -720,6 +725,7 @@ (define_mode_attr V_cmp_result [(V8QI
> "V8QI") (V16QI "V16QI")
> > > > >   (define_mode_attr v_cmp_result [(V8QI "v8qi") (V16QI "v16qi")
> > > > >                               (V4HI "v4hi") (V8HI  "v8hi")
> > > > >                               (V2SI "v2si") (V4SI  "v4si")
> > > > > +                             (V4HF "v4hi") (V8HF  "v8hi")
> > > > >                               (DI   "di")   (V2DI  "v2di")
> > > > >                               (V2SF "v2si") (V4SF  "v4si")])
> > > > >
> > > > > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-
> common.md
> > > > > index 034b48b..3fd341c 100644
> > > > > --- a/gcc/config/arm/vec-common.md
> > > > > +++ b/gcc/config/arm/vec-common.md
> > > > > @@ -366,8 +366,8 @@ (define_expand "vlshr<mode>3"
> > > > >   (define_expand "vec_cmp<mode><v_cmp_result>"
> > > > >     [(set (match_operand:<V_cmp_result> 0 "s_register_operand")
> > > > >       (match_operator:<V_cmp_result> 1 "comparison_operator"
> > > > > -       [(match_operand:VDQW 2 "s_register_operand")
> > > > > -        (match_operand:VDQW 3 "reg_or_zero_operand")]))]
> > > > > +       [(match_operand:VDQWH 2 "s_register_operand")
> > > > > +        (match_operand:VDQWH 3 "reg_or_zero_operand")]))]
> > > > >     "ARM_HAVE_<MODE>_ARITH
> > > > >      && !TARGET_REALLY_IWMMXT
> > > > >      && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > > > > @@ -399,13 +399,13 @@ (define_expand
> "vec_cmpu<mode><mode>"
> > > > >   ;; element-wise.
> > > > >
> > > > >   (define_expand "vcond<mode><mode>"
> > > > > -  [(set (match_operand:VDQW 0 "s_register_operand")
> > > > > -     (if_then_else:VDQW
> > > > > +  [(set (match_operand:VDQWH 0 "s_register_operand")
> > > > > +     (if_then_else:VDQWH
> > > > >         (match_operator 3 "comparison_operator"
> > > > > -         [(match_operand:VDQW 4 "s_register_operand")
> > > > > -          (match_operand:VDQW 5 "reg_or_zero_operand")])
> > > > > -       (match_operand:VDQW 1 "s_register_operand")
> > > > > -       (match_operand:VDQW 2 "s_register_operand")))]
> > > > > +         [(match_operand:VDQWH 4 "s_register_operand")
> > > > > +          (match_operand:VDQWH 5 "reg_or_zero_operand")])
> > > > > +       (match_operand:VDQWH 1 "s_register_operand")
> > > > > +       (match_operand:VDQWH 2 "s_register_operand")))]
> > > > >     "ARM_HAVE_<MODE>_ARITH
> > > > >      && !TARGET_REALLY_IWMMXT
> > > > >      && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > > > > @@ -430,6 +430,22 @@ (define_expand "vcond<V_cvtto><mode>"
> > > > >     DONE;
> > > > >   })
> > > > >
> > > > > +(define_expand "vcond<VH_cvtto><mode>"
> > > > > +  [(set (match_operand:<VH_CVTTO> 0 "s_register_operand")
> > > > > +     (if_then_else:<VH_CVTTO>
> > > > > +       (match_operator 3 "comparison_operator"
> > > > > +         [(match_operand:V16 4 "s_register_operand")
> > > > > +          (match_operand:V16 5 "reg_or_zero_operand")])
> > > > > +       (match_operand:<VH_CVTTO> 1 "s_register_operand")
> > > > > +       (match_operand:<VH_CVTTO> 2 "s_register_operand")))]
> > > > > +  "ARM_HAVE_<MODE>_ARITH
> > > > > +   && !TARGET_REALLY_IWMMXT
> > > > > +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
> > > > > +{
> > > > > +  arm_expand_vcond (operands, <V_cmp_result>mode);
> > > > > +  DONE;
> > > > > +})
> > > > > +
> > > > >   (define_expand "vcondu<mode><v_cmp_result>"
> > > > >     [(set (match_operand:VDQW 0 "s_register_operand")
> > > > >       (if_then_else:VDQW
> > > > > @@ -446,11 +462,11 @@ (define_expand
> "vcondu<mode><v_cmp_result>"
> > > > >   })
> > > > >
> > > > >   (define_expand "vcond_mask_<mode><v_cmp_result>"
> > > > > -  [(set (match_operand:VDQW 0 "s_register_operand")
> > > > > -        (if_then_else:VDQW
> > > > > +  [(set (match_operand:VDQWH 0 "s_register_operand")
> > > > > +        (if_then_else:VDQWH
> > > > >             (match_operand:<V_cmp_result> 3 "s_register_operand")
> > > > > -          (match_operand:VDQW 1 "s_register_operand")
> > > > > -          (match_operand:VDQW 2 "s_register_operand")))]
> > > > > +          (match_operand:VDQWH 1 "s_register_operand")
> > > > > +          (match_operand:VDQWH 2 "s_register_operand")))]
> > > > >     "ARM_HAVE_<MODE>_ARITH
> > > > >      && !TARGET_REALLY_IWMMXT"
> > > > >   {
> > > > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
> b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
> > > > > new file mode 100644
> > > > > index 0000000..76f81e8
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
> > > > > @@ -0,0 +1,38 @@
> > > > > +/* { dg-do assemble } */
> > > > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> > > > > +/* { dg-add-options arm_v8_1m_mve_fp } */
> > > > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
> > > > > +
> > > > > +/* float 16 tests.  */
> > > > > +
> > > > > +#ifndef ELEM_TYPE
> > > > > +#define ELEM_TYPE __fp16
> > > > > +#endif
> > > > > +#ifndef INT_ELEM_TYPE
> > > > > +#define INT_ELEM_TYPE __INT16_TYPE__
> > > > > +#endif
> > > > > +
> > > > > +#define COMPARE(NAME, OP)                    \
> > > > > +  int_vec                                    \
> > > > > +  cmp_##NAME##_reg (vec a, vec b)            \
> > > > > +  {                                          \
> > > > > +    return a OP b;                           \
> > > > > +  }
> > > > > +
> > > > > +typedef INT_ELEM_TYPE int_vec __attribute__((vector_size(16)));
> > > > > +typedef ELEM_TYPE vec __attribute__((vector_size(16)));
> > > > > +
> > > > > +COMPARE (eq, ==)
> > > > > +COMPARE (ne, !=)
> > > > > +COMPARE (lt, <)
> > > > > +COMPARE (le, <=)
> > > > > +COMPARE (gt, >)
> > > > > +COMPARE (ge, >=)
> > > > > +
> > > > > +/* eq, ne, lt, le, gt, ge.
> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-
> 9]+\n} 1 } } */
> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-
> 9]+\n} 1 } } */
> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-
> 9]+\n} 1 } } */
> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-
> 9]+\n} 1 } } */
> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-
> 9]+\n} 1 } } */
> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-
> 9]+\n} 1 } } */
> > > > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c
> b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c
> > > > > new file mode 100644
> > > > > index 0000000..dbae2d1
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c
> > > > > @@ -0,0 +1,30 @@
> > > > > +/* { dg-do assemble } */
> > > > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> > > > > +/* { dg-add-options arm_v8_1m_mve_fp } */
> > > > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
> > > > > +
> > > > > +#include <stdint.h>
> > > > > +
> > > > > +#define NB 8
> > > > > +
> > > > > +#define FUNC(OP, NAME)                                                       \
> > > > > +  void test_ ## NAME ##_f (__fp16 * __restrict__ dest, __fp16 *a,
> __fp16 *b) { \
> > > > > +    int i;                                                           \
> > > > > +    for (i=0; i<NB; i++) {                                           \
> > > > > +      dest[i] = a[i] OP b[i];                                                \
> > > > > +    }                                                                        \
> > > > > +  }
> > > > > +
> > > > > +FUNC(==, vcmpeq)
> > > > > +FUNC(!=, vcmpne)
> > > > > +FUNC(<, vcmplt)
> > > > > +FUNC(<=, vcmple)
> > > > > +FUNC(>, vcmpgt)
> > > > > +FUNC(>=, vcmpge)
> > > > > +
> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\teq, q[0-9]+, q[0-
> 9]+\n} 1 } } */
> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tne, q[0-9]+, q[0-
> 9]+\n} 1 } } */
> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tlt, q[0-9]+, q[0-
> 9]+\n} 1 } } */
> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tle, q[0-9]+, q[0-
> 9]+\n} 1 } } */
> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tgt, q[0-9]+, q[0-
> 9]+\n} 1 } } */
> > > > > +/* { dg-final { scan-assembler-times {\tvcmp.f16\tge, q[0-9]+, q[0-
> 9]+\n} 1 } } */

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 6/9] arm: Auto-vectorization for MVE: vcmp
  2021-05-17 10:35         ` Kyrylo Tkachov
@ 2021-05-17 12:31           ` Christophe Lyon
  0 siblings, 0 replies; 35+ messages in thread
From: Christophe Lyon @ 2021-05-17 12:31 UTC (permalink / raw)
  To: Kyrylo Tkachov; +Cc: Andre Simoes Dias Vieira, gcc-patches

On Mon, 17 May 2021 at 12:35, Kyrylo Tkachov <Kyrylo.Tkachov@arm.com> wrote:
>
>
>
> > -----Original Message-----
> > From: Gcc-patches <gcc-patches-bounces@gcc.gnu.org> On Behalf Of
> > Christophe Lyon via Gcc-patches
> > Sent: 05 May 2021 15:08
> > To: Andre Simoes Dias Vieira <Andre.SimoesDiasVieira@arm.com>
> > Cc: gcc Patches <gcc-patches@gcc.gnu.org>
> > Subject: Re: [PATCH 6/9] arm: Auto-vectorization for MVE: vcmp
> >
> > On Tue, 4 May 2021 at 15:41, Christophe Lyon <christophe.lyon@linaro.org>
> > wrote:
> > >
> > > On Tue, 4 May 2021 at 13:29, Andre Vieira (lists)
> > > <andre.simoesdiasvieira@arm.com> wrote:
> > > >
> > > > Hi Christophe,
> > > >
> > > > On 30/04/2021 15:09, Christophe Lyon via Gcc-patches wrote:
> > > > > Since MVE has a different set of vector comparison operators from
> > > > > Neon, we have to update the expansion to take into account the new
> > > > > > ones, for instance 'NE', for which MVE does not need to use 'EQ'
> > > > > > with the inverted condition.
> > > > >
> > > > > Conversely, Neon supports comparisons with #0, MVE does not.
> > > > >
> > > > > For:
> > > > > typedef long int vs32 __attribute__((vector_size(16)));
> > > > > vs32 cmp_eq_vs32_reg (vs32 a, vs32 b) { return a == b; }
> > > > >
> > > > > we now generate:
> > > > > cmp_eq_vs32_reg:
> > > > >       vldr.64 d4, .L123       @ 8     [c=8 l=4]  *mve_movv4si/8
> > > > >       vldr.64 d5, .L123+8
> > > > >       vldr.64 d6, .L123+16    @ 9     [c=8 l=4]  *mve_movv4si/8
> > > > >       vldr.64 d7, .L123+24
> > > > >       vcmp.i32  eq, q0, q1    @ 7     [c=16 l=4]  mve_vcmpeqq_v4si
> > > > >       vpsel q0, q3, q2        @ 15    [c=8 l=4]  mve_vpselq_sv4si
> > > > >       bx      lr      @ 26    [c=8 l=4]  *thumb2_return
> > > > > .L124:
> > > > >       .align  3
> > > > > .L123:
> > > > >       .word   0
> > > > >       .word   0
> > > > >       .word   0
> > > > >       .word   0
> > > > >       .word   1
> > > > >       .word   1
> > > > >       .word   1
> > > > >       .word   1
> > > > >
> > > > > For some reason emit_move_insn (zero, CONST0_RTX (cmp_mode))
> > produces
> > > > > > a pair of vldr instead of vmov.i32 qX, #0
> > > > I think ideally we would even want:
> > > > vpte  eq, q0, q1
> > > > vmovt.i32 q0, #0
> > > > vmove.i32 q0, #1
> > > >
> > > > But we don't have a way to generate VPT blocks with multiple
> > > > instructions yet unfortunately so I guess VPSEL will have to do for now.
> > >
> > > TBH,  I looked at what LLVM generates currently ;-)
> > >
> >
> > Here is an updated version, which adds
> > && (!<Is_float_mode> || flag_unsafe_math_optimizations)
> > to vcond_mask_
> >
> > > This condition was not present in the neon.md version I moved to
> > > vec-common.md, but since the VDQW iterator includes V2SF and V4SF,
> > > it should take floating-point flags into account.
> >
>
> -      emit_insn (gen_neon_vc (code, cmp_mode, target, op0, op1));
> +    case NE:
> +      if (TARGET_HAVE_MVE) {
> +       rtx vpr_p0;
>
> GNU style wants the '{' on a new line. This appears a few other times in the patch.
>
> +       if (vcond_mve)
> +         vpr_p0 = target;
> +       else
> +         vpr_p0 = gen_reg_rtx (HImode);
> +
> +       switch (cmp_mode)
> +         {
> +         case E_V16QImode:
> +         case E_V8HImode:
> +         case E_V4SImode:
> +           emit_insn (gen_mve_vcmpq (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1)));
> +           break;
> +         case E_V8HFmode:
> +         case E_V4SFmode:
> +           if (TARGET_HAVE_MVE_FLOAT)
> +             emit_insn (gen_mve_vcmpq_f (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1)));
> +           else
> +             gcc_unreachable ();
> +           break;
> +         default:
> +           gcc_unreachable ();
> +         }
>
> Hmm, I think we can just check GET_MODE_CLASS (cmp_mode) for MODE_VECTOR_INT or MODE_VECTOR_FLOAT here rather than have this switch statement.
>
> +
> +       /* If we are not expanding a vcond, build the result here.  */
> +       if (!vcond_mve) {
> +         rtx zero = gen_reg_rtx (cmp_result_mode);
> +         rtx one = gen_reg_rtx (cmp_result_mode);
> +         emit_move_insn (zero, CONST0_RTX (cmp_result_mode));
> +         emit_move_insn (one, CONST1_RTX (cmp_result_mode));
> +         emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0));
> +       }
> +      }
> +      else
>
> ...
>    bool inverted = arm_expand_vector_compare (mask, GET_CODE (operands[3]),
> -                                            operands[4], operands[5], true);
> +                                            operands[4], operands[5], true, vcond_mve);
>    if (inverted)
>      std::swap (operands[1], operands[2]);
> +  if (TARGET_NEON)
>    emit_insn (gen_neon_vbsl (GET_MODE (operands[0]), operands[0],
>                             mask, operands[1], operands[2]));
> +  else
> +    {
> +      machine_mode cmp_mode = GET_MODE (operands[4]);
> +      rtx vpr_p0 = mask;
> +      rtx zero = gen_reg_rtx (cmp_mode);
> +      rtx one = gen_reg_rtx (cmp_mode);
> +      emit_move_insn (zero, CONST0_RTX (cmp_mode));
> +      emit_move_insn (one, CONST1_RTX (cmp_mode));
> +      switch (cmp_mode)
> +       {
> +       case E_V16QImode:
> +       case E_V8HImode:
> +       case E_V4SImode:
> +         emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, operands[0], one, zero, vpr_p0));
> +         break;
> +       case E_V8HFmode:
> +       case E_V4SFmode:
> +         if (TARGET_HAVE_MVE_FLOAT)
> +           emit_insn (gen_mve_vpselq_f (cmp_mode, operands[0], one, zero, vpr_p0));
> +         break;
> +       default:
> +         gcc_unreachable ();
> +       }
>
> Similarly here.
> Ok with those changes.

Thanks, committed after testing.

Christophe

> Thanks,
> Kyrill

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 8/9] arm: Auto-vectorization for MVE: vld2/vst2
  2021-05-17  9:55   ` Christophe Lyon
@ 2021-05-24  7:19     ` Christophe Lyon
  0 siblings, 0 replies; 35+ messages in thread
From: Christophe Lyon @ 2021-05-24  7:19 UTC (permalink / raw)
  To: gcc Patches

ping?

On Mon, 17 May 2021 at 11:55, Christophe Lyon
<christophe.lyon@linaro.org> wrote:
>
> ping?
>
> On Fri, 30 Apr 2021 at 16:09, Christophe Lyon
> <christophe.lyon@linaro.org> wrote:
> >
> > This patch enables MVE vld2/vst2 instructions for auto-vectorization.
> > We move the existing expanders from neon.md and enable them for MVE,
> > calling the respective emitter.
> >
> > 2021-03-12  Christophe Lyon  <christophe.lyon@linaro.org>
> >
> >         gcc/
> >         * config/arm/neon.md (vec_load_lanesoi<mode>)
> >         (vec_store_lanesoi<mode>): Move ...
> >         * config/arm/vec-common.md: here.
> >
> >         gcc/testsuite/
> >         * gcc.target/arm/simd/mve-vld2.c: New test, derived from
> >         slp-perm-2.c
> > ---
> >  gcc/config/arm/neon.md                       | 14 ----
> >  gcc/config/arm/vec-common.md                 | 27 ++++++++
> >  gcc/testsuite/gcc.target/arm/simd/mve-vld2.c | 96 ++++++++++++++++++++++++++++
> >  3 files changed, 123 insertions(+), 14 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vld2.c
> >
> > diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> > index 6660846..bc8775c 100644
> > --- a/gcc/config/arm/neon.md
> > +++ b/gcc/config/arm/neon.md
> > @@ -5063,13 +5063,6 @@ (define_insn "neon_vld2<mode>"
> >                      (const_string "neon_load2_2reg<q>")))]
> >  )
> >
> > -(define_expand "vec_load_lanesoi<mode>"
> > -  [(set (match_operand:OI 0 "s_register_operand")
> > -        (unspec:OI [(match_operand:OI 1 "neon_struct_operand")
> > -                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> > -                  UNSPEC_VLD2))]
> > -  "TARGET_NEON")
> > -
> >  (define_insn "neon_vld2<mode>"
> >    [(set (match_operand:OI 0 "s_register_operand" "=w")
> >          (unspec:OI [(match_operand:OI 1 "neon_struct_operand" "Um")
> > @@ -5197,13 +5190,6 @@ (define_insn "neon_vst2<mode>"
> >                      (const_string "neon_store2_one_lane<q>")))]
> >  )
> >
> > -(define_expand "vec_store_lanesoi<mode>"
> > -  [(set (match_operand:OI 0 "neon_struct_operand")
> > -       (unspec:OI [(match_operand:OI 1 "s_register_operand")
> > -                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> > -                   UNSPEC_VST2))]
> > -  "TARGET_NEON")
> > -
> >  (define_insn "neon_vst2<mode>"
> >    [(set (match_operand:OI 0 "neon_struct_operand" "=Um")
> >         (unspec:OI [(match_operand:OI 1 "s_register_operand" "w")
> > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> > index 3fd341c..7abefea 100644
> > --- a/gcc/config/arm/vec-common.md
> > +++ b/gcc/config/arm/vec-common.md
> > @@ -482,6 +482,33 @@ (define_expand "vcond_mask_<mode><v_cmp_result>"
> >      }
> >    else
> >      gcc_unreachable ();
> > +  DONE;
> > +})
> >
> > +(define_expand "vec_load_lanesoi<mode>"
> > +  [(set (match_operand:OI 0 "s_register_operand")
> > +        (unspec:OI [(match_operand:OI 1 "neon_struct_operand")
> > +                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> > +                  UNSPEC_VLD2))]
> > +  "TARGET_NEON || TARGET_HAVE_MVE"
> > +{
> > +  if (TARGET_NEON)
> > +    emit_insn (gen_neon_vld2<mode> (operands[0], operands[1]));
> > +  else
> > +    emit_insn (gen_mve_vld2q<mode> (operands[0], operands[1]));
> > +  DONE;
> > +})
> > +
> > +(define_expand "vec_store_lanesoi<mode>"
> > +  [(set (match_operand:OI 0 "neon_struct_operand")
> > +       (unspec:OI [(match_operand:OI 1 "s_register_operand")
> > +                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> > +                   UNSPEC_VST2))]
> > +  "TARGET_NEON || TARGET_HAVE_MVE"
> > +{
> > +  if (TARGET_NEON)
> > +    emit_insn (gen_neon_vst2<mode> (operands[0], operands[1]));
> > +  else
> > +    emit_insn (gen_mve_vst2q<mode> (operands[0], operands[1]));
> >    DONE;
> >  })
> > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vld2.c b/gcc/testsuite/gcc.target/arm/simd/mve-vld2.c
> > new file mode 100644
> > index 0000000..9c7c3f5
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vld2.c
> > @@ -0,0 +1,96 @@
> > +/* { dg-do assemble } */
> > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> > +/* { dg-add-options arm_v8_1m_mve_fp } */
> > +/* { dg-additional-options "-O3" } */
> > +
> > +#include <stdint.h>
> > +
> > +#define M00 100
> > +#define M10 216
> > +#define M01 1322
> > +#define M11 13
> > +
> > +#define N 128
> > +
> > +
> > +/* Integer tests.  */
> > +#define FUNC(SIGN, TYPE, BITS)                                         \
> > +  void foo_##SIGN##BITS##x (TYPE##BITS##_t *__restrict__ pInput,       \
> > +                           TYPE##BITS##_t *__restrict__ pOutput)       \
> > +  {                                                                    \
> > +    unsigned int i;                                                    \
> > +    TYPE##BITS##_t  a, b;                                              \
> > +                                                                       \
> > +    for (i = 0; i < N / BITS; i++)                                     \
> > +      {                                                                        \
> > +       a = *pInput++;                                                  \
> > +       b = *pInput++;                                                  \
> > +                                                                       \
> > +       *pOutput++ = M00 * a + M01 * b;                                 \
> > +       *pOutput++ = M10 * a + M11 * b;                                 \
> > +      }                                                                        \
> > +  }
> > +
> > +FUNC(s, int, 8)
> > +FUNC(u, uint, 8)
> > +FUNC(s, int, 16)
> > +FUNC(u, uint, 16)
> > +FUNC(s, int, 32)
> > +FUNC(u, uint, 32)
> > +
> > +/* float test, keep the macro because it's similar to the above, but does not
> > +   need the ##BITS##_t.  */
> > +#define FUNC_FLOAT(SIGN, TYPE, BITS)                                   \
> > +  void foo_##SIGN##BITS##x (TYPE *__restrict__ pInput,                 \
> > +                           TYPE *__restrict__ pOutput)                 \
> > +  {                                                                    \
> > +    unsigned int i;                                                    \
> > +    TYPE a, b;                                                         \
> > +                                                                       \
> > +    for (i = 0; i < N / BITS; i++)                                     \
> > +      {                                                                        \
> > +       a = *pInput++;                                                  \
> > +       b = *pInput++;                                                  \
> > +                                                                       \
> > +       *pOutput++ = M00 * a + M01 * b;                                 \
> > +       *pOutput++ = M10 * a + M11 * b;                                 \
> > +      }                                                                        \
> > +  }
> > +
> > +FUNC_FLOAT(f, float, 32)
> > +
> > +/* __fp16 test, needs explicit casts to avoid conversions to floating-point and
> > +   failure to vectorize.  */
> > +__fp16 M00_fp16 = 100.0f16;
> > +__fp16 M10_fp16 = 216.0f16;
> > +__fp16 M01_fp16 = 1322.0f16;
> > +__fp16 M11_fp16 = 13.0f16;
> > +
> > +#define FUNC_FLOAT_FP16(SIGN, TYPE, BITS)                              \
> > +  void foo_##SIGN##BITS##x (TYPE *__restrict__ pInput,                 \
> > +                           TYPE *__restrict__ pOutput)                 \
> > +  {                                                                    \
> > +    unsigned int i;                                                    \
> > +    TYPE a, b;                                                         \
> > +                                                                       \
> > +    for (i = 0; i < N / BITS; i++)                                     \
> > +      {                                                                        \
> > +       a = *pInput++;                                                  \
> > +       b = *pInput++;                                                  \
> > +                                                                       \
> > +       *pOutput++ = (__fp16)(M00_fp16 * a) + (__fp16)(M01_fp16 * b);   \
> > +       *pOutput++ = (__fp16)(M10_fp16 * a) + (__fp16)(M11_fp16 * b);   \
> > +      }                                                                        \
> > +  }
> > +
> > +FUNC_FLOAT_FP16(f, __fp16, 16)
> > +
> > +/* vld2X.8 is used for signed and unsigned chars: 2 pairs.  */
> > +/* vld2X.16 is used for signed and unsigned shorts and __fp16: 3 pairs.  */
> > +/* vld2X.32 is used for signed and unsigned ints and float: 3 pairs.  */
> > +/* { dg-final { scan-assembler-times {vld2[01].8\t.q[0-9]+, q[0-9]+., } 4 } } */
> > +/* { dg-final { scan-assembler-times {vld2[01].16\t.q[0-9]+, q[0-9]+., } 6 } } */
> > +/* { dg-final { scan-assembler-times {vld2[01].32\t.q[0-9]+, q[0-9]+., } 6 } } */
> > +/* { dg-final { scan-assembler-times {vst2[01].8\t.q[0-9]+, q[0-9]+., } 4 } } */
> > +/* { dg-final { scan-assembler-times {vst2[01].16\t.q[0-9]+, q[0-9]+., } 6 } } */
> > +/* { dg-final { scan-assembler-times {vst2[01].32\t.q[0-9]+, q[0-9]+., } 6 } } */
> > --
> > 2.7.4
> >

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 9/9] arm: Auto-vectorization for MVE: vld4/vst4
  2021-05-17  9:55       ` Christophe Lyon
@ 2021-05-24  7:20         ` Christophe Lyon
  0 siblings, 0 replies; 35+ messages in thread
From: Christophe Lyon @ 2021-05-24  7:20 UTC (permalink / raw)
  To: gcc Patches

ping?

On Mon, 17 May 2021 at 11:55, Christophe Lyon
<christophe.lyon@linaro.org> wrote:
>
> ping?
>
> On Tue, 4 May 2021 at 16:57, Christophe Lyon <christophe.lyon@linaro.org> wrote:
> >
> > On Tue, 4 May 2021 at 14:03, Andre Vieira (lists)
> > <andre.simoesdiasvieira@arm.com> wrote:
> > >
> > > Hi Christophe,
> > >
> > > The series LGTM but you'll need the approval of an arm port maintainer
> > > before committing. I only did code-review, did not try to build/run tests.
> > >
> >
> > Hi Andre,
> >
> > Thanks for the comments!
> >
> > > Kind regards,
> > > Andre
> > >
> > > On 30/04/2021 15:09, Christophe Lyon via Gcc-patches wrote:
> > > > This patch enables MVE vld4/vst4 instructions for auto-vectorization.
> > > > We move the existing expanders from neon.md and enable them for MVE,
> > > > calling the respective emitter.
> > > >
> > > > 2021-03-12  Christophe Lyon  <christophe.lyon@linaro.org>
> > > >
> > > >       gcc/
> > > >       * config/arm/neon.md (vec_load_lanesxi<mode>)
> > > >       (vec_store_lanesxi<mode>): Move ...
> > > >       * config/arm/vec-common.md: here.
> > > >
> > > >       gcc/testsuite/
> > > >       * gcc.target/arm/simd/mve-vld4.c: New test, derived from
> > > >       slp-perm-3.c
> > > > ---
> > > >   gcc/config/arm/neon.md                       |  20 ----
> > > >   gcc/config/arm/vec-common.md                 |  26 +++++
> > > >   gcc/testsuite/gcc.target/arm/simd/mve-vld4.c | 140 +++++++++++++++++++++++++++
> > > >   3 files changed, 166 insertions(+), 20 deletions(-)
> > > >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vld4.c
> > > >
> > > > diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> > > > index bc8775c..fb58baf 100644
> > > > --- a/gcc/config/arm/neon.md
> > > > +++ b/gcc/config/arm/neon.md
> > > > @@ -5617,16 +5617,6 @@ (define_insn "neon_vld4<mode>"
> > > >                       (const_string "neon_load4_4reg<q>")))]
> > > >   )
> > > >
> > > > -(define_expand "vec_load_lanesxi<mode>"
> > > > -  [(match_operand:XI 0 "s_register_operand")
> > > > -   (match_operand:XI 1 "neon_struct_operand")
> > > > -   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> > > > -  "TARGET_NEON"
> > > > -{
> > > > -  emit_insn (gen_neon_vld4<mode> (operands[0], operands[1]));
> > > > -  DONE;
> > > > -})
> > > > -
> > > >   (define_expand "neon_vld4<mode>"
> > > >     [(match_operand:XI 0 "s_register_operand")
> > > >      (match_operand:XI 1 "neon_struct_operand")
> > > > @@ -5818,16 +5808,6 @@ (define_insn "neon_vst4<mode>"
> > > >                       (const_string "neon_store4_4reg<q>")))]
> > > >   )
> > > >
> > > > -(define_expand "vec_store_lanesxi<mode>"
> > > > -  [(match_operand:XI 0 "neon_struct_operand")
> > > > -   (match_operand:XI 1 "s_register_operand")
> > > > -   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> > > > -  "TARGET_NEON"
> > > > -{
> > > > -  emit_insn (gen_neon_vst4<mode> (operands[0], operands[1]));
> > > > -  DONE;
> > > > -})
> > > > -
> > > >   (define_expand "neon_vst4<mode>"
> > > >     [(match_operand:XI 0 "neon_struct_operand")
> > > >      (match_operand:XI 1 "s_register_operand")
> > > > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> > > > index 7abefea..d46b78d 100644
> > > > --- a/gcc/config/arm/vec-common.md
> > > > +++ b/gcc/config/arm/vec-common.md
> > > > @@ -512,3 +512,29 @@ (define_expand "vec_store_lanesoi<mode>"
> > > >       emit_insn (gen_mve_vst2q<mode> (operands[0], operands[1]));
> > > >     DONE;
> > > >   })
> > > > +
> > > > +(define_expand "vec_load_lanesxi<mode>"
> > > > +  [(match_operand:XI 0 "s_register_operand")
> > > > +   (match_operand:XI 1 "neon_struct_operand")
> > > > +   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> > > > +  "TARGET_NEON || TARGET_HAVE_MVE"
> > > > +{
> > > > +  if (TARGET_NEON)
> > > > +    emit_insn (gen_neon_vld4<mode> (operands[0], operands[1]));
> > > > +  else
> > > > +    emit_insn (gen_mve_vld4q<mode> (operands[0], operands[1]));
> > > > +  DONE;
> > > > +})
> > > > +
> > > > +(define_expand "vec_store_lanesxi<mode>"
> > > > +  [(match_operand:XI 0 "neon_struct_operand")
> > > > +   (match_operand:XI 1 "s_register_operand")
> > > > +   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> > > > +  "TARGET_NEON || TARGET_HAVE_MVE"
> > > > +{
> > > > +  if (TARGET_NEON)
> > > > +    emit_insn (gen_neon_vst4<mode> (operands[0], operands[1]));
> > > > +  else
> > > > +    emit_insn (gen_mve_vst4q<mode> (operands[0], operands[1]));
> > > > +  DONE;
> > > > +})
> > > > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vld4.c b/gcc/testsuite/gcc.target/arm/simd/mve-vld4.c
> > > > new file mode 100644
> > > > index 0000000..ce3e755
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vld4.c
> > > > @@ -0,0 +1,140 @@
> > > > +/* { dg-do assemble } */
> > > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> > > > +/* { dg-add-options arm_v8_1m_mve_fp } */
> > > > +/* { dg-additional-options "-O3" } */
> > > > +
> > > > +#include <stdint.h>
> > > > +
> > > > +#define M00 100
> > > > +#define M10 216
> > > > +#define M20 23
> > > > +#define M30 237
> > > > +#define M01 1322
> > > > +#define M11 13
> > > > +#define M21 27271
> > > > +#define M31 2280
> > > > +#define M02 74
> > > > +#define M12 191
> > > > +#define M22 500
> > > > +#define M32 111
> > > > +#define M03 134
> > > > +#define M13 117
> > > > +#define M23 11
> > > > +#define M33 771
> > > > +
> > > > +#define N 128
> > > > +
> > > > +/* Integer tests.  */
> > > > +#define FUNC(SIGN, TYPE, BITS)                                               \
> > > > +  void foo_##SIGN##BITS##x (TYPE##BITS##_t *__restrict__ pInput,     \
> > > > +                         TYPE##BITS##_t *__restrict__ pOutput)       \
> > > > +  {                                                                  \
> > > > +    unsigned int i;                                                  \
> > > > +    TYPE##BITS##_t  a, b, c, d;                                              \
> > > > +                                                                     \
> > > > +    for (i = 0; i < N / BITS; i++)                                   \
> > > > +      {                                                                      \
> > > > +     a = *pInput++;                                                  \
> > > > +     b = *pInput++;                                                  \
> > > > +     c = *pInput++;                                                  \
> > > > +     d = *pInput++;                                                  \
> > > > +                                                                     \
> > > > +     *pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d;             \
> > > > +     *pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d;             \
> > > > +     *pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d;             \
> > > > +     *pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d;             \
> > > > +      }                                                                      \
> > > > +  }
> > > > +
> > > > +FUNC(s, int, 8)
> > > > +FUNC(u, uint, 8)
> > > > +FUNC(s, int, 16)
> > > > +FUNC(u, uint, 16)
> > > > +FUNC(s, int, 32)
> > > > +FUNC(u, uint, 32)
> > > > +
> > > > +/* float test, keep the macro because it's similar to the above, but does not
> > > > +   need the ##BITS##_t.  */
> > > > +#define FUNC_FLOAT(SIGN, TYPE, BITS)                                         \
> > > > +  void foo_##SIGN##BITS##x (TYPE *__restrict__ pInput,                       \
> > > > +                         TYPE *__restrict__ pOutput)                 \
> > > > +  {                                                                  \
> > > > +    unsigned int i;                                                  \
> > > > +    TYPE a, b, c, d;                                                 \
> > > > +                                                                     \
> > > > +    for (i = 0; i < N / BITS; i++)                                   \
> > > > +      {                                                                      \
> > > > +     a = *pInput++;                                                  \
> > > > +     b = *pInput++;                                                  \
> > > > +     c = *pInput++;                                                  \
> > > > +     d = *pInput++;                                                  \
> > > > +                                                                     \
> > > > +     *pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d;             \
> > > > +     *pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d;             \
> > > > +     *pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d;             \
> > > > +     *pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d;             \
> > > > +      }                                                                      \
> > > > +  }
> > > > +
> > > > +FUNC_FLOAT(f, float, 32)
> > > > +
> > > > +/* __fp16 test, needs explicit casts to avoid conversions to floating-point and
> > > > +   failure to vectorize.  */
> > > > +__fp16 M00_fp16 = 100.0f16;
> > > > +__fp16 M10_fp16 = 216.0f16;
> > > > +__fp16 M20_fp16 = 23.0f16;
> > > > +__fp16 M30_fp16 = 237.0f16;
> > > > +__fp16 M01_fp16 = 1322.0f16;
> > > > +__fp16 M11_fp16 = 13.0f16;
> > > > +__fp16 M21_fp16 = 27271.0f16;
> > > > +__fp16 M31_fp16 = 2280.0f16;
> > > > +__fp16 M02_fp16 = 74.0f16;
> > > > +__fp16 M12_fp16 = 191.0f16;
> > > > +__fp16 M22_fp16 = 500.0f16;
> > > > +__fp16 M32_fp16 = 111.0f16;
> > > > +__fp16 M03_fp16 = 134.0f16;
> > > > +__fp16 M13_fp16 = 117.0f16;
> > > > +__fp16 M23_fp16 = 11.0f16;
> > > > +__fp16 M33_fp16 = 771.0f16;
> > > > +
> > > > +#define FUNC_FLOAT_FP16(SIGN, TYPE, BITS)                            \
> > > > +  void foo_##SIGN##BITS##x (TYPE *__restrict__ pInput,                       \
> > > > +                         TYPE *__restrict__ pOutput)                 \
> > > > +  {                                                                  \
> > > > +    unsigned int i;                                                  \
> > > > +    TYPE a, b, c, d;                                                 \
> > > > +                                                                     \
> > > > +    for (i = 0; i < N / BITS; i++)                                   \
> > > > +      {                                                                      \
> > > > +     a = *pInput++;                                                  \
> > > > +     b = *pInput++;                                                  \
> > > > +     c = *pInput++;                                                  \
> > > > +     d = *pInput++;                                                  \
> > > > +                                                                     \
> > > > +     TYPE ab, cd;                                                    \
> > > > +     ab = (__fp16)(M00_fp16 * a) + (__fp16)(M01_fp16 * b);           \
> > > > +     cd = (__fp16)(M02_fp16 * c) + (__fp16)(M03_fp16 * d);           \
> > > > +     *pOutput++ = ab + cd;                                           \
> > > > +     ab = (__fp16)(M10_fp16 * a) + (__fp16)(M11_fp16 * b);           \
> > > > +     cd = (__fp16)(M12_fp16 * c) + (__fp16)(M13_fp16 * d);           \
> > > > +     *pOutput++ = ab + cd;                                           \
> > > > +     ab = (__fp16)(M20_fp16 * a) + (__fp16)(M21_fp16 * b);           \
> > > > +     cd = (__fp16)(M22_fp16 * c) + (__fp16)(M23_fp16 * d);           \
> > > > +     *pOutput++ = ab + cd;                                           \
> > > > +     ab = (__fp16)(M30_fp16 * a) + (__fp16)(M31_fp16 * b);           \
> > > > +     cd = (__fp16)(M32_fp16 * c) + (__fp16)(M33_fp16 * d);           \
> > > > +     *pOutput++ = ab + cd;                                           \
> > > > +      }                                                                      \
> > > > +  }
> > > > +
> > > > +FUNC_FLOAT_FP16(f, __fp16, 16)
> > > > +
> > > > +/* vld4X.8 is used for signed and unsigned chars: 2 * 4.  */
> > > > +/* vld4X.16 is used for signed and unsigned shorts and __fp16: 3 * 4.  */
> > > > +/* vld4X.32 is used for signed and unsigned ints and float: 3 * 4.  */
> > > > +/* { dg-final { scan-assembler-times {vld4[0123].8\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 8 } } */
> > > > +/* { dg-final { scan-assembler-times {vld4[0123].16\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 12 } } */
> > > > +/* { dg-final { scan-assembler-times {vld4[0123].32\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 12 } } */
> > > > +/* { dg-final { scan-assembler-times {vst4[0123].8\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 8 } } */
> > > > +/* { dg-final { scan-assembler-times {vst4[0123].16\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 12 } } */
> > > > +/* { dg-final { scan-assembler-times {vst4[0123].32\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 12 } } */

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH 8/9] arm: Auto-vectorization for MVE: vld2/vst2
  2021-04-30 14:09 ` [PATCH 8/9] arm: Auto-vectorization for MVE: vld2/vst2 Christophe Lyon
  2021-05-17  9:55   ` Christophe Lyon
@ 2021-05-24 12:15   ` Kyrylo Tkachov
  1 sibling, 0 replies; 35+ messages in thread
From: Kyrylo Tkachov @ 2021-05-24 12:15 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: gcc-patches



> -----Original Message-----
> From: Gcc-patches <gcc-patches-bounces@gcc.gnu.org> On Behalf Of
> Christophe Lyon via Gcc-patches
> Sent: 30 April 2021 15:10
> To: gcc-patches@gcc.gnu.org
> Subject: [PATCH 8/9] arm: Auto-vectorization for MVE: vld2/vst2
> 
> This patch enables MVE vld2/vst2 instructions for auto-vectorization.
> We move the existing expanders from neon.md and enable them for MVE,
> calling the respective emitter.

Ok.
Thanks,
Kyrill

> 
> 2021-03-12  Christophe Lyon  <christophe.lyon@linaro.org>
> 
> 	gcc/
> 	* config/arm/neon.md (vec_load_lanesoi<mode>)
> 	(vec_store_lanesoi<mode>): Move ...
> 	* config/arm/vec-common.md: here.
> 
> 	gcc/testsuite/
> 	* gcc.target/arm/simd/mve-vld2.c: New test, derived from
> 	slp-perm-2.c
> ---
>  gcc/config/arm/neon.md                       | 14 ----
>  gcc/config/arm/vec-common.md                 | 27 ++++++++
>  gcc/testsuite/gcc.target/arm/simd/mve-vld2.c | 96
> ++++++++++++++++++++++++++++
>  3 files changed, 123 insertions(+), 14 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vld2.c
> 
> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index 6660846..bc8775c 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -5063,13 +5063,6 @@ (define_insn "neon_vld2<mode>"
>                      (const_string "neon_load2_2reg<q>")))]
>  )
> 
> -(define_expand "vec_load_lanesoi<mode>"
> -  [(set (match_operand:OI 0 "s_register_operand")
> -        (unspec:OI [(match_operand:OI 1 "neon_struct_operand")
> -                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> -		   UNSPEC_VLD2))]
> -  "TARGET_NEON")
> -
>  (define_insn "neon_vld2<mode>"
>    [(set (match_operand:OI 0 "s_register_operand" "=w")
>          (unspec:OI [(match_operand:OI 1 "neon_struct_operand" "Um")
> @@ -5197,13 +5190,6 @@ (define_insn "neon_vst2<mode>"
>                      (const_string "neon_store2_one_lane<q>")))]
>  )
> 
> -(define_expand "vec_store_lanesoi<mode>"
> -  [(set (match_operand:OI 0 "neon_struct_operand")
> -	(unspec:OI [(match_operand:OI 1 "s_register_operand")
> -                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> -                   UNSPEC_VST2))]
> -  "TARGET_NEON")
> -
>  (define_insn "neon_vst2<mode>"
>    [(set (match_operand:OI 0 "neon_struct_operand" "=Um")
>  	(unspec:OI [(match_operand:OI 1 "s_register_operand" "w")
> diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-
> common.md
> index 3fd341c..7abefea 100644
> --- a/gcc/config/arm/vec-common.md
> +++ b/gcc/config/arm/vec-common.md
> @@ -482,6 +482,33 @@ (define_expand
> "vcond_mask_<mode><v_cmp_result>"
>      }
>    else
>      gcc_unreachable ();
> +  DONE;
> +})
> 
> +(define_expand "vec_load_lanesoi<mode>"
> +  [(set (match_operand:OI 0 "s_register_operand")
> +        (unspec:OI [(match_operand:OI 1 "neon_struct_operand")
> +                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> +		   UNSPEC_VLD2))]
> +  "TARGET_NEON || TARGET_HAVE_MVE"
> +{
> +  if (TARGET_NEON)
> +    emit_insn (gen_neon_vld2<mode> (operands[0], operands[1]));
> +  else
> +    emit_insn (gen_mve_vld2q<mode> (operands[0], operands[1]));
> +  DONE;
> +})
> +
> +(define_expand "vec_store_lanesoi<mode>"
> +  [(set (match_operand:OI 0 "neon_struct_operand")
> +	(unspec:OI [(match_operand:OI 1 "s_register_operand")
> +                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> +                   UNSPEC_VST2))]
> +  "TARGET_NEON || TARGET_HAVE_MVE"
> +{
> +  if (TARGET_NEON)
> +    emit_insn (gen_neon_vst2<mode> (operands[0], operands[1]));
> +  else
> +    emit_insn (gen_mve_vst2q<mode> (operands[0], operands[1]));
>    DONE;
>  })
> diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vld2.c
> b/gcc/testsuite/gcc.target/arm/simd/mve-vld2.c
> new file mode 100644
> index 0000000..9c7c3f5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vld2.c
> @@ -0,0 +1,96 @@
> +/* { dg-do assemble } */
> +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> +/* { dg-add-options arm_v8_1m_mve_fp } */
> +/* { dg-additional-options "-O3" } */
> +
> +#include <stdint.h>
> +
> +#define M00 100
> +#define M10 216
> +#define M01 1322
> +#define M11 13
> +
> +#define N 128
> +
> +
> +/* Integer tests.  */
> +#define FUNC(SIGN, TYPE, BITS)
> 	\
> +  void foo_##SIGN##BITS##x (TYPE##BITS##_t *__restrict__ pInput,	\
> +			    TYPE##BITS##_t *__restrict__ pOutput)	\
> +  {									\
> +    unsigned int i;							\
> +    TYPE##BITS##_t  a, b;						\
> +    									\
> +    for (i = 0; i < N / BITS; i++)					\
> +      {									\
> +	a = *pInput++;							\
> +	b = *pInput++;							\
> +									\
> +	*pOutput++ = M00 * a + M01 * b;
> 	\
> +	*pOutput++ = M10 * a + M11 * b;
> 	\
> +      }									\
> +  }
> +
> +FUNC(s, int, 8)
> +FUNC(u, uint, 8)
> +FUNC(s, int, 16)
> +FUNC(u, uint, 16)
> +FUNC(s, int, 32)
> +FUNC(u, uint, 32)
> +
> +/* float test, keep the macro because it's similar to the above, but does not
> +   need the ##BITS##_t.  */
> +#define FUNC_FLOAT(SIGN, TYPE, BITS)
> 	\
> +  void foo_##SIGN##BITS##x (TYPE *__restrict__ pInput,
> 	\
> +			    TYPE *__restrict__ pOutput)			\
> +  {									\
> +    unsigned int i;							\
> +    TYPE a, b;								\
> +    									\
> +    for (i = 0; i < N / BITS; i++)					\
> +      {									\
> +	a = *pInput++;							\
> +	b = *pInput++;							\
> +									\
> +	*pOutput++ = M00 * a + M01 * b;
> 	\
> +	*pOutput++ = M10 * a + M11 * b;
> 	\
> +      }									\
> +  }
> +
> +FUNC_FLOAT(f, float, 32)
> +
> +/* __fp16 test, needs explicit casts to avoid conversions to floating-point
> and
> +   failure to vectorize.  */
> +__fp16 M00_fp16 = 100.0f16;
> +__fp16 M10_fp16 = 216.0f16;
> +__fp16 M01_fp16 = 1322.0f16;
> +__fp16 M11_fp16 = 13.0f16;
> +
> +#define FUNC_FLOAT_FP16(SIGN, TYPE, BITS)				\
> +  void foo_##SIGN##BITS##x (TYPE *__restrict__ pInput,
> 	\
> +			    TYPE *__restrict__ pOutput)			\
> +  {									\
> +    unsigned int i;							\
> +    TYPE a, b;								\
> +    									\
> +    for (i = 0; i < N / BITS; i++)					\
> +      {									\
> +	a = *pInput++;							\
> +	b = *pInput++;							\
> +									\
> +	*pOutput++ = (__fp16)(M00_fp16 * a) + (__fp16)(M01_fp16 * b);
> 	\
> +	*pOutput++ = (__fp16)(M10_fp16 * a) + (__fp16)(M11_fp16 * b);
> 	\
> +      }									\
> +  }
> +
> +FUNC_FLOAT_FP16(f, __fp16, 16)
> +
> +/* vld2X.8 is used for signed and unsigned chars: 2 pairs.  */
> +/* vld2X.16 is used for signed and unsigned shorts and __fp16: 3 pairs.  */
> +/* vld2X.32 is used for signed and unsigned ints and float: 3 pairs.  */
> +/* { dg-final { scan-assembler-times {vld2[01].8\t.q[0-9]+, q[0-9]+., } 4 } } */
> +/* { dg-final { scan-assembler-times {vld2[01].16\t.q[0-9]+, q[0-9]+., } 6 } }
> */
> +/* { dg-final { scan-assembler-times {vld2[01].32\t.q[0-9]+, q[0-9]+., } 6 } }
> */
> +/* { dg-final { scan-assembler-times {vst2[01].8\t.q[0-9]+, q[0-9]+., } 4 } } */
> +/* { dg-final { scan-assembler-times {vst2[01].16\t.q[0-9]+, q[0-9]+., } 6 } }
> */
> +/* { dg-final { scan-assembler-times {vst2[01].32\t.q[0-9]+, q[0-9]+., } 6 } }
> */
> --
> 2.7.4


^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH 9/9] arm: Auto-vectorization for MVE: vld4/vst4
  2021-04-30 14:09 ` [PATCH 9/9] arm: Auto-vectorization for MVE: vld4/vst4 Christophe Lyon
  2021-05-04 12:03   ` Andre Vieira (lists)
@ 2021-05-24 12:15   ` Kyrylo Tkachov
  1 sibling, 0 replies; 35+ messages in thread
From: Kyrylo Tkachov @ 2021-05-24 12:15 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: gcc-patches



> -----Original Message-----
> From: Gcc-patches <gcc-patches-bounces@gcc.gnu.org> On Behalf Of
> Christophe Lyon via Gcc-patches
> Sent: 30 April 2021 15:10
> To: gcc-patches@gcc.gnu.org
> Subject: [PATCH 9/9] arm: Auto-vectorization for MVE: vld4/vst4
> 
> This patch enables MVE vld4/vst4 instructions for auto-vectorization.
> We move the existing expanders from neon.md and enable them for MVE,
> calling the respective emitter.

Ok.
Thanks,
Kyrill

> 
> 2021-03-12  Christophe Lyon  <christophe.lyon@linaro.org>
> 
> 	gcc/
> 	* config/arm/neon.md (vec_load_lanesxi<mode>)
> 	(vec_store_lanesxi<mode>): Move ...
> 	* config/arm/vec-common.md: here.
> 
> 	gcc/testsuite/
> 	* gcc.target/arm/simd/mve-vld4.c: New test, derived from
> 	slp-perm-3.c
> ---
>  gcc/config/arm/neon.md                       |  20 ----
>  gcc/config/arm/vec-common.md                 |  26 +++++
>  gcc/testsuite/gcc.target/arm/simd/mve-vld4.c | 140
> +++++++++++++++++++++++++++
>  3 files changed, 166 insertions(+), 20 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vld4.c
> 
> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index bc8775c..fb58baf 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -5617,16 +5617,6 @@ (define_insn "neon_vld4<mode>"
>                      (const_string "neon_load4_4reg<q>")))]
>  )
> 
> -(define_expand "vec_load_lanesxi<mode>"
> -  [(match_operand:XI 0 "s_register_operand")
> -   (match_operand:XI 1 "neon_struct_operand")
> -   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> -  "TARGET_NEON"
> -{
> -  emit_insn (gen_neon_vld4<mode> (operands[0], operands[1]));
> -  DONE;
> -})
> -
>  (define_expand "neon_vld4<mode>"
>    [(match_operand:XI 0 "s_register_operand")
>     (match_operand:XI 1 "neon_struct_operand")
> @@ -5818,16 +5808,6 @@ (define_insn "neon_vst4<mode>"
>                      (const_string "neon_store4_4reg<q>")))]
>  )
> 
> -(define_expand "vec_store_lanesxi<mode>"
> -  [(match_operand:XI 0 "neon_struct_operand")
> -   (match_operand:XI 1 "s_register_operand")
> -   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> -  "TARGET_NEON"
> -{
> -  emit_insn (gen_neon_vst4<mode> (operands[0], operands[1]));
> -  DONE;
> -})
> -
>  (define_expand "neon_vst4<mode>"
>    [(match_operand:XI 0 "neon_struct_operand")
>     (match_operand:XI 1 "s_register_operand")
> diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-
> common.md
> index 7abefea..d46b78d 100644
> --- a/gcc/config/arm/vec-common.md
> +++ b/gcc/config/arm/vec-common.md
> @@ -512,3 +512,29 @@ (define_expand "vec_store_lanesoi<mode>"
>      emit_insn (gen_mve_vst2q<mode> (operands[0], operands[1]));
>    DONE;
>  })
> +
> +(define_expand "vec_load_lanesxi<mode>"
> +  [(match_operand:XI 0 "s_register_operand")
> +   (match_operand:XI 1 "neon_struct_operand")
> +   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> +  "TARGET_NEON || TARGET_HAVE_MVE"
> +{
> +  if (TARGET_NEON)
> +    emit_insn (gen_neon_vld4<mode> (operands[0], operands[1]));
> +  else
> +    emit_insn (gen_mve_vld4q<mode> (operands[0], operands[1]));
> +  DONE;
> +})
> +
> +(define_expand "vec_store_lanesxi<mode>"
> +  [(match_operand:XI 0 "neon_struct_operand")
> +   (match_operand:XI 1 "s_register_operand")
> +   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> +  "TARGET_NEON || TARGET_HAVE_MVE"
> +{
> +  if (TARGET_NEON)
> +    emit_insn (gen_neon_vst4<mode> (operands[0], operands[1]));
> +  else
> +    emit_insn (gen_mve_vst4q<mode> (operands[0], operands[1]));
> +  DONE;
> +})
> diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vld4.c
> b/gcc/testsuite/gcc.target/arm/simd/mve-vld4.c
> new file mode 100644
> index 0000000..ce3e755
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vld4.c
> @@ -0,0 +1,140 @@
> +/* { dg-do assemble } */
> +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> +/* { dg-add-options arm_v8_1m_mve_fp } */
> +/* { dg-additional-options "-O3" } */
> +
> +#include <stdint.h>
> +
> +#define M00 100
> +#define M10 216
> +#define M20 23
> +#define M30 237
> +#define M01 1322
> +#define M11 13
> +#define M21 27271
> +#define M31 2280
> +#define M02 74
> +#define M12 191
> +#define M22 500
> +#define M32 111
> +#define M03 134
> +#define M13 117
> +#define M23 11
> +#define M33 771
> +
> +#define N 128
> +
> +/* Integer tests.  */
> +#define FUNC(SIGN, TYPE, BITS)
> 	\
> +  void foo_##SIGN##BITS##x (TYPE##BITS##_t *__restrict__ pInput,	\
> +			    TYPE##BITS##_t *__restrict__ pOutput)	\
> +  {									\
> +    unsigned int i;							\
> +    TYPE##BITS##_t  a, b, c, d;						\
> +    									\
> +    for (i = 0; i < N / BITS; i++)					\
> +      {									\
> +	a = *pInput++;							\
> +	b = *pInput++;							\
> +	c = *pInput++;							\
> +	d = *pInput++;							\
> +									\
> +	*pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d;		\
> +	*pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d;		\
> +	*pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d;		\
> +	*pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d;		\
> +      }									\
> +  }
> +
> +FUNC(s, int, 8)
> +FUNC(u, uint, 8)
> +FUNC(s, int, 16)
> +FUNC(u, uint, 16)
> +FUNC(s, int, 32)
> +FUNC(u, uint, 32)
> +
> +/* float test, keep the macro because it's similar to the above, but does not
> +   need the ##BITS##_t.  */
> +#define FUNC_FLOAT(SIGN, TYPE, BITS)
> 		\
> +  void foo_##SIGN##BITS##x (TYPE *__restrict__ pInput,
> 	\
> +			    TYPE *__restrict__ pOutput)			\
> +  {									\
> +    unsigned int i;							\
> +    TYPE a, b, c, d;							\
> +    									\
> +    for (i = 0; i < N / BITS; i++)					\
> +      {									\
> +	a = *pInput++;							\
> +	b = *pInput++;							\
> +	c = *pInput++;							\
> +	d = *pInput++;							\
> +									\
> +	*pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d;		\
> +	*pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d;		\
> +	*pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d;		\
> +	*pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d;		\
> +      }									\
> +  }
> +
> +FUNC_FLOAT(f, float, 32)
> +
> +/* __fp16 test, needs explicit casts to avoid conversions to floating-point
> and
> +   failure to vectorize.  */
> +__fp16 M00_fp16 = 100.0f16;
> +__fp16 M10_fp16 = 216.0f16;
> +__fp16 M20_fp16 = 23.0f16;
> +__fp16 M30_fp16 = 237.0f16;
> +__fp16 M01_fp16 = 1322.0f16;
> +__fp16 M11_fp16 = 13.0f16;
> +__fp16 M21_fp16 = 27271.0f16;
> +__fp16 M31_fp16 = 2280.0f16;
> +__fp16 M02_fp16 = 74.0f16;
> +__fp16 M12_fp16 = 191.0f16;
> +__fp16 M22_fp16 = 500.0f16;
> +__fp16 M32_fp16 = 111.0f16;
> +__fp16 M03_fp16 = 134.0f16;
> +__fp16 M13_fp16 = 117.0f16;
> +__fp16 M23_fp16 = 11.0f16;
> +__fp16 M33_fp16 = 771.0f16;
> +
> +#define FUNC_FLOAT_FP16(SIGN, TYPE, BITS)				\
> +  void foo_##SIGN##BITS##x (TYPE *__restrict__ pInput,
> 	\
> +			    TYPE *__restrict__ pOutput)			\
> +  {									\
> +    unsigned int i;							\
> +    TYPE a, b, c, d;							\
> +    									\
> +    for (i = 0; i < N / BITS; i++)					\
> +      {									\
> +	a = *pInput++;							\
> +	b = *pInput++;							\
> +	c = *pInput++;							\
> +	d = *pInput++;							\
> +									\
> +	TYPE ab, cd;							\
> +	ab = (__fp16)(M00_fp16 * a) + (__fp16)(M01_fp16 * b);		\
> +	cd = (__fp16)(M02_fp16 * c) + (__fp16)(M03_fp16 * d);		\
> +	*pOutput++ = ab + cd;						\
> +	ab = (__fp16)(M10_fp16 * a) + (__fp16)(M11_fp16 * b);		\
> +	cd = (__fp16)(M12_fp16 * c) + (__fp16)(M13_fp16 * d);		\
> +	*pOutput++ = ab + cd;						\
> +	ab = (__fp16)(M20_fp16 * a) + (__fp16)(M21_fp16 * b);		\
> +	cd = (__fp16)(M22_fp16 * c) + (__fp16)(M23_fp16 * d);		\
> +	*pOutput++ = ab + cd;						\
> +	ab = (__fp16)(M30_fp16 * a) + (__fp16)(M31_fp16 * b);		\
> +	cd = (__fp16)(M32_fp16 * c) + (__fp16)(M33_fp16 * d);		\
> +	*pOutput++ = ab + cd;						\
> +      }									\
> +  }
> +
> +FUNC_FLOAT_FP16(f, __fp16, 16)
> +
> +/* vld4X.8 is used for signed and unsigned chars: 2 * 4.  */
> +/* vld4X.16 is used for signed and unsigned shorts and __fp16: 3 * 4.  */
> +/* vld4X.32 is used for signed and unsigned ints and float: 3 * 4.  */
> +/* { dg-final { scan-assembler-times {vld4[0123].8\t.q[0-9]+, q[0-9]+, q[0-
> 9]+, q[0-9]+., } 8 } } */
> +/* { dg-final { scan-assembler-times {vld4[0123].16\t.q[0-9]+, q[0-9]+, q[0-
> 9]+, q[0-9]+., } 12 } } */
> +/* { dg-final { scan-assembler-times {vld4[0123].32\t.q[0-9]+, q[0-9]+, q[0-
> 9]+, q[0-9]+., } 12 } } */
> +/* { dg-final { scan-assembler-times {vst4[0123].8\t.q[0-9]+, q[0-9]+, q[0-
> 9]+, q[0-9]+., } 8 } } */
> +/* { dg-final { scan-assembler-times {vst4[0123].16\t.q[0-9]+, q[0-9]+, q[0-
> 9]+, q[0-9]+., } 12 } } */
> +/* { dg-final { scan-assembler-times {vst4[0123].32\t.q[0-9]+, q[0-9]+, q[0-
> 9]+, q[0-9]+., } 12 } } */
> --
> 2.7.4


^ permalink raw reply	[flat|nested] 35+ messages in thread

end of thread, other threads:[~2021-05-24 12:16 UTC | newest]

Thread overview: 35+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-30 14:09 [PATCH 1/9] arm: MVE: Convert vcmp[eq|ne]* in arm_mve.h to use only 's' builtin version Christophe Lyon
2021-04-30 14:09 ` [PATCH 2/9] arm: MVE: Cleanup vcmpne/vcmpeq builtins Christophe Lyon
2021-05-10 11:57   ` Kyrylo Tkachov
2021-04-30 14:09 ` [PATCH 3/9] arm: MVE: Remove _s and _u suffixes from vcmp* builtins Christophe Lyon
2021-05-10 11:58   ` Kyrylo Tkachov
2021-04-30 14:09 ` [PATCH 4/9] arm: MVE: Factorize all vcmp* integer patterns Christophe Lyon
2021-05-10 11:59   ` Kyrylo Tkachov
2021-04-30 14:09 ` [PATCH 5/9] arm: MVE: Factorize vcmp_*f* Christophe Lyon
2021-05-10 11:59   ` Kyrylo Tkachov
2021-04-30 14:09 ` [PATCH 6/9] arm: Auto-vectorization for MVE: vcmp Christophe Lyon
2021-05-04 11:29   ` Andre Vieira (lists)
2021-05-04 13:41     ` Christophe Lyon
2021-05-05 14:08       ` Christophe Lyon
2021-05-17  9:54         ` Christophe Lyon
2021-05-17 10:35         ` Kyrylo Tkachov
2021-05-17 12:31           ` Christophe Lyon
2021-04-30 14:09 ` [PATCH 7/9] arm: Auto-vectorization for MVE: add __fp16 support to VCMP Christophe Lyon
2021-05-04 11:48   ` Andre Vieira (lists)
2021-05-04 13:43     ` Christophe Lyon
2021-05-04 17:03       ` Christophe Lyon
2021-05-05 14:09         ` Christophe Lyon
2021-05-17  9:54           ` Christophe Lyon
2021-05-17 10:49           ` Kyrylo Tkachov
2021-04-30 14:09 ` [PATCH 8/9] arm: Auto-vectorization for MVE: vld2/vst2 Christophe Lyon
2021-05-17  9:55   ` Christophe Lyon
2021-05-24  7:19     ` Christophe Lyon
2021-05-24 12:15   ` Kyrylo Tkachov
2021-04-30 14:09 ` [PATCH 9/9] arm: Auto-vectorization for MVE: vld4/vst4 Christophe Lyon
2021-05-04 12:03   ` Andre Vieira (lists)
2021-05-04 14:57     ` Christophe Lyon
2021-05-17  9:55       ` Christophe Lyon
2021-05-24  7:20         ` Christophe Lyon
2021-05-24 12:15   ` Kyrylo Tkachov
2021-05-10 11:21 ` [PATCH 1/9] arm: MVE: Convert vcmp[eq|ne]* in arm_mve.h to use only 's' builtin version Christophe Lyon
2021-05-10 11:54 ` Kyrylo Tkachov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).