public inbox for gcc-patches@gcc.gnu.org
* [AArch64, 1/4] Add the missing support of vfms_n_f32, vfmsq_n_f32, vfmsq_n_f64
@ 2016-05-16  9:09 Jiong Wang
  2016-05-17 12:20 ` James Greenhalgh
  0 siblings, 1 reply; 8+ messages in thread
From: Jiong Wang @ 2016-05-16  9:09 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 1580 bytes --]

Support for vfma_n_f64, vfms_n_f32, vfmsq_n_f32 and vfmsq_n_f64 is
missing from the current GCC arm_neon.h.

Meanwhile, besides "(fma (vec_dup (vec_select)))", fma by element can
also come from "(fma (vec_dup (scalar)))", where the scalar value is
already sitting in a vector register and is then duplicated to the other
lanes, with no lane size change.

This patch implements this and can generate better code in some
contexts. For example:

cat test.c
===
typedef __Float32x2_t float32x2_t;
typedef float float32_t;

float32x2_t
vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
{
       return __builtin_aarch64_fmav2sf (__b, (float32x2_t) {__c, __c}, __a);
}

before (-O2)
===
vfma_n_f32:
         dup     v2.2s, v2.s[0]
         fmla    v0.2s, v1.2s, v2.2s
         ret
after
===
vfma_n_f32:
         fmla    v0.2s, v1.2s, v2.s[0]
         ret
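
Similarly, once the new intrinsics are in place, a minimal caller like
the sketch below (illustrative only; "foo" and "test2.c" are made-up
names) should compile to a single fmls by element via the renamed
*aarch64_fnma4_elt_from_dup<mode> pattern:

cat test2.c
===
#include <arm_neon.h>

/* vfms_n_f32 computes __a - __b * __c element-wise,
   i.e. fma (-__b, vdup_n_f32 (__c), __a).  */
float32x2_t
foo (float32x2_t a, float32x2_t b, float32_t c)
{
  return vfms_n_f32 (a, b, c);
}

expected (-O2)
===
foo:
         fmls    v0.2s, v1.2s, v2.s[0]
         ret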

OK for trunk?

2016-05-16  Jiong Wang <jiong.wang@arm.com>

gcc/
   * config/aarch64/aarch64-simd.md (*aarch64_fma4_elt_to_128df): Rename
   to *aarch64_fma4_elt_from_dup<mode>.
   (*aarch64_fnma4_elt_to_128df): Rename to
   *aarch64_fnma4_elt_from_dup<mode>.
   * config/aarch64/arm_neon.h (vfma_n_f64): New.
   (vfms_n_f32): Likewise.
   (vfms_n_f64): Likewise.
   (vfmsq_n_f32): Likewise.
   (vfmsq_n_f64): Likewise.

gcc/testsuite/
   * gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c: Use standard
   syntax.
   * gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c: Likewise.
   * gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h: New entry for
   float64x1.
   * gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c: New.

[-- Attachment #2: 1.patch --]
[-- Type: text/x-patch, Size: 28513 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index bd73bce64414e8bc01732d14311d742cf28f4586..90eaca176b4706e6cc42f16ce2c956f1c8ad17b1 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1579,16 +1579,16 @@
   [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")]
 )
 
-(define_insn "*aarch64_fma4_elt_to_128df"
-  [(set (match_operand:V2DF 0 "register_operand" "=w")
-    (fma:V2DF
-      (vec_duplicate:V2DF
-	  (match_operand:DF 1 "register_operand" "w"))
-      (match_operand:V2DF 2 "register_operand" "w")
-      (match_operand:V2DF 3 "register_operand" "0")))]
+(define_insn "*aarch64_fma4_elt_from_dup<mode>"
+  [(set (match_operand:VMUL 0 "register_operand" "=w")
+    (fma:VMUL
+      (vec_duplicate:VMUL
+	  (match_operand:<VEL> 1 "register_operand" "w"))
+      (match_operand:VMUL 2 "register_operand" "w")
+      (match_operand:VMUL 3 "register_operand" "0")))]
   "TARGET_SIMD"
-  "fmla\\t%0.2d, %2.2d, %1.2d[0]"
-  [(set_attr "type" "neon_fp_mla_d_scalar_q")]
+  "fmla\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"
+  [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")]
 )
 
 (define_insn "*aarch64_fma4_elt_to_64v2df"
@@ -1656,17 +1656,17 @@
   [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")]
 )
 
-(define_insn "*aarch64_fnma4_elt_to_128df"
-  [(set (match_operand:V2DF 0 "register_operand" "=w")
-    (fma:V2DF
-      (neg:V2DF
-        (match_operand:V2DF 2 "register_operand" "w"))
-      (vec_duplicate:V2DF
-	(match_operand:DF 1 "register_operand" "w"))
-      (match_operand:V2DF 3 "register_operand" "0")))]
-  "TARGET_SIMD"
-  "fmls\\t%0.2d, %2.2d, %1.2d[0]"
-  [(set_attr "type" "neon_fp_mla_d_scalar_q")]
+(define_insn "*aarch64_fnma4_elt_from_dup<mode>"
+  [(set (match_operand:VMUL 0 "register_operand" "=w")
+    (fma:VMUL
+      (neg:VMUL
+        (match_operand:VMUL 2 "register_operand" "w"))
+      (vec_duplicate:VMUL
+	(match_operand:<VEL> 1 "register_operand" "w"))
+      (match_operand:VMUL 3 "register_operand" "0")))]
+  "TARGET_SIMD"
+  "fmls\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"
+  [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")]
 )
 
 (define_insn "*aarch64_fnma4_elt_to_64v2df"
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 2612a325718918cf7cd808f28c09c9c4c7b11c07..ca7ace5aa656163826569d046fcbf02f9f7d4d6c 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -14456,6 +14456,12 @@ vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
   return __builtin_aarch64_fmav2sf (__b, vdup_n_f32 (__c), __a);
 }
 
+__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+vfma_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c)
+{
+  return (float64x1_t) {__b[0] * __c + __a[0]};
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vfmaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
 {
@@ -14597,6 +14603,29 @@ vfmsq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c)
   return __builtin_aarch64_fmav2df (-__b, __c, __a);
 }
 
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vfms_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
+{
+  return __builtin_aarch64_fmav2sf (-__b, vdup_n_f32 (__c), __a);
+}
+
+__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+vfms_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c)
+{
+  return (float64x1_t) {-__b[0] * __c + __a[0]};
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vfmsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
+{
+  return __builtin_aarch64_fmav4sf (-__b, vdupq_n_f32 (__c), __a);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vfmsq_n_f64 (float64x2_t __a, float64x2_t __b, float64_t __c)
+{
+  return __builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c), __a);
+}
 
 /* vfms_lane  */
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
index 49fbd843e507ede8aa81d02c175a82a1221750a4..cf90825f87391b72aca9a29980210d21f4321c04 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
@@ -136,6 +136,7 @@ static ARRAY(result, poly, 16, 4);
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
 static ARRAY(result, float, 16, 4);
 #endif
+static ARRAY(result, float, 64, 1);
 static ARRAY(result, float, 32, 2);
 static ARRAY(result, int, 8, 16);
 static ARRAY(result, int, 16, 8);
@@ -169,6 +170,7 @@ extern ARRAY(expected, poly, 8, 8);
 extern ARRAY(expected, poly, 16, 4);
 extern ARRAY(expected, hfloat, 16, 4);
 extern ARRAY(expected, hfloat, 32, 2);
+extern ARRAY(expected, hfloat, 64, 1);
 extern ARRAY(expected, int, 8, 16);
 extern ARRAY(expected, int, 16, 8);
 extern ARRAY(expected, int, 32, 4);
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c
new file mode 100644
index 0000000000000000000000000000000000000000..26223763c59c849607b5320f6ec37098a556ce2e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c
@@ -0,0 +1,490 @@
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+#define A0 123.4f
+#define A1 -3.8f
+#define A2 -29.4f
+#define A3 (__builtin_inff ())
+#define A4 0.0f
+#define A5 24.0f
+#define A6 124.0f
+#define A7 1024.0f
+
+#define B0 -5.8f
+#define B1 -0.0f
+#define B2 -10.8f
+#define B3 10.0f
+#define B4 23.4f
+#define B5 -1234.8f
+#define B6 8.9f
+#define B7 4.0f
+
+#define E0 9.8f
+#define E1 -1024.0f
+#define E2 (-__builtin_inff ())
+#define E3 479.0f
+float32_t elem0 = E0;
+float32_t elem1 = E1;
+float32_t elem2 = E2;
+float32_t elem3 = E3;
+
+#define DA0 1231234.4
+#define DA1 -3.8
+#define DA2 -2980.4
+#define DA3 -5.8
+#define DA4 0.01123
+#define DA5 24.0
+#define DA6 124.12345
+#define DA7 1024.0
+
+#define DB0 -5.8
+#define DB1 (__builtin_inf ())
+#define DB2 -105.8
+#define DB3 10.0
+#define DB4 (-__builtin_inf ())
+#define DB5 -1234.8
+#define DB6 848.9
+#define DB7 44444.0
+
+#define DE0 9.8
+#define DE1 -1024.0
+#define DE2 105.8
+#define DE3 479.0
+float64_t delem0 = DE0;
+float64_t delem1 = DE1;
+float64_t delem2 = DE2;
+float64_t delem3 = DE3;
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA)
+
+/* Expected results for vfms_n.  */
+
+VECT_VAR_DECL(expectedfms0, float, 32, 2) [] = {A0 + -B0 * E0, A1 + -B1 * E0};
+VECT_VAR_DECL(expectedfms1, float, 32, 2) [] = {A2 + -B2 * E1, A3 + -B3 * E1};
+VECT_VAR_DECL(expectedfms2, float, 32, 2) [] = {A4 + -B4 * E2, A5 + -B5 * E2};
+VECT_VAR_DECL(expectedfms3, float, 32, 2) [] = {A6 + -B6 * E3, A7 + -B7 * E3};
+VECT_VAR_DECL(expectedfma0, float, 32, 2) [] = {A0 + B0 * E0, A1 + B1 * E0};
+VECT_VAR_DECL(expectedfma1, float, 32, 2) [] = {A2 + B2 * E1, A3 + B3 * E1};
+VECT_VAR_DECL(expectedfma2, float, 32, 2) [] = {A4 + B4 * E2, A5 + B5 * E2};
+VECT_VAR_DECL(expectedfma3, float, 32, 2) [] = {A6 + B6 * E3, A7 + B7 * E3};
+
+hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 2) =
+  (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 2);
+hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 2) =
+  (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 2);
+hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 2) =
+  (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 2);
+hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 2) =
+  (hfloat32_t *) VECT_VAR (expectedfms3, float, 32, 2);
+hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 2) =
+  (hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 2);
+hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 2) =
+  (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 2);
+hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 2) =
+  (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 2);
+hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 2) =
+  (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 2);
+
+
+VECT_VAR_DECL(expectedfms0, float, 32, 4) [] = {A0 + -B0 * E0, A1 + -B1 * E0,
+						A2 + -B2 * E0, A3 + -B3 * E0};
+VECT_VAR_DECL(expectedfms1, float, 32, 4) [] = {A4 + -B4 * E1, A5 + -B5 * E1,
+						A6 + -B6 * E1, A7 + -B7 * E1};
+VECT_VAR_DECL(expectedfms2, float, 32, 4) [] = {A0 + -B0 * E2, A2 + -B2 * E2,
+						A4 + -B4 * E2, A6 + -B6 * E2};
+VECT_VAR_DECL(expectedfms3, float, 32, 4) [] = {A1 + -B1 * E3, A3 + -B3 * E3,
+						A5 + -B5 * E3, A7 + -B7 * E3};
+VECT_VAR_DECL(expectedfma0, float, 32, 4) [] = {A0 + B0 * E0, A1 + B1 * E0,
+						A2 + B2 * E0, A3 + B3 * E0};
+VECT_VAR_DECL(expectedfma1, float, 32, 4) [] = {A4 + B4 * E1, A5 + B5 * E1,
+						A6 + B6 * E1, A7 + B7 * E1};
+VECT_VAR_DECL(expectedfma2, float, 32, 4) [] = {A0 + B0 * E2, A2 + B2 * E2,
+						A4 + B4 * E2, A6 + B6 * E2};
+VECT_VAR_DECL(expectedfma3, float, 32, 4) [] = {A1 + B1 * E3, A3 + B3 * E3,
+						A5 + B5 * E3, A7 + B7 * E3};
+
+hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 4) =
+  (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 4);
+hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 4) =
+  (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 4);
+hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 4) =
+  (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 4);
+hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 4) =
+  (hfloat32_t *) VECT_VAR (expectedfms3, float, 32, 4);
+hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 4) =
+  (hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 4);
+hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 4) =
+  (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 4);
+hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 4) =
+  (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 4);
+hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 4) =
+  (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 4);
+
+VECT_VAR_DECL(expectedfms0, float, 64, 2) [] = {DA0 + -DB0 * DE0,
+						DA1 + -DB1 * DE0};
+VECT_VAR_DECL(expectedfms1, float, 64, 2) [] = {DA2 + -DB2 * DE1,
+						DA3 + -DB3 * DE1};
+VECT_VAR_DECL(expectedfms2, float, 64, 2) [] = {DA4 + -DB4 * DE2,
+						DA5 + -DB5 * DE2};
+VECT_VAR_DECL(expectedfms3, float, 64, 2) [] = {DA6 + -DB6 * DE3,
+						DA7 + -DB7 * DE3};
+VECT_VAR_DECL(expectedfma0, float, 64, 2) [] = {DA0 + DB0 * DE0,
+						DA1 + DB1 * DE0};
+VECT_VAR_DECL(expectedfma1, float, 64, 2) [] = {DA2 + DB2 * DE1,
+						DA3 + DB3 * DE1};
+VECT_VAR_DECL(expectedfma2, float, 64, 2) [] = {DA4 + DB4 * DE2,
+						DA5 + DB5 * DE2};
+VECT_VAR_DECL(expectedfma3, float, 64, 2) [] = {DA6 + DB6 * DE3,
+						DA7 + DB7 * DE3};
+hfloat64_t * VECT_VAR (expectedfms0_static, hfloat, 64, 2) =
+  (hfloat64_t *) VECT_VAR (expectedfms0, float, 64, 2);
+hfloat64_t * VECT_VAR (expectedfms1_static, hfloat, 64, 2) =
+  (hfloat64_t *) VECT_VAR (expectedfms1, float, 64, 2);
+hfloat64_t * VECT_VAR (expectedfms2_static, hfloat, 64, 2) =
+  (hfloat64_t *) VECT_VAR (expectedfms2, float, 64, 2);
+hfloat64_t * VECT_VAR (expectedfms3_static, hfloat, 64, 2) =
+  (hfloat64_t *) VECT_VAR (expectedfms3, float, 64, 2);
+hfloat64_t * VECT_VAR (expectedfma0_static, hfloat, 64, 2) =
+  (hfloat64_t *) VECT_VAR (expectedfma0, float, 64, 2);
+hfloat64_t * VECT_VAR (expectedfma1_static, hfloat, 64, 2) =
+  (hfloat64_t *) VECT_VAR (expectedfma1, float, 64, 2);
+hfloat64_t * VECT_VAR (expectedfma2_static, hfloat, 64, 2) =
+  (hfloat64_t *) VECT_VAR (expectedfma2, float, 64, 2);
+hfloat64_t * VECT_VAR (expectedfma3_static, hfloat, 64, 2) =
+  (hfloat64_t *) VECT_VAR (expectedfma3, float, 64, 2);
+
+VECT_VAR_DECL(expectedfms0, float, 64, 1) [] = {DA0 + -DB0 * DE0};
+VECT_VAR_DECL(expectedfms1, float, 64, 1) [] = {DA2 + -DB2 * DE1};
+VECT_VAR_DECL(expectedfms2, float, 64, 1) [] = {DA4 + -DB4 * DE2};
+VECT_VAR_DECL(expectedfms3, float, 64, 1) [] = {DA6 + -DB6 * DE3};
+VECT_VAR_DECL(expectedfma0, float, 64, 1) [] = {DA0 + DB0 * DE0};
+VECT_VAR_DECL(expectedfma1, float, 64, 1) [] = {DA2 + DB2 * DE1};
+VECT_VAR_DECL(expectedfma2, float, 64, 1) [] = {DA4 + DB4 * DE2};
+VECT_VAR_DECL(expectedfma3, float, 64, 1) [] = {DA6 + DB6 * DE3};
+
+hfloat64_t * VECT_VAR (expectedfms0_static, hfloat, 64, 1) =
+  (hfloat64_t *) VECT_VAR (expectedfms0, float, 64, 1);
+hfloat64_t * VECT_VAR (expectedfms1_static, hfloat, 64, 1) =
+  (hfloat64_t *) VECT_VAR (expectedfms1, float, 64, 1);
+hfloat64_t * VECT_VAR (expectedfms2_static, hfloat, 64, 1) =
+  (hfloat64_t *) VECT_VAR (expectedfms2, float, 64, 1);
+hfloat64_t * VECT_VAR (expectedfms3_static, hfloat, 64, 1) =
+  (hfloat64_t *) VECT_VAR (expectedfms3, float, 64, 1);
+hfloat64_t * VECT_VAR (expectedfma0_static, hfloat, 64, 1) =
+  (hfloat64_t *) VECT_VAR (expectedfma0, float, 64, 1);
+hfloat64_t * VECT_VAR (expectedfma1_static, hfloat, 64, 1) =
+  (hfloat64_t *) VECT_VAR (expectedfma1, float, 64, 1);
+hfloat64_t * VECT_VAR (expectedfma2_static, hfloat, 64, 1) =
+  (hfloat64_t *) VECT_VAR (expectedfma2, float, 64, 1);
+hfloat64_t * VECT_VAR (expectedfma3_static, hfloat, 64, 1) =
+  (hfloat64_t *) VECT_VAR (expectedfma3, float, 64, 1);
+
+void exec_vfma_vfms_n (void)
+{
+#undef TEST_MSG
+#define TEST_MSG "VFMS_VFMA_N (FP32)"
+  clean_results ();
+
+  DECL_VARIABLE(vsrc_1, float, 32, 2);
+  DECL_VARIABLE(vsrc_2, float, 32, 2);
+  VECT_VAR_DECL (buf_src_1, float, 32, 2) [] = {A0, A1};
+  VECT_VAR_DECL (buf_src_2, float, 32, 2) [] = {B0, B1};
+  VLOAD (vsrc_1, buf_src_1, , float, f, 32, 2);
+  VLOAD (vsrc_2, buf_src_2, , float, f, 32, 2);
+  DECL_VARIABLE (vector_res, float, 32, 2) =
+    vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
+		VECT_VAR (vsrc_2, float, 32, 2), elem0);
+  vst1_f32 (VECT_VAR (result, float, 32, 2),
+	    VECT_VAR (vector_res, float, 32, 2));
+  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms0_static, "");
+  VECT_VAR (vector_res, float, 32, 2) =
+    vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
+		VECT_VAR (vsrc_2, float, 32, 2), elem0);
+  vst1_f32 (VECT_VAR (result, float, 32, 2),
+	    VECT_VAR (vector_res, float, 32, 2));
+  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma0_static, "");
+
+  VECT_VAR_DECL (buf_src_3, float, 32, 2) [] = {A2, A3};
+  VECT_VAR_DECL (buf_src_4, float, 32, 2) [] = {B2, B3};
+  VLOAD (vsrc_1, buf_src_3, , float, f, 32, 2);
+  VLOAD (vsrc_2, buf_src_4, , float, f, 32, 2);
+  VECT_VAR (vector_res, float, 32, 2) =
+    vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
+		VECT_VAR (vsrc_2, float, 32, 2), elem1);
+  vst1_f32 (VECT_VAR (result, float, 32, 2),
+	    VECT_VAR (vector_res, float, 32, 2));
+  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms1_static, "");
+  VECT_VAR (vector_res, float, 32, 2) =
+    vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
+		VECT_VAR (vsrc_2, float, 32, 2), elem1);
+  vst1_f32 (VECT_VAR (result, float, 32, 2),
+	    VECT_VAR (vector_res, float, 32, 2));
+  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma1_static, "");
+
+  VECT_VAR_DECL (buf_src_5, float, 32, 2) [] = {A4, A5};
+  VECT_VAR_DECL (buf_src_6, float, 32, 2) [] = {B4, B5};
+  VLOAD (vsrc_1, buf_src_5, , float, f, 32, 2);
+  VLOAD (vsrc_2, buf_src_6, , float, f, 32, 2);
+  VECT_VAR (vector_res, float, 32, 2) =
+    vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
+		VECT_VAR (vsrc_2, float, 32, 2), elem2);
+  vst1_f32 (VECT_VAR (result, float, 32, 2),
+	    VECT_VAR (vector_res, float, 32, 2));
+  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms2_static, "");
+  VECT_VAR (vector_res, float, 32, 2) =
+    vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
+		VECT_VAR (vsrc_2, float, 32, 2), elem2);
+  vst1_f32 (VECT_VAR (result, float, 32, 2),
+	    VECT_VAR (vector_res, float, 32, 2));
+  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma2_static, "");
+
+  VECT_VAR_DECL (buf_src_7, float, 32, 2) [] = {A6, A7};
+  VECT_VAR_DECL (buf_src_8, float, 32, 2) [] = {B6, B7};
+  VLOAD (vsrc_1, buf_src_7, , float, f, 32, 2);
+  VLOAD (vsrc_2, buf_src_8, , float, f, 32, 2);
+  VECT_VAR (vector_res, float, 32, 2) =
+    vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
+		VECT_VAR (vsrc_2, float, 32, 2), elem3);
+  vst1_f32 (VECT_VAR (result, float, 32, 2),
+	    VECT_VAR (vector_res, float, 32, 2));
+  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms3_static, "");
+  VECT_VAR (vector_res, float, 32, 2) =
+    vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
+		VECT_VAR (vsrc_2, float, 32, 2), elem3);
+  vst1_f32 (VECT_VAR (result, float, 32, 2),
+	    VECT_VAR (vector_res, float, 32, 2));
+  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma3_static, "");
+
+#undef TEST_MSG
+#define TEST_MSG "VFMSQ_VFMAQ_N (FP32)"
+  clean_results ();
+
+  DECL_VARIABLE(vsrc_1, float, 32, 4);
+  DECL_VARIABLE(vsrc_2, float, 32, 4);
+  VECT_VAR_DECL (buf_src_1, float, 32, 4) [] = {A0, A1, A2, A3};
+  VECT_VAR_DECL (buf_src_2, float, 32, 4) [] = {B0, B1, B2, B3};
+  VLOAD (vsrc_1, buf_src_1, q, float, f, 32, 4);
+  VLOAD (vsrc_2, buf_src_2, q, float, f, 32, 4);
+  DECL_VARIABLE (vector_res, float, 32, 4) =
+    vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
+		 VECT_VAR (vsrc_2, float, 32, 4), elem0);
+  vst1q_f32 (VECT_VAR (result, float, 32, 4),
+	     VECT_VAR (vector_res, float, 32, 4));
+  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms0_static, "");
+  VECT_VAR (vector_res, float, 32, 4) =
+    vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
+		 VECT_VAR (vsrc_2, float, 32, 4), elem0);
+  vst1q_f32 (VECT_VAR (result, float, 32, 4),
+	     VECT_VAR (vector_res, float, 32, 4));
+  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma0_static, "");
+
+  VECT_VAR_DECL (buf_src_3, float, 32, 4) [] = {A4, A5, A6, A7};
+  VECT_VAR_DECL (buf_src_4, float, 32, 4) [] = {B4, B5, B6, B7};
+  VLOAD (vsrc_1, buf_src_3, q, float, f, 32, 4);
+  VLOAD (vsrc_2, buf_src_4, q, float, f, 32, 4);
+  VECT_VAR (vector_res, float, 32, 4) =
+    vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
+		 VECT_VAR (vsrc_2, float, 32, 4), elem1);
+  vst1q_f32 (VECT_VAR (result, float, 32, 4),
+	     VECT_VAR (vector_res, float, 32, 4));
+  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms1_static, "");
+  VECT_VAR (vector_res, float, 32, 4) =
+    vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
+		 VECT_VAR (vsrc_2, float, 32, 4), elem1);
+  vst1q_f32 (VECT_VAR (result, float, 32, 4),
+	     VECT_VAR (vector_res, float, 32, 4));
+  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma1_static, "");
+
+  VECT_VAR_DECL (buf_src_5, float, 32, 4) [] = {A0, A2, A4, A6};
+  VECT_VAR_DECL (buf_src_6, float, 32, 4) [] = {B0, B2, B4, B6};
+  VLOAD (vsrc_1, buf_src_5, q, float, f, 32, 4);
+  VLOAD (vsrc_2, buf_src_6, q, float, f, 32, 4);
+  VECT_VAR (vector_res, float, 32, 4) =
+    vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
+		 VECT_VAR (vsrc_2, float, 32, 4), elem2);
+  vst1q_f32 (VECT_VAR (result, float, 32, 4),
+	     VECT_VAR (vector_res, float, 32, 4));
+  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms2_static, "");
+  VECT_VAR (vector_res, float, 32, 4) =
+    vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
+		 VECT_VAR (vsrc_2, float, 32, 4), elem2);
+  vst1q_f32 (VECT_VAR (result, float, 32, 4),
+	     VECT_VAR (vector_res, float, 32, 4));
+  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma2_static, "");
+
+  VECT_VAR_DECL (buf_src_7, float, 32, 4) [] = {A1, A3, A5, A7};
+  VECT_VAR_DECL (buf_src_8, float, 32, 4) [] = {B1, B3, B5, B7};
+  VLOAD (vsrc_1, buf_src_7, q, float, f, 32, 4);
+  VLOAD (vsrc_2, buf_src_8, q, float, f, 32, 4);
+  VECT_VAR (vector_res, float, 32, 4) =
+    vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
+		 VECT_VAR (vsrc_2, float, 32, 4), elem3);
+  vst1q_f32 (VECT_VAR (result, float, 32, 4),
+	     VECT_VAR (vector_res, float, 32, 4));
+  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms3_static, "");
+  VECT_VAR (vector_res, float, 32, 4) =
+    vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
+		 VECT_VAR (vsrc_2, float, 32, 4), elem3);
+  vst1q_f32 (VECT_VAR (result, float, 32, 4),
+	     VECT_VAR (vector_res, float, 32, 4));
+  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma3_static, "");
+
+#undef TEST_MSG
+#define TEST_MSG "VFMSQ_VFMAQ_N (FP64)"
+  clean_results ();
+
+  DECL_VARIABLE(vsrc_1, float, 64, 2);
+  DECL_VARIABLE(vsrc_2, float, 64, 2);
+  VECT_VAR_DECL (buf_src_1, float, 64, 2) [] = {DA0, DA1};
+  VECT_VAR_DECL (buf_src_2, float, 64, 2) [] = {DB0, DB1};
+  VLOAD (vsrc_1, buf_src_1, q, float, f, 64, 2);
+  VLOAD (vsrc_2, buf_src_2, q, float, f, 64, 2);
+  DECL_VARIABLE (vector_res, float, 64, 2) =
+    vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
+		 VECT_VAR (vsrc_2, float, 64, 2), delem0);
+  vst1q_f64 (VECT_VAR (result, float, 64, 2),
+	     VECT_VAR (vector_res, float, 64, 2));
+  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms0_static, "");
+  VECT_VAR (vector_res, float, 64, 2) =
+    vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
+		 VECT_VAR (vsrc_2, float, 64, 2), delem0);
+  vst1q_f64 (VECT_VAR (result, float, 64, 2),
+	     VECT_VAR (vector_res, float, 64, 2));
+  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma0_static, "");
+
+  VECT_VAR_DECL (buf_src_3, float, 64, 2) [] = {DA2, DA3};
+  VECT_VAR_DECL (buf_src_4, float, 64, 2) [] = {DB2, DB3};
+  VLOAD (vsrc_1, buf_src_3, q, float, f, 64, 2);
+  VLOAD (vsrc_2, buf_src_4, q, float, f, 64, 2);
+  VECT_VAR (vector_res, float, 64, 2) =
+    vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
+		 VECT_VAR (vsrc_2, float, 64, 2), delem1);
+  vst1q_f64 (VECT_VAR (result, float, 64, 2),
+	     VECT_VAR (vector_res, float, 64, 2));
+  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms1_static, "");
+  VECT_VAR (vector_res, float, 64, 2) =
+    vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
+		 VECT_VAR (vsrc_2, float, 64, 2), delem1);
+  vst1q_f64 (VECT_VAR (result, float, 64, 2),
+	     VECT_VAR (vector_res, float, 64, 2));
+  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma1_static, "");
+
+  VECT_VAR_DECL (buf_src_5, float, 64, 2) [] = {DA4, DA5};
+  VECT_VAR_DECL (buf_src_6, float, 64, 2) [] = {DB4, DB5};
+  VLOAD (vsrc_1, buf_src_5, q, float, f, 64, 2);
+  VLOAD (vsrc_2, buf_src_6, q, float, f, 64, 2);
+  VECT_VAR (vector_res, float, 64, 2) =
+    vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
+		 VECT_VAR (vsrc_2, float, 64, 2), delem2);
+  vst1q_f64 (VECT_VAR (result, float, 64, 2),
+	     VECT_VAR (vector_res, float, 64, 2));
+  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms2_static, "");
+  VECT_VAR (vector_res, float, 64, 2) =
+    vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
+		 VECT_VAR (vsrc_2, float, 64, 2), delem2);
+  vst1q_f64 (VECT_VAR (result, float, 64, 2),
+	     VECT_VAR (vector_res, float, 64, 2));
+  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma2_static, "");
+
+  VECT_VAR_DECL (buf_src_7, float, 64, 2) [] = {DA6, DA7};
+  VECT_VAR_DECL (buf_src_8, float, 64, 2) [] = {DB6, DB7};
+  VLOAD (vsrc_1, buf_src_7, q, float, f, 64, 2);
+  VLOAD (vsrc_2, buf_src_8, q, float, f, 64, 2);
+  VECT_VAR (vector_res, float, 64, 2) =
+    vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
+		 VECT_VAR (vsrc_2, float, 64, 2), delem3);
+  vst1q_f64 (VECT_VAR (result, float, 64, 2),
+	     VECT_VAR (vector_res, float, 64, 2));
+  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms3_static, "");
+  VECT_VAR (vector_res, float, 64, 2) =
+    vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
+		 VECT_VAR (vsrc_2, float, 64, 2), delem3);
+  vst1q_f64 (VECT_VAR (result, float, 64, 2),
+	     VECT_VAR (vector_res, float, 64, 2));
+  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma3_static, "");
+
+#undef TEST_MSG
+#define TEST_MSG "VFMS_VFMA_N (FP64)"
+  clean_results ();
+
+  DECL_VARIABLE(vsrc_1, float, 64, 1);
+  DECL_VARIABLE(vsrc_2, float, 64, 1);
+  VECT_VAR_DECL (buf_src_1, float, 64, 1) [] = {DA0};
+  VECT_VAR_DECL (buf_src_2, float, 64, 1) [] = {DB0};
+  VLOAD (vsrc_1, buf_src_1, , float, f, 64, 1);
+  VLOAD (vsrc_2, buf_src_2, , float, f, 64, 1);
+  DECL_VARIABLE (vector_res, float, 64, 1) =
+    vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
+		VECT_VAR (vsrc_2, float, 64, 1), delem0);
+  vst1_f64 (VECT_VAR (result, float, 64, 1),
+	     VECT_VAR (vector_res, float, 64, 1));
+  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms0_static, "");
+  VECT_VAR (vector_res, float, 64, 1) =
+    vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
+		VECT_VAR (vsrc_2, float, 64, 1), delem0);
+  vst1_f64 (VECT_VAR (result, float, 64, 1),
+	     VECT_VAR (vector_res, float, 64, 1));
+  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma0_static, "");
+
+  VECT_VAR_DECL (buf_src_3, float, 64, 1) [] = {DA2};
+  VECT_VAR_DECL (buf_src_4, float, 64, 1) [] = {DB2};
+  VLOAD (vsrc_1, buf_src_3, , float, f, 64, 1);
+  VLOAD (vsrc_2, buf_src_4, , float, f, 64, 1);
+  VECT_VAR (vector_res, float, 64, 1) =
+    vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
+		VECT_VAR (vsrc_2, float, 64, 1), delem1);
+  vst1_f64 (VECT_VAR (result, float, 64, 1),
+	     VECT_VAR (vector_res, float, 64, 1));
+  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms1_static, "");
+  VECT_VAR (vector_res, float, 64, 1) =
+    vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
+		VECT_VAR (vsrc_2, float, 64, 1), delem1);
+  vst1_f64 (VECT_VAR (result, float, 64, 1),
+	     VECT_VAR (vector_res, float, 64, 1));
+  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma1_static, "");
+
+  VECT_VAR_DECL (buf_src_5, float, 64, 1) [] = {DA4};
+  VECT_VAR_DECL (buf_src_6, float, 64, 1) [] = {DB4};
+  VLOAD (vsrc_1, buf_src_5, , float, f, 64, 1);
+  VLOAD (vsrc_2, buf_src_6, , float, f, 64, 1);
+  VECT_VAR (vector_res, float, 64, 1) =
+    vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
+		VECT_VAR (vsrc_2, float, 64, 1), delem2);
+  vst1_f64 (VECT_VAR (result, float, 64, 1),
+	     VECT_VAR (vector_res, float, 64, 1));
+  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms2_static, "");
+  VECT_VAR (vector_res, float, 64, 1) =
+    vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
+		VECT_VAR (vsrc_2, float, 64, 1), delem2);
+  vst1_f64 (VECT_VAR (result, float, 64, 1),
+	     VECT_VAR (vector_res, float, 64, 1));
+  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma2_static, "");
+
+  VECT_VAR_DECL (buf_src_7, float, 64, 1) [] = {DA6};
+  VECT_VAR_DECL (buf_src_8, float, 64, 1) [] = {DB6};
+  VLOAD (vsrc_1, buf_src_7, , float, f, 64, 1);
+  VLOAD (vsrc_2, buf_src_8, , float, f, 64, 1);
+  VECT_VAR (vector_res, float, 64, 1) =
+    vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
+		VECT_VAR (vsrc_2, float, 64, 1), delem3);
+  vst1_f64 (VECT_VAR (result, float, 64, 1),
+	     VECT_VAR (vector_res, float, 64, 1));
+  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms3_static, "");
+  VECT_VAR (vector_res, float, 64, 1) =
+    vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
+		VECT_VAR (vsrc_2, float, 64, 1), delem3);
+  vst1_f64 (VECT_VAR (result, float, 64, 1),
+	     VECT_VAR (vector_res, float, 64, 1));
+  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma3_static, "");
+}
+#endif
+
+int
+main (void)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA)
+  exec_vfma_vfms_n ();
+#endif
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
index 1ba1fed98a0711496815e00d2d702e5bfa2a7d43..5b348827002dcfef1f589900a4cf5ff7ada26697 100644
--- a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
@@ -110,6 +110,6 @@ main (int argc, char **argv)
 /* vfmaq_lane_f64.
    vfma_laneq_f64.
    vfmaq_laneq_f64.  */
-/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */
+/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */
 
 
diff --git a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
index 887ebae10da715c8d301a8494a2225e53f15bd7d..6c194a023d34ebafb4d732edc303985531f92a63 100644
--- a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
@@ -111,6 +111,6 @@ main (int argc, char **argv)
 /* vfmsq_lane_f64.
    vfms_laneq_f64.
    vfmsq_laneq_f64.  */
-/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */
+/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */
 
 



* Re: [AArch64, 1/4] Add the missing support of vfms_n_f32, vfmsq_n_f32, vfmsq_n_f64
  2016-05-16  9:09 [AArch64, 1/4] Add the missing support of vfms_n_f32, vfmsq_n_f32, vfmsq_n_f64 Jiong Wang
@ 2016-05-17 12:20 ` James Greenhalgh
  2016-05-17 12:40   ` Kyrill Tkachov
  0 siblings, 1 reply; 8+ messages in thread
From: James Greenhalgh @ 2016-05-17 12:20 UTC (permalink / raw)
  To: Jiong Wang; +Cc: GCC Patches, nd, Kyrylo Tkachov

On Mon, May 16, 2016 at 10:09:26AM +0100, Jiong Wang wrote:
> Support for vfma_n_f64, vfms_n_f32, vfmsq_n_f32 and vfmsq_n_f64 is
> missing from the current GCC arm_neon.h.
>
> Meanwhile, besides "(fma (vec_dup (vec_select)))", fma by element can
> also come from "(fma (vec_dup (scalar)))", where the scalar value is
> already sitting in a vector register and is then duplicated to the other
> lanes, with no lane size change.
>
> This patch implements this and can generate better code in some
> contexts. For example:
> 
> cat test.c
> ===
> typedef __Float32x2_t float32x2_t;
> typedef float float32_t;
> 
> float32x2_t
> vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
> {
>       return __builtin_aarch64_fmav2sf (__b, (float32x2_t) {__c, __c}, __a);
> }
> 
> before (-O2)
> ===
> vfma_n_f32:
>         dup     v2.2s, v2.s[0]
>         fmla    v0.2s, v1.2s, v2.2s
>         ret
> after
> ===
> vfma_n_f32:
>         fmla    v0.2s, v1.2s, v2.s[0]
>         ret
> 
> OK for trunk?
> 
> 2016-05-16  Jiong Wang <jiong.wang@arm.com>

This ChangeLog entry is not correctly formatted. There should be two
spaces between your name and your email, and each line should start with
a tab.
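
For illustration, a correctly formatted entry would look something like
this (two spaces either side of the name; each following line starts
with a tab):

2016-05-16  Jiong Wang  <jiong.wang@arm.com>

	* config/aarch64/aarch64-simd.md (*aarch64_fma4_elt_to_128df): Rename
	to *aarch64_fma4_elt_from_dup<mode>.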

> 
> gcc/
>   * config/aarch64/aarch64-simd.md (*aarch64_fma4_elt_to_128df): Rename
>   to *aarch64_fma4_elt_from_dup<mode>.
>   (*aarch64_fnma4_elt_to_128df): Rename to
>   *aarch64_fnma4_elt_from_dup<mode>.
>   * config/aarch64/arm_neon.h (vfma_n_f64): New.
>   (vfms_n_f32): Likewise.
>   (vfms_n_f64): Likewise.
>   (vfmsq_n_f32): Likewise.
>   (vfmsq_n_f64): Likewise.
> 
> gcc/testsuite/
>   * gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c: Use standard
>   syntax.
>   * gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c: Likewise.

The paths of these two entries are incorrect; remove the gcc/testsuite
prefix from the front. I don't understand what you mean by "Use standard
syntax."; please fix it to describe what you are actually changing.

>   * gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h: New entry for
>   float64x1.
>   * gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c: New.

These two changes need approval from an ARM maintainer as they are in
common files.

From an AArch64 perspective, this patch is OK with a fixed ChangeLog. Please
wait for an ARM OK for the test changes.

Thanks,
James

> [full patch quoted above; snipped]
> +  VECT_VAR (vector_res, float, 64, 1) =
> +    vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
> +		VECT_VAR (vsrc_2, float, 64, 1), delem2);
> +  vst1_f64 (VECT_VAR (result, float, 64, 1),
> +	     VECT_VAR (vector_res, float, 64, 1));
> +  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma2_static, "");
> +
> +  VECT_VAR_DECL (buf_src_7, float, 64, 1) [] = {DA6};
> +  VECT_VAR_DECL (buf_src_8, float, 64, 1) [] = {DB6};
> +  VLOAD (vsrc_1, buf_src_7, , float, f, 64, 1);
> +  VLOAD (vsrc_2, buf_src_8, , float, f, 64, 1);
> +  VECT_VAR (vector_res, float, 64, 1) =
> +    vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
> +		VECT_VAR (vsrc_2, float, 64, 1), delem3);
> +  vst1_f64 (VECT_VAR (result, float, 64, 1),
> +	     VECT_VAR (vector_res, float, 64, 1));
> +  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms3_static, "");
> +  VECT_VAR (vector_res, float, 64, 1) =
> +    vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
> +		VECT_VAR (vsrc_2, float, 64, 1), delem3);
> +  vst1_f64 (VECT_VAR (result, float, 64, 1),
> +	     VECT_VAR (vector_res, float, 64, 1));
> +  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma3_static, "");
> +}
> +#endif
> +
> +int
> +main (void)
> +{
> +#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA)
> +  exec_vfma_vfms_n ();
> +#endif
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
> index 1ba1fed98a0711496815e00d2d702e5bfa2a7d43..5b348827002dcfef1f589900a4cf5ff7ada26697 100644
> --- a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
> @@ -110,6 +110,6 @@ main (int argc, char **argv)
>  /* vfmaq_lane_f64.
>     vfma_laneq_f64.
>     vfmaq_laneq_f64.  */
> -/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */
> +/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */
>  
>  
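A note on the 2?d change for anyone reading along: the renamed pattern
prints its duplicated operand with the standard element syntax
%1.<Vetype>[0], so for V2DF the operand is spelled "v2.d[0]" rather than
the old "v2.2d[0]". Making the "2" optional lets the regex accept both
spellings, whichever pattern ends up firing (register numbers below are
illustrative, not from the test output):

    fmla    v0.2d, v1.2d, v2.2d[0]    // old spelling, still matched
    fmla    v0.2d, v1.2d, v2.d[0]     // spelling after the rename
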
> diff --git a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
> index 887ebae10da715c8d301a8494a2225e53f15bd7d..6c194a023d34ebafb4d732edc303985531f92a63 100644
> --- a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
> @@ -111,6 +111,6 @@ main (int argc, char **argv)
>  /* vfmsq_lane_f64.
>     vfms_laneq_f64.
>     vfmsq_laneq_f64.  */
> -/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */
> +/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */
>  
>  
> 


* Re: [AArch64, 1/4] Add the missing support of vfms_n_f32, vfmsq_n_f32, vfmsq_n_f64
  2016-05-17 12:20 ` James Greenhalgh
@ 2016-05-17 12:40   ` Kyrill Tkachov
  2016-05-17 12:42     ` Kyrill Tkachov
  0 siblings, 1 reply; 8+ messages in thread
From: Kyrill Tkachov @ 2016-05-17 12:40 UTC (permalink / raw)
  To: James Greenhalgh, Jiong Wang; +Cc: GCC Patches


On 17/05/16 13:20, James Greenhalgh wrote:
> On Mon, May 16, 2016 at 10:09:26AM +0100, Jiong Wang wrote:
>> The support for vfma_n_f64, vfms_n_f32, vfmsq_n_f32 and vfmsq_n_f64 is
>> missing from the current gcc arm_neon.h.
>>
>> Meanwhile, besides "(fma (vec_dup (vec_select)))", fma by element can
>> also come from "(fma (vec_dup (scalar)))", where the scalar value is
>> already sitting in a vector register and is then duplicated to the other
>> lanes, with no change of lane size.
>>
>> This patch implements this and can generate better code in some
>> contexts. For example:
>>
>> cat test.c
>> ===
>> typedef __Float32x2_t float32x2_t;
>> typedef float float32_t;
>>
>> float32x2_t
>> vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
>> {
>>   return __builtin_aarch64_fmav2sf (__b, (float32x2_t) {__c, __c}, __a);
>> }
>>
>> before (-O2)
>> ===
>> vfma_n_f32:
>>          dup     v2.2s, v2.s[0]
>>          fmla    v0.2s, v1.2s, v2.2s
>>          ret
>> after
>> ===
>> vfma_n_f32:
>>          fmla    v0.2s, v1.2s, v2.s[0]
>>          ret
>>
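For readers following the thread: going by the definitions in the patch,
the vfms*_n variants simply negate the multiplicand and feed the same fma
builtin, i.e. element-wise r[i] = a[i] - b[i] * c. A minimal usage sketch
(this function is illustrative, not part of the patch):

#include <arm_neon.h>

float32x2_t
fms_n_demo (float32x2_t a, float32x2_t b, float32_t c)
{
  /* r[i] = a[i] - b[i] * c; with the renamed fnma pattern this should
     combine to a single "fmls v0.2s, v1.2s, v2.s[0]".  */
  return vfms_n_f32 (a, b, c);
}
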
>> OK for trunk?
>>
>> 2016-05-16  Jiong Wang <jiong.wang@arm.com>
> This ChangeLog entry is not correctly formatted. There should be two
> spaces between your name and your email, and each line should start with
> a tab.
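
For concreteness, a correctly formatted entry looks like this (two spaces
either side of the name, and a hard tab starting each entry line):

2016-05-16  Jiong Wang  <jiong.wang@arm.com>

	* config/aarch64/aarch64-simd.md (*aarch64_fma4_elt_to_128df): Rename
	to *aarch64_fma4_elt_from_dup<mode>.
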
>
>> gcc/
>>    * config/aarch64/aarch64-simd.md (*aarch64_fma4_elt_to_128df): Rename
>>    to *aarch64_fma4_elt_from_dup<mode>.
>>    (*aarch64_fnma4_elt_to_128df): Rename to *aarch64_fnma4_elt_from_dup<mode>.
>>    * config/aarch64/arm_neon.h (vfma_n_f64): New.
>>    (vfms_n_f32): Likewise.
>>    (vfms_n_f64): Likewise.
>>    (vfmsq_n_f32): Likewise.
>>    (vfmsq_n_f64): Likewise.
>>
>> gcc/testsuite/
>>    * gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c: Use standard syntax.
>>    * gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c: Likewise.
> The paths of these two entries are incorrect. Remove the gcc/testsuite
> from the front. I don't understand what you mean by "Use standard syntax.";
> please fix this to describe what you are actually changing.
>
>>    * gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h: New entry for float64x1.
>>    * gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c: New.
> These two changes need approval from an ARM maintainer as they are in
> common files.
>
> From an AArch64 perspective, this patch is OK with a fixed ChangeLog. Please
> wait for an ARM OK for the test changes.

Considering that the tests' functionality is guarded by #if defined(__aarch64__),
it's a no-op on arm and so is ok from that perspective (we have precedence for
tests guarded in such a way in advsimd-intrinsics.exp).
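
In other words, on arm the file compiles down to an empty main. The guard
shape used by the new test (sketched here from the hunks quoted earlier in
the thread) is:

#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA)
void exec_vfma_vfms_n (void) { /* AArch64-only checks.  */ }
#endif

int
main (void)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA)
  exec_vfma_vfms_n ();
#endif
  return 0;
}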

The arm-neon-ref.h additions are ok too.

Thanks,
Kyrill


> Thanks,
> James
>
>> [...]


* Re: [AArch64, 1/4] Add the missing support of vfms_n_f32, vfmsq_n_f32, vfmsq_n_f64
  2016-05-17 12:40   ` Kyrill Tkachov
@ 2016-05-17 12:42     ` Kyrill Tkachov
  2016-05-18  8:25       ` Christophe Lyon
  0 siblings, 1 reply; 8+ messages in thread
From: Kyrill Tkachov @ 2016-05-17 12:42 UTC (permalink / raw)
  To: James Greenhalgh, Jiong Wang; +Cc: GCC Patches


On 17/05/16 13:40, Kyrill Tkachov wrote:
>
> On 17/05/16 13:20, James Greenhalgh wrote:
>> On Mon, May 16, 2016 at 10:09:26AM +0100, Jiong Wang wrote:
>>> The support for vfma_n_f64, vfms_n_f32, vfmsq_n_f32 and vfmsq_n_f64 is
>>> missing from the current gcc arm_neon.h.
>>>
>>> Meanwhile, besides "(fma (vec_dup (vec_select)))", fma by element can
>>> also come from "(fma (vec_dup (scalar)))", where the scalar value is
>>> already sitting in a vector register and is then duplicated to the other
>>> lanes, with no change of lane size.
>>>
>>> This patch implements this and can generate better code in some
>>> contexts. For example:
>>>
>>> cat test.c
>>> ===
>>> typedef __Float32x2_t float32x2_t;
>>> typedef float float32_t;
>>>
>>> float32x2_t
>>> vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
>>> {
>>>   return __builtin_aarch64_fmav2sf (__b, (float32x2_t) {__c, __c}, __a);
>>> }
>>>
>>> before (-O2)
>>> ===
>>> vfma_n_f32:
>>>          dup     v2.2s, v2.s[0]
>>>          fmla    v0.2s, v1.2s, v2.2s
>>>          ret
>>> after
>>> ===
>>> vfma_n_f32:
>>>          fmla    v0.2s, v1.2s, v2.s[0]
>>>          ret
>>>
>>> OK for trunk?
>>>
>>> 2016-05-16  Jiong Wang <jiong.wang@arm.com>
>> This ChangeLog entry is not correctly formatted. There should be two
>> spaces between your name and your email, and each line should start with
>> a tab.
>>
>>> gcc/
>>>    * config/aarch64/aarch64-simd.md (*aarch64_fma4_elt_to_128df): Rename
>>>    to *aarch64_fma4_elt_from_dup<mode>.
>>>    (*aarch64_fnma4_elt_to_128df): Rename to *aarch64_fnma4_elt_from_dup<mode>.
>>>    * config/aarch64/arm_neon.h (vfma_n_f64): New.
>>>    (vfms_n_f32): Likewise.
>>>    (vfms_n_f64): Likewise.
>>>    (vfmsq_n_f32): Likewise.
>>>    (vfmsq_n_f64): Likewise.
>>>
>>> gcc/testsuite/
>>>    * gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c: Use standard syntax.
>>>    * gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c: Likewise.
>> The paths of these two entries are incorrect. Remove the gcc/testsuite
>> from the front. I don't understand what you mean by "Use standard syntax.";
>> please fix this to describe what you are actually changing.
>>
>>>    * gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h: New entry for float64x1.
>>>    * gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c: New.
>> These two changes need approval from an ARM maintainer as they are in
>> common files.
>>
>> From an AArch64 perspective, this patch is OK with a fixed ChangeLog. Please
>> wait for an ARM OK for the test changes.
>
> Considering that the tests' functionality is guarded by #if defined(__aarch64__),
> it's a no-op on arm and so is ok from that perspective (we have precedence for
> tests guarded in such a way in advsimd-intrinsics.exp).
>

Of course I meant precedent rather than precedence :/

Kyrill

> The arm-neon-ref.h additions are ok too.
>
> Thanks,
> Kyrill
>
>
>> Thanks,
>> James
>>
>>> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
>>> index bd73bce64414e8bc01732d14311d742cf28f4586..90eaca176b4706e6cc42f16ce2c956f1c8ad17b1 100644
>>> --- a/gcc/config/aarch64/aarch64-simd.md
>>> +++ b/gcc/config/aarch64/aarch64-simd.md
>>> @@ -1579,16 +1579,16 @@
>>>     [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")]
>>>   )
>>>   -(define_insn "*aarch64_fma4_elt_to_128df"
>>> -  [(set (match_operand:V2DF 0 "register_operand" "=w")
>>> -    (fma:V2DF
>>> -      (vec_duplicate:V2DF
>>> -      (match_operand:DF 1 "register_operand" "w"))
>>> -      (match_operand:V2DF 2 "register_operand" "w")
>>> -      (match_operand:V2DF 3 "register_operand" "0")))]
>>> +(define_insn "*aarch64_fma4_elt_from_dup<mode>"
>>> +  [(set (match_operand:VMUL 0 "register_operand" "=w")
>>> +    (fma:VMUL
>>> +      (vec_duplicate:VMUL
>>> +      (match_operand:<VEL> 1 "register_operand" "w"))
>>> +      (match_operand:VMUL 2 "register_operand" "w")
>>> +      (match_operand:VMUL 3 "register_operand" "0")))]
>>>     "TARGET_SIMD"
>>> -  "fmla\\t%0.2d, %2.2d, %1.2d[0]"
>>> -  [(set_attr "type" "neon_fp_mla_d_scalar_q")]
>>> +  "fmla\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"
>>> +  [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")]
>>>   )
>>>     (define_insn "*aarch64_fma4_elt_to_64v2df"
>>> @@ -1656,17 +1656,17 @@
>>>     [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")]
>>>   )
>>>   -(define_insn "*aarch64_fnma4_elt_to_128df"
>>> -  [(set (match_operand:V2DF 0 "register_operand" "=w")
>>> -    (fma:V2DF
>>> -      (neg:V2DF
>>> -        (match_operand:V2DF 2 "register_operand" "w"))
>>> -      (vec_duplicate:V2DF
>>> -    (match_operand:DF 1 "register_operand" "w"))
>>> -      (match_operand:V2DF 3 "register_operand" "0")))]
>>> -  "TARGET_SIMD"
>>> -  "fmls\\t%0.2d, %2.2d, %1.2d[0]"
>>> -  [(set_attr "type" "neon_fp_mla_d_scalar_q")]
>>> +(define_insn "*aarch64_fnma4_elt_from_dup<mode>"
>>> +  [(set (match_operand:VMUL 0 "register_operand" "=w")
>>> +    (fma:VMUL
>>> +      (neg:VMUL
>>> +        (match_operand:VMUL 2 "register_operand" "w"))
>>> +      (vec_duplicate:VMUL
>>> +    (match_operand:<VEL> 1 "register_operand" "w"))
>>> +      (match_operand:VMUL 3 "register_operand" "0")))]
>>> +  "TARGET_SIMD"
>>> +  "fmls\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"
>>> +  [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")]
>>>   )
>>>     (define_insn "*aarch64_fnma4_elt_to_64v2df"
>>> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
>>> index 2612a325718918cf7cd808f28c09c9c4c7b11c07..ca7ace5aa656163826569d046fcbf02f9f7d4d6c 100644
>>> --- a/gcc/config/aarch64/arm_neon.h
>>> +++ b/gcc/config/aarch64/arm_neon.h
>>> @@ -14456,6 +14456,12 @@ vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
>>>     return __builtin_aarch64_fmav2sf (__b, vdup_n_f32 (__c), __a);
>>>   }
>>>   +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
>>> +vfma_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c)
>>> +{
>>> +  return (float64x1_t) {__b[0] * __c + __a[0]};
>>> +}
>>> +
>>>   __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
>>>   vfmaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
>>>   {
>>> @@ -14597,6 +14603,29 @@ vfmsq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c)
>>>     return __builtin_aarch64_fmav2df (-__b, __c, __a);
>>>   }
>>>   +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
>>> +vfms_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
>>> +{
>>> +  return __builtin_aarch64_fmav2sf (-__b, vdup_n_f32 (__c), __a);
>>> +}
>>> +
>>> +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
>>> +vfms_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c)
>>> +{
>>> +  return (float64x1_t) {-__b[0] * __c + __a[0]};
>>> +}
>>> +
>>> +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
>>> +vfmsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
>>> +{
>>> +  return __builtin_aarch64_fmav4sf (-__b, vdupq_n_f32 (__c), __a);
>>> +}
>>> +
>>> +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
>>> +vfmsq_n_f64 (float64x2_t __a, float64x2_t __b, float64_t __c)
>>> +{
>>> +  return __builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c), __a);
>>> +}
>>>     /* vfms_lane  */
>>>   diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
>>> index 49fbd843e507ede8aa81d02c175a82a1221750a4..cf90825f87391b72aca9a29980210d21f4321c04 100644
>>> --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
>>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
>>> @@ -136,6 +136,7 @@ static ARRAY(result, poly, 16, 4);
>>>   #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
>>>   static ARRAY(result, float, 16, 4);
>>>   #endif
>>> +static ARRAY(result, float, 64, 1);
>>>   static ARRAY(result, float, 32, 2);
>>>   static ARRAY(result, int, 8, 16);
>>>   static ARRAY(result, int, 16, 8);
>>> @@ -169,6 +170,7 @@ extern ARRAY(expected, poly, 8, 8);
>>>   extern ARRAY(expected, poly, 16, 4);
>>>   extern ARRAY(expected, hfloat, 16, 4);
>>>   extern ARRAY(expected, hfloat, 32, 2);
>>> +extern ARRAY(expected, hfloat, 64, 1);
>>>   extern ARRAY(expected, int, 8, 16);
>>>   extern ARRAY(expected, int, 16, 8);
>>>   extern ARRAY(expected, int, 32, 4);
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c
>>> new file mode 100644
>>> index 0000000000000000000000000000000000000000..26223763c59c849607b5320f6ec37098a556ce2e
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c
>>> @@ -0,0 +1,490 @@
>>> +#include <arm_neon.h>
>>> +#include "arm-neon-ref.h"
>>> +#include "compute-ref-data.h"
>>> +
>>> +#define A0 123.4f
>>> +#define A1 -3.8f
>>> +#define A2 -29.4f
>>> +#define A3 (__builtin_inff ())
>>> +#define A4 0.0f
>>> +#define A5 24.0f
>>> +#define A6 124.0f
>>> +#define A7 1024.0f
>>> +
>>> +#define B0 -5.8f
>>> +#define B1 -0.0f
>>> +#define B2 -10.8f
>>> +#define B3 10.0f
>>> +#define B4 23.4f
>>> +#define B5 -1234.8f
>>> +#define B6 8.9f
>>> +#define B7 4.0f
>>> +
>>> +#define E0 9.8f
>>> +#define E1 -1024.0f
>>> +#define E2 (-__builtin_inff ())
>>> +#define E3 479.0f
>>> +float32_t elem0 = E0;
>>> +float32_t elem1 = E1;
>>> +float32_t elem2 = E2;
>>> +float32_t elem3 = E3;
>>> +
>>> +#define DA0 1231234.4
>>> +#define DA1 -3.8
>>> +#define DA2 -2980.4
>>> +#define DA3 -5.8
>>> +#define DA4 0.01123
>>> +#define DA5 24.0
>>> +#define DA6 124.12345
>>> +#define DA7 1024.0
>>> +
>>> +#define DB0 -5.8
>>> +#define DB1 (__builtin_inf ())
>>> +#define DB2 -105.8
>>> +#define DB3 10.0
>>> +#define DB4 (-__builtin_inf ())
>>> +#define DB5 -1234.8
>>> +#define DB6 848.9
>>> +#define DB7 44444.0
>>> +
>>> +#define DE0 9.8
>>> +#define DE1 -1024.0
>>> +#define DE2 105.8
>>> +#define DE3 479.0
>>> +float64_t delem0 = DE0;
>>> +float64_t delem1 = DE1;
>>> +float64_t delem2 = DE2;
>>> +float64_t delem3 = DE3;
>>> +
>>> +#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA)
>>> +
>>> +/* Expected results for vfms_n.  */
>>> +
>>> +VECT_VAR_DECL(expectedfms0, float, 32, 2) [] = {A0 + -B0 * E0, A1 + -B1 * E0};
>>> +VECT_VAR_DECL(expectedfms1, float, 32, 2) [] = {A2 + -B2 * E1, A3 + -B3 * E1};
>>> +VECT_VAR_DECL(expectedfms2, float, 32, 2) [] = {A4 + -B4 * E2, A5 + -B5 * E2};
>>> +VECT_VAR_DECL(expectedfms3, float, 32, 2) [] = {A6 + -B6 * E3, A7 + -B7 * E3};
>>> +VECT_VAR_DECL(expectedfma0, float, 32, 2) [] = {A0 + B0 * E0, A1 + B1 * E0};
>>> +VECT_VAR_DECL(expectedfma1, float, 32, 2) [] = {A2 + B2 * E1, A3 + B3 * E1};
>>> +VECT_VAR_DECL(expectedfma2, float, 32, 2) [] = {A4 + B4 * E2, A5 + B5 * E2};
>>> +VECT_VAR_DECL(expectedfma3, float, 32, 2) [] = {A6 + B6 * E3, A7 + B7 * E3};
>>> +
>>> +hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 2) =
>>> +  (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 2);
>>> +hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 2) =
>>> +  (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 2);
>>> +hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 2) =
>>> +  (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 2);
>>> +hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 2) =
>>> +  (hfloat32_t *) VECT_VAR (expectedfms3, float, 32, 2);
>>> +hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 2) =
>>> +  (hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 2);
>>> +hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 2) =
>>> +  (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 2);
>>> +hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 2) =
>>> +  (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 2);
>>> +hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 2) =
>>> +  (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 2);
>>> +
>>> +
>>> +VECT_VAR_DECL(expectedfms0, float, 32, 4) [] = {A0 + -B0 * E0, A1 + -B1 * E0,
>>> +                        A2 + -B2 * E0, A3 + -B3 * E0};
>>> +VECT_VAR_DECL(expectedfms1, float, 32, 4) [] = {A4 + -B4 * E1, A5 + -B5 * E1,
>>> +                        A6 + -B6 * E1, A7 + -B7 * E1};
>>> +VECT_VAR_DECL(expectedfms2, float, 32, 4) [] = {A0 + -B0 * E2, A2 + -B2 * E2,
>>> +                        A4 + -B4 * E2, A6 + -B6 * E2};
>>> +VECT_VAR_DECL(expectedfms3, float, 32, 4) [] = {A1 + -B1 * E3, A3 + -B3 * E3,
>>> +                        A5 + -B5 * E3, A7 + -B7 * E3};
>>> +VECT_VAR_DECL(expectedfma0, float, 32, 4) [] = {A0 + B0 * E0, A1 + B1 * E0,
>>> +                        A2 + B2 * E0, A3 + B3 * E0};
>>> +VECT_VAR_DECL(expectedfma1, float, 32, 4) [] = {A4 + B4 * E1, A5 + B5 * E1,
>>> +                        A6 + B6 * E1, A7 + B7 * E1};
>>> +VECT_VAR_DECL(expectedfma2, float, 32, 4) [] = {A0 + B0 * E2, A2 + B2 * E2,
>>> +                        A4 + B4 * E2, A6 + B6 * E2};
>>> +VECT_VAR_DECL(expectedfma3, float, 32, 4) [] = {A1 + B1 * E3, A3 + B3 * E3,
>>> +                        A5 + B5 * E3, A7 + B7 * E3};
>>> [... remainder of the patch quoted in full - snipped ...]
>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [AArch64, 1/4] Add the missing support of vfms_n_f32, vfmsq_n_f32, vfmsq_n_f64
  2016-05-17 12:42     ` Kyrill Tkachov
@ 2016-05-18  8:25       ` Christophe Lyon
  2016-05-18  8:32         ` Kyrill Tkachov
  2016-05-18  9:44         ` Jiong Wang
  0 siblings, 2 replies; 8+ messages in thread
From: Christophe Lyon @ 2016-05-18  8:25 UTC (permalink / raw)
  To: Kyrill Tkachov; +Cc: James Greenhalgh, Jiong Wang, GCC Patches

On 17 May 2016 at 14:42, Kyrill Tkachov <kyrylo.tkachov@foss.arm.com> wrote:
>
> On 17/05/16 13:40, Kyrill Tkachov wrote:
>>
>>
>> On 17/05/16 13:20, James Greenhalgh wrote:
>>>
>>> On Mon, May 16, 2016 at 10:09:26AM +0100, Jiong Wang wrote:
>>>>
>>>> The support of vfma_n_f64, vfms_n_f32, vfmsq_n_f32, vfmsq_n_f64 are
>>>> missing in current gcc arm_neon.h.
>>>>
>>>> Meanwhile, besides "(fma (vec_dup (vec_select)))", fma by element can
>>>> also comes from "(fma (vec_dup(scalar" where the scalar value is already
>>>> sitting in vector register then duplicated to other lanes, and there is
>>>> no lane size change.
>>>>
>>>> This patch implement this and can generate better code under some
>>>> context. For example:
>>>>
>>>> cat test.c
>>>> ===
>>>> typedef __Float32x2_t float32x2_t;
>>>> typedef float float32_t;
>>>>
>>>> float32x2_t
>>>> vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
>>>> {
>>>>        return __builtin_aarch64_fmav2sf (__b,  (float32x2_t) {__c,
>>>> __c}, __a);
>>>> }
>>>>
>>>> before (-O2)
>>>> ===
>>>> vfma_n_f32:
>>>>          dup     v2.2s, v2.s[0]
>>>>          fmla    v0.2s, v1.2s, v2.2s
>>>>          ret
>>>> after
>>>> ===
>>>> vfma_n_f32:
>>>>          fmla    v0.2s, v1.2s, v2.s[0]
>>>>          ret
>>>>
>>>> OK for trunk?
>>>>
>>>> 2016-05-16  Jiong Wang <jiong.wang@arm.com>
>>>
>>> This ChangeLog entry is not correctly formatted. There should be two
>>> spaces between your name and your email, and each line should start with
>>> a tab.
>>>
>>>> gcc/
>>>>    * config/aarch64/aarch64-simd.md (*aarch64_fma4_elt_to_128df): Rename
>>>>    to *aarch64_fma4_elt_from_dup<mode>.
>>>>    (*aarch64_fnma4_elt_to_128df): Rename to
>>>> *aarch64_fnma4_elt_from_dup<mode>.
>>>>    * config/aarch64/arm_neon.h (vfma_n_f64): New.
>>>>    (vfms_n_f32): Likewise.
>>>>    (vfms_n_f64): Likewise.
>>>>    (vfmsq_n_f32): Likewise.
>>>>    (vfmsq_n_f64): Likewise.
>>>>
>>>> gcc/testsuite/
>>>>    * gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c: Use
>>>> standard syntax.
>>>>    * gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c: Likewise.
>>>
>>> The paths of these two entries are incorrect. Remove the gcc/testsuite
>>> from the front. I don't understand what you mean by "Use standard
>>> syntax.", please fix this to describe what you are actually changing.
>>>
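>>> (For illustration, a correctly formatted entry - two spaces between
>>> name and email, and a tab at the start of each line - would read:
>>>
>>> 2016-05-16  Jiong Wang  <jiong.wang@arm.com>
>>>
>>> 	* config/aarch64/arm_neon.h (vfma_n_f64): New.
>>>
>>> with the testsuite entries starting at "gcc.target/", relative to
>>> gcc/testsuite/.  "Use standard syntax" presumably means the scan
>>> patterns now also accept the standard by-element form "v1.d[0]"
>>> alongside "v1.2d[0]"; the entry should say that explicitly.)
>>>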
>>>>    * gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h: New entry
>>>> for float64x1.
>>>>    * gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c: New.
>>>
>>> These two changes need approval from an ARM maintainer as they are in
>>> common files.
>>>
>>> From an AArch64 perspective, this patch is OK with a fixed ChangeLog.
>>> Please wait for an ARM OK for the test changes.
>>
>>
>> Considering that the tests' functionality is guarded on #if
>> defined(__aarch64__) it's a noop on arm and so is ok from that
>> perspective (we have precedence for tests guarded in such a way in
>> advsimd-intrinsics.exp)
>>
>
> Of course I meant precedent rather than precedence :/
>

Unfortunately, the guard is not correct :(

The float64_t type is not available on arm, so the new
declarations/definitions in arm-neon-ref.h need a guard.

Since this patch was checked in, all the advsimd intrinsics tests fail
to compile on arm:
In file included from /aci-gcc-fsf/sources/gcc-fsf/gccsrc/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vaba.c:2:0:
/aci-gcc-fsf/sources/gcc-fsf/gccsrc/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h:139:22: error: unknown type name 'float64_t'
/aci-gcc-fsf/sources/gcc-fsf/gccsrc/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h:51:35: note: in definition of macro 'VECT_VAR_DECL'
/aci-gcc-fsf/sources/gcc-fsf/gccsrc/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h:139:8: note: in expansion of macro 'ARRAY'
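
Something like the following should fix it (a sketch only - the exact
context in arm-neon-ref.h may differ), reusing the style of the
existing FP16 guard for the new float64 declaration:

  /* float64_t is only available on aarch64.  */
  #if defined (__aarch64__)
  static ARRAY(result, float, 64, 1);
  #endif

with the matching "extern ARRAY(expected, hfloat, 64, 1);" declaration
guarded the same way.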


Christophe


> Kyrill
>
>
>> The arm-neon-ref.h additions are ok too.
>>
>> Thanks,
>> Kyrill
>>
>>
>>> Thanks,
>>> James
>>>
>>>> [... full patch quoted here - snipped; see the original posting earlier in the thread ...]
>>
>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [AArch64, 1/4] Add the missing support of vfms_n_f32, vfmsq_n_f32, vfmsq_n_f64
  2016-05-18  8:25       ` Christophe Lyon
@ 2016-05-18  8:32         ` Kyrill Tkachov
  2016-05-18  9:44         ` Jiong Wang
  1 sibling, 0 replies; 8+ messages in thread
From: Kyrill Tkachov @ 2016-05-18  8:32 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: James Greenhalgh, Jiong Wang, GCC Patches


On 18/05/16 09:25, Christophe Lyon wrote:
> On 17 May 2016 at 14:42, Kyrill Tkachov <kyrylo.tkachov@foss.arm.com> wrote:
>> On 17/05/16 13:40, Kyrill Tkachov wrote:
>>>
>>> On 17/05/16 13:20, James Greenhalgh wrote:
>>>> On Mon, May 16, 2016 at 10:09:26AM +0100, Jiong Wang wrote:
>>>>> The support of vfma_n_f64, vfms_n_f32, vfmsq_n_f32, vfmsq_n_f64 are
>>>>> missing in current gcc arm_neon.h.
>>>>>
>>>>> Meanwhile, besides "(fma (vec_dup (vec_select)))", fma by element can
>>>>> also comes from "(fma (vec_dup(scalar" where the scalar value is already
>>>>> sitting in vector register then duplicated to other lanes, and there is
>>>>> no lane size change.
>>>>>
>>>>> This patch implement this and can generate better code under some
>>>>> context. For example:
>>>>>
>>>>> cat test.c
>>>>> ===
>>>>> typedef __Float32x2_t float32x2_t;
>>>>> typedef float float32_t;
>>>>>
>>>>> float32x2_t
>>>>> vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
>>>>> {
>>>>>         return __builtin_aarch64_fmav2sf (__b,  (float32x2_t) {__c,
>>>>> __c}, __a);
>>>>> }
>>>>>
>>>>> before (-O2)
>>>>> ===
>>>>> vfma_n_f32:
>>>>>           dup     v2.2s, v2.s[0]
>>>>>           fmla    v0.2s, v1.2s, v2.2s
>>>>>           ret
>>>>> after
>>>>> ===
>>>>> vfma_n_f32:
>>>>>           fmla    v0.2s, v1.2s, v2.s[0]
>>>>>           ret
>>>>>
>>>>> OK for trunk?
>>>>>
>>>>> 2016-05-16  Jiong Wang <jiong.wang@arm.com>
>>>> This ChangeLog entry is not correctly formatted. There should be two
>>>> spaces between your name and your email, and each line should start with
>>>> a tab.
>>>>
>>>>> gcc/
>>>>>     * config/aarch64/aarch64-simd.md (*aarch64_fma4_elt_to_128df): Rename
>>>>>     to *aarch64_fma4_elt_from_dup<mode>.
>>>>>     (*aarch64_fnma4_elt_to_128df): Rename to
>>>>> *aarch64_fnma4_elt_from_dup<mode>.
>>>>>     * config/aarch64/arm_neon.h (vfma_n_f64): New.
>>>>>     (vfms_n_f32): Likewise.
>>>>>     (vfms_n_f64): Likewise.
>>>>>     (vfmsq_n_f32): Likewise.
>>>>>     (vfmsq_n_f64): Likewise.
>>>>>
>>>>> gcc/testsuite/
>>>>>     * gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c: Use
>>>>> standard syntax.
>>>>>     * gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c: Likewise.
>>>> The paths of these two entries are incorrect. Remove the gcc/testsuite
>>>> from the front. I don't understand what you mean by "Use standard
>>>> syntax.",
>>>> please fix this to describe what you are actually changing.
>>>>
>>>>>     * gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h: New entry
>>>>> for float64x1.
>>>>>     * gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c: New.
>>>> These two changes need approval from an ARM maintainer as they are in
>>>> common files.
>>>>
>>>>   From an AArch64 perspective, this patch is OK with a fixed ChangeLog.
>>>> Please
>>>> wait for an ARM OK for the test changes.
>>>
>>> Considering that the tests' functionality is guarded on
>>> #if defined(__aarch64__), it's a no-op on arm and so is ok from that
>>> perspective (we have precedence for tests guarded in such a way in
>>> advsimd-intrinsics.exp)
>>>
>> Of course I meant precedent rather than precedence :/
>>
> Unfortunately, the guard is not correct :(
>
> The float64_t type is not available on arm, so the new
> declarations/definitions in arm-neon-ref.h
> need a guard.

Indeed.
A patch to guard the float64_t definition in arm-neon-ref.h on __aarch64__
is preapproved (or would, I think, count as obvious in the interest of
unbreaking a large number of tests).
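
For reference, a minimal sketch of the guard being preapproved (the actual
hunk appears in Jiong's follow-up fix later in the thread; ARRAY and the
float64x1 result declaration are the existing ones from arm-neon-ref.h):

/* float64_t only exists in the AArch64 arm_neon.h, so only declare the
   float64x1 result array when compiling for AArch64.  */
#ifdef __aarch64__
static ARRAY(result, float, 64, 1);
#endif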

Sorry for not catching it earlier.

Kyrill

> Since this patch was checked in, all the advsimd intrinsics tests fail
> to compile on arm:
> In file included from
> /aci-gcc-fsf/sources/gcc-fsf/gccsrc/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vaba.c:2:0:
> /aci-gcc-fsf/sources/gcc-fsf/gccsrc/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h:139:22:
> error: unknown type name 'float64_t'
> /aci-gcc-fsf/sources/gcc-fsf/gccsrc/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h:51:35:
> note: in definition of macro 'VECT_VAR_DECL'
> /aci-gcc-fsf/sources/gcc-fsf/gccsrc/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h:139:8:
> note: in expansion of macro 'ARRAY'
>
>
> Christophe
>
>
>> Kyrill
>>
>>
>>> The arm-neon-ref.h additions are ok too.
>>>
>>> Thanks,
>>> Kyrill
>>>
>>>
>>>> Thanks,
>>>> James
>>>>
>>>>> diff --git a/gcc/config/aarch64/aarch64-simd.md
>>>>> b/gcc/config/aarch64/aarch64-simd.md
>>>>> index
>>>>> bd73bce64414e8bc01732d14311d742cf28f4586..90eaca176b4706e6cc42f16ce2c956f1c8ad17b1
>>>>> 100644
>>>>> --- a/gcc/config/aarch64/aarch64-simd.md
>>>>> +++ b/gcc/config/aarch64/aarch64-simd.md
>>>>> @@ -1579,16 +1579,16 @@
>>>>>      [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")]
>>>>>    )
>>>>>    -(define_insn "*aarch64_fma4_elt_to_128df"
>>>>> -  [(set (match_operand:V2DF 0 "register_operand" "=w")
>>>>> -    (fma:V2DF
>>>>> -      (vec_duplicate:V2DF
>>>>> -      (match_operand:DF 1 "register_operand" "w"))
>>>>> -      (match_operand:V2DF 2 "register_operand" "w")
>>>>> -      (match_operand:V2DF 3 "register_operand" "0")))]
>>>>> +(define_insn "*aarch64_fma4_elt_from_dup<mode>"
>>>>> +  [(set (match_operand:VMUL 0 "register_operand" "=w")
>>>>> +    (fma:VMUL
>>>>> +      (vec_duplicate:VMUL
>>>>> +      (match_operand:<VEL> 1 "register_operand" "w"))
>>>>> +      (match_operand:VMUL 2 "register_operand" "w")
>>>>> +      (match_operand:VMUL 3 "register_operand" "0")))]
>>>>>      "TARGET_SIMD"
>>>>> -  "fmla\\t%0.2d, %2.2d, %1.2d[0]"
>>>>> -  [(set_attr "type" "neon_fp_mla_d_scalar_q")]
>>>>> +  "fmla\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"
>>>>> +  [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")]
>>>>>    )
>>>>>      (define_insn "*aarch64_fma4_elt_to_64v2df"
>>>>> @@ -1656,17 +1656,17 @@
>>>>>      [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")]
>>>>>    )
>>>>>    -(define_insn "*aarch64_fnma4_elt_to_128df"
>>>>> -  [(set (match_operand:V2DF 0 "register_operand" "=w")
>>>>> -    (fma:V2DF
>>>>> -      (neg:V2DF
>>>>> -        (match_operand:V2DF 2 "register_operand" "w"))
>>>>> -      (vec_duplicate:V2DF
>>>>> -    (match_operand:DF 1 "register_operand" "w"))
>>>>> -      (match_operand:V2DF 3 "register_operand" "0")))]
>>>>> -  "TARGET_SIMD"
>>>>> -  "fmls\\t%0.2d, %2.2d, %1.2d[0]"
>>>>> -  [(set_attr "type" "neon_fp_mla_d_scalar_q")]
>>>>> +(define_insn "*aarch64_fnma4_elt_from_dup<mode>"
>>>>> +  [(set (match_operand:VMUL 0 "register_operand" "=w")
>>>>> +    (fma:VMUL
>>>>> +      (neg:VMUL
>>>>> +        (match_operand:VMUL 2 "register_operand" "w"))
>>>>> +      (vec_duplicate:VMUL
>>>>> +    (match_operand:<VEL> 1 "register_operand" "w"))
>>>>> +      (match_operand:VMUL 3 "register_operand" "0")))]
>>>>> +  "TARGET_SIMD"
>>>>> +  "fmls\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"
>>>>> +  [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")]
>>>>>    )
>>>>>      (define_insn "*aarch64_fnma4_elt_to_64v2df"
>>>>> diff --git a/gcc/config/aarch64/arm_neon.h
>>>>> b/gcc/config/aarch64/arm_neon.h
>>>>> index
>>>>> 2612a325718918cf7cd808f28c09c9c4c7b11c07..ca7ace5aa656163826569d046fcbf02f9f7d4d6c
>>>>> 100644
>>>>> --- a/gcc/config/aarch64/arm_neon.h
>>>>> +++ b/gcc/config/aarch64/arm_neon.h
>>>>> @@ -14456,6 +14456,12 @@ vfma_n_f32 (float32x2_t __a, float32x2_t __b,
>>>>> float32_t __c)
>>>>>      return __builtin_aarch64_fmav2sf (__b, vdup_n_f32 (__c), __a);
>>>>>    }
>>>>>    +__extension__ static __inline float64x1_t __attribute__
>>>>> ((__always_inline__))
>>>>> +vfma_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c)
>>>>> +{
>>>>> +  return (float64x1_t) {__b[0] * __c + __a[0]};
>>>>> +}
>>>>> +
>>>>>    __extension__ static __inline float32x4_t __attribute__
>>>>> ((__always_inline__))
>>>>>    vfmaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
>>>>>    {
>>>>> @@ -14597,6 +14603,29 @@ vfmsq_f64 (float64x2_t __a, float64x2_t __b,
>>>>> float64x2_t __c)
>>>>>      return __builtin_aarch64_fmav2df (-__b, __c, __a);
>>>>>    }
>>>>>    +__extension__ static __inline float32x2_t __attribute__
>>>>> ((__always_inline__))
>>>>> +vfms_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
>>>>> +{
>>>>> +  return __builtin_aarch64_fmav2sf (-__b, vdup_n_f32 (__c), __a);
>>>>> +}
>>>>> +
>>>>> +__extension__ static __inline float64x1_t __attribute__
>>>>> ((__always_inline__))
>>>>> +vfms_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c)
>>>>> +{
>>>>> +  return (float64x1_t) {-__b[0] * __c + __a[0]};
>>>>> +}
>>>>> +
>>>>> +__extension__ static __inline float32x4_t __attribute__
>>>>> ((__always_inline__))
>>>>> +vfmsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
>>>>> +{
>>>>> +  return __builtin_aarch64_fmav4sf (-__b, vdupq_n_f32 (__c), __a);
>>>>> +}
>>>>> +
>>>>> +__extension__ static __inline float64x2_t __attribute__
>>>>> ((__always_inline__))
>>>>> +vfmsq_n_f64 (float64x2_t __a, float64x2_t __b, float64_t __c)
>>>>> +{
>>>>> +  return __builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c), __a);
>>>>> +}
>>>>>      /* vfms_lane  */
>>>>>    diff --git
>>>>> a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
>>>>> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
>>>>> index
>>>>> 49fbd843e507ede8aa81d02c175a82a1221750a4..cf90825f87391b72aca9a29980210d21f4321c04
>>>>> 100644
>>>>> --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
>>>>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
>>>>> @@ -136,6 +136,7 @@ static ARRAY(result, poly, 16, 4);
>>>>>    #if defined (__ARM_FP16_FORMAT_IEEE) || defined
>>>>> (__ARM_FP16_FORMAT_ALTERNATIVE)
>>>>>    static ARRAY(result, float, 16, 4);
>>>>>    #endif
>>>>> +static ARRAY(result, float, 64, 1);
>>>>>    static ARRAY(result, float, 32, 2);
>>>>>    static ARRAY(result, int, 8, 16);
>>>>>    static ARRAY(result, int, 16, 8);
>>>>> @@ -169,6 +170,7 @@ extern ARRAY(expected, poly, 8, 8);
>>>>>    extern ARRAY(expected, poly, 16, 4);
>>>>>    extern ARRAY(expected, hfloat, 16, 4);
>>>>>    extern ARRAY(expected, hfloat, 32, 2);
>>>>> +extern ARRAY(expected, hfloat, 64, 1);
>>>>>    extern ARRAY(expected, int, 8, 16);
>>>>>    extern ARRAY(expected, int, 16, 8);
>>>>>    extern ARRAY(expected, int, 32, 4);
>>>>> diff --git
>>>>> a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c
>>>>> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c
>>>>> new file mode 100644
>>>>> index
>>>>> 0000000000000000000000000000000000000000..26223763c59c849607b5320f6ec37098a556ce2e
>>>>> --- /dev/null
>>>>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c
>>>>> @@ -0,0 +1,490 @@
>>>>> +#include <arm_neon.h>
>>>>> +#include "arm-neon-ref.h"
>>>>> +#include "compute-ref-data.h"
>>>>> +
>>>>> +#define A0 123.4f
>>>>> +#define A1 -3.8f
>>>>> +#define A2 -29.4f
>>>>> +#define A3 (__builtin_inff ())
>>>>> +#define A4 0.0f
>>>>> +#define A5 24.0f
>>>>> +#define A6 124.0f
>>>>> +#define A7 1024.0f
>>>>> +
>>>>> +#define B0 -5.8f
>>>>> +#define B1 -0.0f
>>>>> +#define B2 -10.8f
>>>>> +#define B3 10.0f
>>>>> +#define B4 23.4f
>>>>> +#define B5 -1234.8f
>>>>> +#define B6 8.9f
>>>>> +#define B7 4.0f
>>>>> +
>>>>> +#define E0 9.8f
>>>>> +#define E1 -1024.0f
>>>>> +#define E2 (-__builtin_inff ())
>>>>> +#define E3 479.0f
>>>>> +float32_t elem0 = E0;
>>>>> +float32_t elem1 = E1;
>>>>> +float32_t elem2 = E2;
>>>>> +float32_t elem3 = E3;
>>>>> +
>>>>> +#define DA0 1231234.4
>>>>> +#define DA1 -3.8
>>>>> +#define DA2 -2980.4
>>>>> +#define DA3 -5.8
>>>>> +#define DA4 0.01123
>>>>> +#define DA5 24.0
>>>>> +#define DA6 124.12345
>>>>> +#define DA7 1024.0
>>>>> +
>>>>> +#define DB0 -5.8
>>>>> +#define DB1 (__builtin_inf ())
>>>>> +#define DB2 -105.8
>>>>> +#define DB3 10.0
>>>>> +#define DB4 (-__builtin_inf ())
>>>>> +#define DB5 -1234.8
>>>>> +#define DB6 848.9
>>>>> +#define DB7 44444.0
>>>>> +
>>>>> +#define DE0 9.8
>>>>> +#define DE1 -1024.0
>>>>> +#define DE2 105.8
>>>>> +#define DE3 479.0
>>>>> +float64_t delem0 = DE0;
>>>>> +float64_t delem1 = DE1;
>>>>> +float64_t delem2 = DE2;
>>>>> +float64_t delem3 = DE3;
>>>>> +
>>>>> +#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA)
>>>>> +
>>>>> +/* Expected results for vfms_n.  */
>>>>> +
>>>>> +VECT_VAR_DECL(expectedfms0, float, 32, 2) [] = {A0 + -B0 * E0, A1 + -B1
>>>>> * E0};
>>>>> +VECT_VAR_DECL(expectedfms1, float, 32, 2) [] = {A2 + -B2 * E1, A3 + -B3
>>>>> * E1};
>>>>> +VECT_VAR_DECL(expectedfms2, float, 32, 2) [] = {A4 + -B4 * E2, A5 + -B5
>>>>> * E2};
>>>>> +VECT_VAR_DECL(expectedfms3, float, 32, 2) [] = {A6 + -B6 * E3, A7 + -B7
>>>>> * E3};
>>>>> +VECT_VAR_DECL(expectedfma0, float, 32, 2) [] = {A0 + B0 * E0, A1 + B1 *
>>>>> E0};
>>>>> +VECT_VAR_DECL(expectedfma1, float, 32, 2) [] = {A2 + B2 * E1, A3 + B3 *
>>>>> E1};
>>>>> +VECT_VAR_DECL(expectedfma2, float, 32, 2) [] = {A4 + B4 * E2, A5 + B5 *
>>>>> E2};
>>>>> +VECT_VAR_DECL(expectedfma3, float, 32, 2) [] = {A6 + B6 * E3, A7 + B7 *
>>>>> E3};
>>>>> +
>>>>> +hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 2) =
>>>>> +  (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 2);
>>>>> +hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 2) =
>>>>> +  (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 2);
>>>>> +hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 2) =
>>>>> +  (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 2);
>>>>> +hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 2) =
>>>>> +  (hfloat32_t *) VECT_VAR (expectedfms3, float, 32, 2);
>>>>> +hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 2) =
>>>>> +  (hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 2);
>>>>> +hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 2) =
>>>>> +  (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 2);
>>>>> +hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 2) =
>>>>> +  (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 2);
>>>>> +hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 2) =
>>>>> +  (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 2);
>>>>> +
>>>>> +
>>>>> +VECT_VAR_DECL(expectedfms0, float, 32, 4) [] = {A0 + -B0 * E0, A1 + -B1
>>>>> * E0,
>>>>> +                        A2 + -B2 * E0, A3 + -B3 * E0};
>>>>> +VECT_VAR_DECL(expectedfms1, float, 32, 4) [] = {A4 + -B4 * E1, A5 + -B5
>>>>> * E1,
>>>>> +                        A6 + -B6 * E1, A7 + -B7 * E1};
>>>>> +VECT_VAR_DECL(expectedfms2, float, 32, 4) [] = {A0 + -B0 * E2, A2 + -B2
>>>>> * E2,
>>>>> +                        A4 + -B4 * E2, A6 + -B6 * E2};
>>>>> +VECT_VAR_DECL(expectedfms3, float, 32, 4) [] = {A1 + -B1 * E3, A3 + -B3
>>>>> * E3,
>>>>> +                        A5 + -B5 * E3, A7 + -B7 * E3};
>>>>> +VECT_VAR_DECL(expectedfma0, float, 32, 4) [] = {A0 + B0 * E0, A1 + B1 *
>>>>> E0,
>>>>> +                        A2 + B2 * E0, A3 + B3 * E0};
>>>>> +VECT_VAR_DECL(expectedfma1, float, 32, 4) [] = {A4 + B4 * E1, A5 + B5 *
>>>>> E1,
>>>>> +                        A6 + B6 * E1, A7 + B7 * E1};
>>>>> +VECT_VAR_DECL(expectedfma2, float, 32, 4) [] = {A0 + B0 * E2, A2 + B2 *
>>>>> E2,
>>>>> +                        A4 + B4 * E2, A6 + B6 * E2};
>>>>> +VECT_VAR_DECL(expectedfma3, float, 32, 4) [] = {A1 + B1 * E3, A3 + B3 *
>>>>> E3,
>>>>> +                        A5 + B5 * E3, A7 + B7 * E3};
>>>>> +
>>>>> +hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 4) =
>>>>> +  (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 4);
>>>>> +hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 4) =
>>>>> +  (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 4);
>>>>> +hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 4) =
>>>>> +  (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 4);
>>>>> +hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 4) =
>>>>> +  (hfloat32_t *) VECT_VAR (expectedfms3, float, 32, 4);
>>>>> +hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 4) =
>>>>> +  (hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 4);
>>>>> +hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 4) =
>>>>> +  (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 4);
>>>>> +hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 4) =
>>>>> +  (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 4);
>>>>> +hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 4) =
>>>>> +  (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 4);
>>>>> +
>>>>> +VECT_VAR_DECL(expectedfms0, float, 64, 2) [] = {DA0 + -DB0 * DE0,
>>>>> +                        DA1 + -DB1 * DE0};
>>>>> +VECT_VAR_DECL(expectedfms1, float, 64, 2) [] = {DA2 + -DB2 * DE1,
>>>>> +                        DA3 + -DB3 * DE1};
>>>>> +VECT_VAR_DECL(expectedfms2, float, 64, 2) [] = {DA4 + -DB4 * DE2,
>>>>> +                        DA5 + -DB5 * DE2};
>>>>> +VECT_VAR_DECL(expectedfms3, float, 64, 2) [] = {DA6 + -DB6 * DE3,
>>>>> +                        DA7 + -DB7 * DE3};
>>>>> +VECT_VAR_DECL(expectedfma0, float, 64, 2) [] = {DA0 + DB0 * DE0,
>>>>> +                        DA1 + DB1 * DE0};
>>>>> +VECT_VAR_DECL(expectedfma1, float, 64, 2) [] = {DA2 + DB2 * DE1,
>>>>> +                        DA3 + DB3 * DE1};
>>>>> +VECT_VAR_DECL(expectedfma2, float, 64, 2) [] = {DA4 + DB4 * DE2,
>>>>> +                        DA5 + DB5 * DE2};
>>>>> +VECT_VAR_DECL(expectedfma3, float, 64, 2) [] = {DA6 + DB6 * DE3,
>>>>> +                        DA7 + DB7 * DE3};
>>>>> +hfloat64_t * VECT_VAR (expectedfms0_static, hfloat, 64, 2) =
>>>>> +  (hfloat64_t *) VECT_VAR (expectedfms0, float, 64, 2);
>>>>> +hfloat64_t * VECT_VAR (expectedfms1_static, hfloat, 64, 2) =
>>>>> +  (hfloat64_t *) VECT_VAR (expectedfms1, float, 64, 2);
>>>>> +hfloat64_t * VECT_VAR (expectedfms2_static, hfloat, 64, 2) =
>>>>> +  (hfloat64_t *) VECT_VAR (expectedfms2, float, 64, 2);
>>>>> +hfloat64_t * VECT_VAR (expectedfms3_static, hfloat, 64, 2) =
>>>>> +  (hfloat64_t *) VECT_VAR (expectedfms3, float, 64, 2);
>>>>> +hfloat64_t * VECT_VAR (expectedfma0_static, hfloat, 64, 2) =
>>>>> +  (hfloat64_t *) VECT_VAR (expectedfma0, float, 64, 2);
>>>>> +hfloat64_t * VECT_VAR (expectedfma1_static, hfloat, 64, 2) =
>>>>> +  (hfloat64_t *) VECT_VAR (expectedfma1, float, 64, 2);
>>>>> +hfloat64_t * VECT_VAR (expectedfma2_static, hfloat, 64, 2) =
>>>>> +  (hfloat64_t *) VECT_VAR (expectedfma2, float, 64, 2);
>>>>> +hfloat64_t * VECT_VAR (expectedfma3_static, hfloat, 64, 2) =
>>>>> +  (hfloat64_t *) VECT_VAR (expectedfma3, float, 64, 2);
>>>>> +
>>>>> +VECT_VAR_DECL(expectedfms0, float, 64, 1) [] = {DA0 + -DB0 * DE0};
>>>>> +VECT_VAR_DECL(expectedfms1, float, 64, 1) [] = {DA2 + -DB2 * DE1};
>>>>> +VECT_VAR_DECL(expectedfms2, float, 64, 1) [] = {DA4 + -DB4 * DE2};
>>>>> +VECT_VAR_DECL(expectedfms3, float, 64, 1) [] = {DA6 + -DB6 * DE3};
>>>>> +VECT_VAR_DECL(expectedfma0, float, 64, 1) [] = {DA0 + DB0 * DE0};
>>>>> +VECT_VAR_DECL(expectedfma1, float, 64, 1) [] = {DA2 + DB2 * DE1};
>>>>> +VECT_VAR_DECL(expectedfma2, float, 64, 1) [] = {DA4 + DB4 * DE2};
>>>>> +VECT_VAR_DECL(expectedfma3, float, 64, 1) [] = {DA6 + DB6 * DE3};
>>>>> +
>>>>> +hfloat64_t * VECT_VAR (expectedfms0_static, hfloat, 64, 1) =
>>>>> +  (hfloat64_t *) VECT_VAR (expectedfms0, float, 64, 1);
>>>>> +hfloat64_t * VECT_VAR (expectedfms1_static, hfloat, 64, 1) =
>>>>> +  (hfloat64_t *) VECT_VAR (expectedfms1, float, 64, 1);
>>>>> +hfloat64_t * VECT_VAR (expectedfms2_static, hfloat, 64, 1) =
>>>>> +  (hfloat64_t *) VECT_VAR (expectedfms2, float, 64, 1);
>>>>> +hfloat64_t * VECT_VAR (expectedfms3_static, hfloat, 64, 1) =
>>>>> +  (hfloat64_t *) VECT_VAR (expectedfms3, float, 64, 1);
>>>>> +hfloat64_t * VECT_VAR (expectedfma0_static, hfloat, 64, 1) =
>>>>> +  (hfloat64_t *) VECT_VAR (expectedfma0, float, 64, 1);
>>>>> +hfloat64_t * VECT_VAR (expectedfma1_static, hfloat, 64, 1) =
>>>>> +  (hfloat64_t *) VECT_VAR (expectedfma1, float, 64, 1);
>>>>> +hfloat64_t * VECT_VAR (expectedfma2_static, hfloat, 64, 1) =
>>>>> +  (hfloat64_t *) VECT_VAR (expectedfma2, float, 64, 1);
>>>>> +hfloat64_t * VECT_VAR (expectedfma3_static, hfloat, 64, 1) =
>>>>> +  (hfloat64_t *) VECT_VAR (expectedfma3, float, 64, 1);
>>>>> +
>>>>> +void exec_vfma_vfms_n (void)
>>>>> +{
>>>>> +#undef TEST_MSG
>>>>> +#define TEST_MSG "VFMS_VFMA_N (FP32)"
>>>>> +  clean_results ();
>>>>> +
>>>>> +  DECL_VARIABLE(vsrc_1, float, 32, 2);
>>>>> +  DECL_VARIABLE(vsrc_2, float, 32, 2);
>>>>> +  VECT_VAR_DECL (buf_src_1, float, 32, 2) [] = {A0, A1};
>>>>> +  VECT_VAR_DECL (buf_src_2, float, 32, 2) [] = {B0, B1};
>>>>> +  VLOAD (vsrc_1, buf_src_1, , float, f, 32, 2);
>>>>> +  VLOAD (vsrc_2, buf_src_2, , float, f, 32, 2);
>>>>> +  DECL_VARIABLE (vector_res, float, 32, 2) =
>>>>> +    vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
>>>>> +        VECT_VAR (vsrc_2, float, 32, 2), elem0);
>>>>> +  vst1_f32 (VECT_VAR (result, float, 32, 2),
>>>>> +        VECT_VAR (vector_res, float, 32, 2));
>>>>> +  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms0_static, "");
>>>>> +  VECT_VAR (vector_res, float, 32, 2) =
>>>>> +    vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
>>>>> +        VECT_VAR (vsrc_2, float, 32, 2), elem0);
>>>>> +  vst1_f32 (VECT_VAR (result, float, 32, 2),
>>>>> +        VECT_VAR (vector_res, float, 32, 2));
>>>>> +  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma0_static, "");
>>>>> +
>>>>> +  VECT_VAR_DECL (buf_src_3, float, 32, 2) [] = {A2, A3};
>>>>> +  VECT_VAR_DECL (buf_src_4, float, 32, 2) [] = {B2, B3};
>>>>> +  VLOAD (vsrc_1, buf_src_3, , float, f, 32, 2);
>>>>> +  VLOAD (vsrc_2, buf_src_4, , float, f, 32, 2);
>>>>> +  VECT_VAR (vector_res, float, 32, 2) =
>>>>> +    vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
>>>>> +        VECT_VAR (vsrc_2, float, 32, 2), elem1);
>>>>> +  vst1_f32 (VECT_VAR (result, float, 32, 2),
>>>>> +        VECT_VAR (vector_res, float, 32, 2));
>>>>> +  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms1_static, "");
>>>>> +  VECT_VAR (vector_res, float, 32, 2) =
>>>>> +    vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
>>>>> +        VECT_VAR (vsrc_2, float, 32, 2), elem1);
>>>>> +  vst1_f32 (VECT_VAR (result, float, 32, 2),
>>>>> +        VECT_VAR (vector_res, float, 32, 2));
>>>>> +  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma1_static, "");
>>>>> +
>>>>> +  VECT_VAR_DECL (buf_src_5, float, 32, 2) [] = {A4, A5};
>>>>> +  VECT_VAR_DECL (buf_src_6, float, 32, 2) [] = {B4, B5};
>>>>> +  VLOAD (vsrc_1, buf_src_5, , float, f, 32, 2);
>>>>> +  VLOAD (vsrc_2, buf_src_6, , float, f, 32, 2);
>>>>> +  VECT_VAR (vector_res, float, 32, 2) =
>>>>> +    vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
>>>>> +        VECT_VAR (vsrc_2, float, 32, 2), elem2);
>>>>> +  vst1_f32 (VECT_VAR (result, float, 32, 2),
>>>>> +        VECT_VAR (vector_res, float, 32, 2));
>>>>> +  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms2_static, "");
>>>>> +  VECT_VAR (vector_res, float, 32, 2) =
>>>>> +    vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
>>>>> +        VECT_VAR (vsrc_2, float, 32, 2), elem2);
>>>>> +  vst1_f32 (VECT_VAR (result, float, 32, 2),
>>>>> +        VECT_VAR (vector_res, float, 32, 2));
>>>>> +  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma2_static, "");
>>>>> +
>>>>> +  VECT_VAR_DECL (buf_src_7, float, 32, 2) [] = {A6, A7};
>>>>> +  VECT_VAR_DECL (buf_src_8, float, 32, 2) [] = {B6, B7};
>>>>> +  VLOAD (vsrc_1, buf_src_7, , float, f, 32, 2);
>>>>> +  VLOAD (vsrc_2, buf_src_8, , float, f, 32, 2);
>>>>> +  VECT_VAR (vector_res, float, 32, 2) =
>>>>> +    vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
>>>>> +        VECT_VAR (vsrc_2, float, 32, 2), elem3);
>>>>> +  vst1_f32 (VECT_VAR (result, float, 32, 2),
>>>>> +        VECT_VAR (vector_res, float, 32, 2));
>>>>> +  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms3_static, "");
>>>>> +  VECT_VAR (vector_res, float, 32, 2) =
>>>>> +    vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
>>>>> +        VECT_VAR (vsrc_2, float, 32, 2), elem3);
>>>>> +  vst1_f32 (VECT_VAR (result, float, 32, 2),
>>>>> +        VECT_VAR (vector_res, float, 32, 2));
>>>>> +  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma3_static, "");
>>>>> +
>>>>> +#undef TEST_MSG
>>>>> +#define TEST_MSG "VFMSQ_VFMAQ_N (FP32)"
>>>>> +  clean_results ();
>>>>> +
>>>>> +  DECL_VARIABLE(vsrc_1, float, 32, 4);
>>>>> +  DECL_VARIABLE(vsrc_2, float, 32, 4);
>>>>> +  VECT_VAR_DECL (buf_src_1, float, 32, 4) [] = {A0, A1, A2, A3};
>>>>> +  VECT_VAR_DECL (buf_src_2, float, 32, 4) [] = {B0, B1, B2, B3};
>>>>> +  VLOAD (vsrc_1, buf_src_1, q, float, f, 32, 4);
>>>>> +  VLOAD (vsrc_2, buf_src_2, q, float, f, 32, 4);
>>>>> +  DECL_VARIABLE (vector_res, float, 32, 4) =
>>>>> +    vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
>>>>> +         VECT_VAR (vsrc_2, float, 32, 4), elem0);
>>>>> +  vst1q_f32 (VECT_VAR (result, float, 32, 4),
>>>>> +         VECT_VAR (vector_res, float, 32, 4));
>>>>> +  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms0_static, "");
>>>>> +  VECT_VAR (vector_res, float, 32, 4) =
>>>>> +    vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
>>>>> +         VECT_VAR (vsrc_2, float, 32, 4), elem0);
>>>>> +  vst1q_f32 (VECT_VAR (result, float, 32, 4),
>>>>> +         VECT_VAR (vector_res, float, 32, 4));
>>>>> +  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma0_static, "");
>>>>> +
>>>>> +  VECT_VAR_DECL (buf_src_3, float, 32, 4) [] = {A4, A5, A6, A7};
>>>>> +  VECT_VAR_DECL (buf_src_4, float, 32, 4) [] = {B4, B5, B6, B7};
>>>>> +  VLOAD (vsrc_1, buf_src_3, q, float, f, 32, 4);
>>>>> +  VLOAD (vsrc_2, buf_src_4, q, float, f, 32, 4);
>>>>> +  VECT_VAR (vector_res, float, 32, 4) =
>>>>> +    vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
>>>>> +         VECT_VAR (vsrc_2, float, 32, 4), elem1);
>>>>> +  vst1q_f32 (VECT_VAR (result, float, 32, 4),
>>>>> +         VECT_VAR (vector_res, float, 32, 4));
>>>>> +  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms1_static, "");
>>>>> +  VECT_VAR (vector_res, float, 32, 4) =
>>>>> +    vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
>>>>> +         VECT_VAR (vsrc_2, float, 32, 4), elem1);
>>>>> +  vst1q_f32 (VECT_VAR (result, float, 32, 4),
>>>>> +         VECT_VAR (vector_res, float, 32, 4));
>>>>> +  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma1_static, "");
>>>>> +
>>>>> +  VECT_VAR_DECL (buf_src_5, float, 32, 4) [] = {A0, A2, A4, A6};
>>>>> +  VECT_VAR_DECL (buf_src_6, float, 32, 4) [] = {B0, B2, B4, B6};
>>>>> +  VLOAD (vsrc_1, buf_src_5, q, float, f, 32, 4);
>>>>> +  VLOAD (vsrc_2, buf_src_6, q, float, f, 32, 4);
>>>>> +  VECT_VAR (vector_res, float, 32, 4) =
>>>>> +    vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
>>>>> +         VECT_VAR (vsrc_2, float, 32, 4), elem2);
>>>>> +  vst1q_f32 (VECT_VAR (result, float, 32, 4),
>>>>> +         VECT_VAR (vector_res, float, 32, 4));
>>>>> +  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms2_static, "");
>>>>> +  VECT_VAR (vector_res, float, 32, 4) =
>>>>> +    vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
>>>>> +         VECT_VAR (vsrc_2, float, 32, 4), elem2);
>>>>> +  vst1q_f32 (VECT_VAR (result, float, 32, 4),
>>>>> +         VECT_VAR (vector_res, float, 32, 4));
>>>>> +  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma2_static, "");
>>>>> +
>>>>> +  VECT_VAR_DECL (buf_src_7, float, 32, 4) [] = {A1, A3, A5, A7};
>>>>> +  VECT_VAR_DECL (buf_src_8, float, 32, 4) [] = {B1, B3, B5, B7};
>>>>> +  VLOAD (vsrc_1, buf_src_7, q, float, f, 32, 4);
>>>>> +  VLOAD (vsrc_2, buf_src_8, q, float, f, 32, 4);
>>>>> +  VECT_VAR (vector_res, float, 32, 4) =
>>>>> +    vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
>>>>> +         VECT_VAR (vsrc_2, float, 32, 4), elem3);
>>>>> +  vst1q_f32 (VECT_VAR (result, float, 32, 4),
>>>>> +         VECT_VAR (vector_res, float, 32, 4));
>>>>> +  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms3_static, "");
>>>>> +  VECT_VAR (vector_res, float, 32, 4) =
>>>>> +    vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
>>>>> +         VECT_VAR (vsrc_2, float, 32, 4), elem3);
>>>>> +  vst1q_f32 (VECT_VAR (result, float, 32, 4),
>>>>> +         VECT_VAR (vector_res, float, 32, 4));
>>>>> +  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma3_static, "");
>>>>> +
>>>>> +#undef TEST_MSG
>>>>> +#define TEST_MSG "VFMSQ_VFMAQ_N (FP64)"
>>>>> +  clean_results ();
>>>>> +
>>>>> +  DECL_VARIABLE(vsrc_1, float, 64, 2);
>>>>> +  DECL_VARIABLE(vsrc_2, float, 64, 2);
>>>>> +  VECT_VAR_DECL (buf_src_1, float, 64, 2) [] = {DA0, DA1};
>>>>> +  VECT_VAR_DECL (buf_src_2, float, 64, 2) [] = {DB0, DB1};
>>>>> +  VLOAD (vsrc_1, buf_src_1, q, float, f, 64, 2);
>>>>> +  VLOAD (vsrc_2, buf_src_2, q, float, f, 64, 2);
>>>>> +  DECL_VARIABLE (vector_res, float, 64, 2) =
>>>>> +    vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
>>>>> +         VECT_VAR (vsrc_2, float, 64, 2), delem0);
>>>>> +  vst1q_f64 (VECT_VAR (result, float, 64, 2),
>>>>> +         VECT_VAR (vector_res, float, 64, 2));
>>>>> +  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms0_static, "");
>>>>> +  VECT_VAR (vector_res, float, 64, 2) =
>>>>> +    vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
>>>>> +         VECT_VAR (vsrc_2, float, 64, 2), delem0);
>>>>> +  vst1q_f64 (VECT_VAR (result, float, 64, 2),
>>>>> +         VECT_VAR (vector_res, float, 64, 2));
>>>>> +  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma0_static, "");
>>>>> +
>>>>> +  VECT_VAR_DECL (buf_src_3, float, 64, 2) [] = {DA2, DA3};
>>>>> +  VECT_VAR_DECL (buf_src_4, float, 64, 2) [] = {DB2, DB3};
>>>>> +  VLOAD (vsrc_1, buf_src_3, q, float, f, 64, 2);
>>>>> +  VLOAD (vsrc_2, buf_src_4, q, float, f, 64, 2);
>>>>> +  VECT_VAR (vector_res, float, 64, 2) =
>>>>> +    vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
>>>>> +         VECT_VAR (vsrc_2, float, 64, 2), delem1);
>>>>> +  vst1q_f64 (VECT_VAR (result, float, 64, 2),
>>>>> +         VECT_VAR (vector_res, float, 64, 2));
>>>>> +  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms1_static, "");
>>>>> +  VECT_VAR (vector_res, float, 64, 2) =
>>>>> +    vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
>>>>> +         VECT_VAR (vsrc_2, float, 64, 2), delem1);
>>>>> +  vst1q_f64 (VECT_VAR (result, float, 64, 2),
>>>>> +         VECT_VAR (vector_res, float, 64, 2));
>>>>> +  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma1_static, "");
>>>>> +
>>>>> +  VECT_VAR_DECL (buf_src_5, float, 64, 2) [] = {DA4, DA5};
>>>>> +  VECT_VAR_DECL (buf_src_6, float, 64, 2) [] = {DB4, DB5};
>>>>> +  VLOAD (vsrc_1, buf_src_5, q, float, f, 64, 2);
>>>>> +  VLOAD (vsrc_2, buf_src_6, q, float, f, 64, 2);
>>>>> +  VECT_VAR (vector_res, float, 64, 2) =
>>>>> +    vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
>>>>> +         VECT_VAR (vsrc_2, float, 64, 2), delem2);
>>>>> +  vst1q_f64 (VECT_VAR (result, float, 64, 2),
>>>>> +         VECT_VAR (vector_res, float, 64, 2));
>>>>> +  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms2_static, "");
>>>>> +  VECT_VAR (vector_res, float, 64, 2) =
>>>>> +    vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
>>>>> +         VECT_VAR (vsrc_2, float, 64, 2), delem2);
>>>>> +  vst1q_f64 (VECT_VAR (result, float, 64, 2),
>>>>> +         VECT_VAR (vector_res, float, 64, 2));
>>>>> +  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma2_static, "");
>>>>> +
>>>>> +  VECT_VAR_DECL (buf_src_7, float, 64, 2) [] = {DA6, DA7};
>>>>> +  VECT_VAR_DECL (buf_src_8, float, 64, 2) [] = {DB6, DB7};
>>>>> +  VLOAD (vsrc_1, buf_src_7, q, float, f, 64, 2);
>>>>> +  VLOAD (vsrc_2, buf_src_8, q, float, f, 64, 2);
>>>>> +  VECT_VAR (vector_res, float, 64, 2) =
>>>>> +    vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
>>>>> +         VECT_VAR (vsrc_2, float, 64, 2), delem3);
>>>>> +  vst1q_f64 (VECT_VAR (result, float, 64, 2),
>>>>> +         VECT_VAR (vector_res, float, 64, 2));
>>>>> +  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms3_static, "");
>>>>> +  VECT_VAR (vector_res, float, 64, 2) =
>>>>> +    vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
>>>>> +         VECT_VAR (vsrc_2, float, 64, 2), delem3);
>>>>> +  vst1q_f64 (VECT_VAR (result, float, 64, 2),
>>>>> +         VECT_VAR (vector_res, float, 64, 2));
>>>>> +  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma3_static, "");
>>>>> +
>>>>> +#undef TEST_MSG
>>>>> +#define TEST_MSG "VFMS_VFMA_N (FP64)"
>>>>> +  clean_results ();
>>>>> +
>>>>> +  DECL_VARIABLE(vsrc_1, float, 64, 1);
>>>>> +  DECL_VARIABLE(vsrc_2, float, 64, 1);
>>>>> +  VECT_VAR_DECL (buf_src_1, float, 64, 1) [] = {DA0};
>>>>> +  VECT_VAR_DECL (buf_src_2, float, 64, 1) [] = {DB0};
>>>>> +  VLOAD (vsrc_1, buf_src_1, , float, f, 64, 1);
>>>>> +  VLOAD (vsrc_2, buf_src_2, , float, f, 64, 1);
>>>>> +  DECL_VARIABLE (vector_res, float, 64, 1) =
>>>>> +    vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
>>>>> +        VECT_VAR (vsrc_2, float, 64, 1), delem0);
>>>>> +  vst1_f64 (VECT_VAR (result, float, 64, 1),
>>>>> +         VECT_VAR (vector_res, float, 64, 1));
>>>>> +  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms0_static, "");
>>>>> +  VECT_VAR (vector_res, float, 64, 1) =
>>>>> +    vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
>>>>> +        VECT_VAR (vsrc_2, float, 64, 1), delem0);
>>>>> +  vst1_f64 (VECT_VAR (result, float, 64, 1),
>>>>> +         VECT_VAR (vector_res, float, 64, 1));
>>>>> +  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma0_static, "");
>>>>> +
>>>>> +  VECT_VAR_DECL (buf_src_3, float, 64, 1) [] = {DA2};
>>>>> +  VECT_VAR_DECL (buf_src_4, float, 64, 1) [] = {DB2};
>>>>> +  VLOAD (vsrc_1, buf_src_3, , float, f, 64, 1);
>>>>> +  VLOAD (vsrc_2, buf_src_4, , float, f, 64, 1);
>>>>> +  VECT_VAR (vector_res, float, 64, 1) =
>>>>> +    vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
>>>>> +        VECT_VAR (vsrc_2, float, 64, 1), delem1);
>>>>> +  vst1_f64 (VECT_VAR (result, float, 64, 1),
>>>>> +         VECT_VAR (vector_res, float, 64, 1));
>>>>> +  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms1_static, "");
>>>>> +  VECT_VAR (vector_res, float, 64, 1) =
>>>>> +    vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
>>>>> +        VECT_VAR (vsrc_2, float, 64, 1), delem1);
>>>>> +  vst1_f64 (VECT_VAR (result, float, 64, 1),
>>>>> +         VECT_VAR (vector_res, float, 64, 1));
>>>>> +  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma1_static, "");
>>>>> +
>>>>> +  VECT_VAR_DECL (buf_src_5, float, 64, 1) [] = {DA4};
>>>>> +  VECT_VAR_DECL (buf_src_6, float, 64, 1) [] = {DB4};
>>>>> +  VLOAD (vsrc_1, buf_src_5, , float, f, 64, 1);
>>>>> +  VLOAD (vsrc_2, buf_src_6, , float, f, 64, 1);
>>>>> +  VECT_VAR (vector_res, float, 64, 1) =
>>>>> +    vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
>>>>> +        VECT_VAR (vsrc_2, float, 64, 1), delem2);
>>>>> +  vst1_f64 (VECT_VAR (result, float, 64, 1),
>>>>> +         VECT_VAR (vector_res, float, 64, 1));
>>>>> +  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms2_static, "");
>>>>> +  VECT_VAR (vector_res, float, 64, 1) =
>>>>> +    vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
>>>>> +        VECT_VAR (vsrc_2, float, 64, 1), delem2);
>>>>> +  vst1_f64 (VECT_VAR (result, float, 64, 1),
>>>>> +         VECT_VAR (vector_res, float, 64, 1));
>>>>> +  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma2_static, "");
>>>>> +
>>>>> +  VECT_VAR_DECL (buf_src_7, float, 64, 1) [] = {DA6};
>>>>> +  VECT_VAR_DECL (buf_src_8, float, 64, 1) [] = {DB6};
>>>>> +  VLOAD (vsrc_1, buf_src_7, , float, f, 64, 1);
>>>>> +  VLOAD (vsrc_2, buf_src_8, , float, f, 64, 1);
>>>>> +  VECT_VAR (vector_res, float, 64, 1) =
>>>>> +    vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
>>>>> +        VECT_VAR (vsrc_2, float, 64, 1), delem3);
>>>>> +  vst1_f64 (VECT_VAR (result, float, 64, 1),
>>>>> +         VECT_VAR (vector_res, float, 64, 1));
>>>>> +  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms3_static, "");
>>>>> +  VECT_VAR (vector_res, float, 64, 1) =
>>>>> +    vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
>>>>> +        VECT_VAR (vsrc_2, float, 64, 1), delem3);
>>>>> +  vst1_f64 (VECT_VAR (result, float, 64, 1),
>>>>> +         VECT_VAR (vector_res, float, 64, 1));
>>>>> +  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma3_static, "");
>>>>> +}
>>>>> +#endif
>>>>> +
>>>>> +int
>>>>> +main (void)
>>>>> +{
>>>>> +#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA)
>>>>> +  exec_vfma_vfms_n ();
>>>>> +#endif
>>>>> +  return 0;
>>>>> +}
>>>>> diff --git a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
>>>>> b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
>>>>> index
>>>>> 1ba1fed98a0711496815e00d2d702e5bfa2a7d43..5b348827002dcfef1f589900a4cf5ff7ada26697
>>>>> 100644
>>>>> --- a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
>>>>> +++ b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
>>>>> @@ -110,6 +110,6 @@ main (int argc, char **argv)
>>>>>    /* vfmaq_lane_f64.
>>>>>       vfma_laneq_f64.
>>>>>       vfmaq_laneq_f64.  */
>>>>> -/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d,
>>>>> v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */
>>>>> +/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d,
>>>>> v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */
>>>>>      diff --git a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
>>>>> b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
>>>>> index
>>>>> 887ebae10da715c8d301a8494a2225e53f15bd7d..6c194a023d34ebafb4d732edc303985531f92a63
>>>>> 100644
>>>>> --- a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
>>>>> +++ b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
>>>>> @@ -111,6 +111,6 @@ main (int argc, char **argv)
>>>>>    /* vfmsq_lane_f64.
>>>>>       vfms_laneq_f64.
>>>>>       vfmsq_laneq_f64.  */
>>>>> -/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d,
>>>>> v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */
>>>>> +/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d,
>>>>> v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */
>>>>>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [AArch64, 1/4] Add the missing support of vfms_n_f32, vfmsq_n_f32, vfmsq_n_f64
  2016-05-18  8:25       ` Christophe Lyon
  2016-05-18  8:32         ` Kyrill Tkachov
@ 2016-05-18  9:44         ` Jiong Wang
  2016-05-18 10:00           ` Christophe Lyon
  1 sibling, 1 reply; 8+ messages in thread
From: Jiong Wang @ 2016-05-18  9:44 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: Kyrill Tkachov, James Greenhalgh, GCC Patches

[-- Attachment #1: Type: text/plain, Size: 1513 bytes --]



On 18/05/16 09:25, Christophe Lyon wrote:
>
> Unfortunately, the guard is not correct :(
>
> The float64_t type is not available on arm, so the new
> declarations/definitions in arm-neon-ref.h
> need a guard.
>
> Since this patch was checked-in, all the advsimd intrinsics tests fail
> to compile
> on arm:
> In file included from
> /aci-gcc-fsf/sources/gcc-fsf/gccsrc/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vaba.c:2:0:
> /aci-gcc-fsf/sources/gcc-fsf/gccsrc/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h:139:22:
> error: unknown type name 'float64_t'
> /aci-gcc-fsf/sources/gcc-fsf/gccsrc/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h:51:35:
> note: in definition of macro 'VECT_VAR_DECL'
> /aci-gcc-fsf/sources/gcc-fsf/gccsrc/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h:139:8:
> note: in expansion of macro 'ARRAY'
>
>

Hi Christophe

  Sorry for the breakage.
  I had run the aarch64 regression tests before committing; since this
  directory is shared, I will keep in mind to run the arm tests as well
  next time.

  I committed the following fix, r236370, as obvious, after retesting
  advsimd-intrinsics.exp on both aarch64 and arm with OK results.

gcc/testsuite/

         2016-05-18  Jiong Wang  <jiong.wang@arm.com>

         * gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h: Guard float64_t
         with __aarch64__.
         * gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c: Guard variable
         declarations under __aarch64__ and __ARM_FEATURE_FMA.


[-- Attachment #2: k.patch --]
[-- Type: text/x-patch, Size: 1533 bytes --]

diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
index cf90825..dde0e45 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
@@ -136,8 +136,10 @@ static ARRAY(result, poly, 16, 4);
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
 static ARRAY(result, float, 16, 4);
 #endif
-static ARRAY(result, float, 64, 1);
 static ARRAY(result, float, 32, 2);
+#ifdef __aarch64__
+static ARRAY(result, float, 64, 1);
+#endif
 static ARRAY(result, int, 8, 16);
 static ARRAY(result, int, 16, 8);
 static ARRAY(result, int, 32, 4);
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c
index 2622376..efa9b5f 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c
@@ -2,6 +2,8 @@
 #include "arm-neon-ref.h"
 #include "compute-ref-data.h"
 
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA)
+
 #define A0 123.4f
 #define A1 -3.8f
 #define A2 -29.4f
@@ -56,8 +58,6 @@ float64_t delem1 = DE1;
 float64_t delem2 = DE2;
 float64_t delem3 = DE3;
 
-#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA)
-
 /* Expected results for vfms_n.  */
 
 VECT_VAR_DECL(expectedfms0, float, 32, 2) [] = {A0 + -B0 * E0, A1 + -B1 * E0};
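
For reference, with both hunks applied the shared test file takes the
following overall shape (a sketch; the elided bodies are exactly those in
the original patch), so an arm compilation never sees float64_t:

#include <arm_neon.h>
#include "arm-neon-ref.h"
#include "compute-ref-data.h"

#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA)
/* All constants, the elem/delem scalar variables (including the
   float64_t ones), the expected-result arrays and exec_vfma_vfms_n ()
   now live inside this guard.  */
#endif

int
main (void)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA)
  exec_vfma_vfms_n ();
#endif
  return 0;
}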

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [AArch64, 1/4] Add the missing support of vfms_n_f32, vfmsq_n_f32, vfmsq_n_f64
  2016-05-18  9:44         ` Jiong Wang
@ 2016-05-18 10:00           ` Christophe Lyon
  0 siblings, 0 replies; 8+ messages in thread
From: Christophe Lyon @ 2016-05-18 10:00 UTC (permalink / raw)
  To: Jiong Wang; +Cc: Kyrill Tkachov, James Greenhalgh, GCC Patches

On 18 May 2016 at 11:44, Jiong Wang <jiong.wang@foss.arm.com> wrote:
>
>
> On 18/05/16 09:25, Christophe Lyon wrote:
>>
>>
>> Unfortunately, the guard is not correct :(
>>
>> The float64_t type is not available on arm, so the new
>> declarations/definitions in arm-neon-ref.h
>> need a guard.
>>
>> Since this patch was checked-in, all the advsimd intrinsics tests fail
>> to compile
>> on arm:
>> In file included from
>> /aci-gcc-fsf/sources/gcc-fsf/gccsrc/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vaba.c:2:0:
>> /aci-gcc-fsf/sources/gcc-fsf/gccsrc/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h:139:22:
>> error: unknown type name 'float64_t'
>> /aci-gcc-fsf/sources/gcc-fsf/gccsrc/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h:51:35:
>> note: in definition of macro 'VECT_VAR_DECL'
>> /aci-gcc-fsf/sources/gcc-fsf/gccsrc/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h:139:8:
>> note: in expansion of macro 'ARRAY'
>>
>>
>
> Hi Christophe
>
>  Sorry for the breakage.
>  I had run the aarch64 regression tests before committing; since this
>  directory is shared, I will keep in mind to run the arm tests as well
>  next time.
>
>  I committed the following fix, r236370, as obvious, after retesting
>  advsimd-intrinsics.exp on both aarch64 and arm with OK results.
>
> gcc/testsuite/
>
>         2016-05-18  Jiong Wang  <jiong.wang@arm.com>
>
>         * gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h: Guard
>         float64_t with __aarch64__.
>         * gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c: Guard
>         variable declarations under __aarch64__ and __ARM_FEATURE_FMA.
>

Cool, thanks. I was running the tests with the same patch before
committing too :)

Christophe.

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2016-05-18 10:00 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-05-16  9:09 [AArch64, 1/4] Add the missing support of vfms_n_f32, vfmsq_n_f32, vfmsq_n_f64 Jiong Wang
2016-05-17 12:20 ` James Greenhalgh
2016-05-17 12:40   ` Kyrill Tkachov
2016-05-17 12:42     ` Kyrill Tkachov
2016-05-18  8:25       ` Christophe Lyon
2016-05-18  8:32         ` Kyrill Tkachov
2016-05-18  9:44         ` Jiong Wang
2016-05-18 10:00           ` Christophe Lyon

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).