[GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla and vfma<b/t> for AArch32 AdvSIMD

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla and vfma<b/t> for AArch32 AdvSIMD
@ 2019-12-20 18:46 Delia Burduv
  2020-01-22 17:45 ` Delia Burduv
  0 siblings, 1 reply; 10+ messages in thread
From: Delia Burduv @ 2019-12-20 18:46 UTC (permalink / raw)
  To: gcc-patches; +Cc: nickc, Richard Earnshaw, Ramana Radhakrishnan, Kyrylo Tkachov

[-- Attachment #1: Type: text/plain, Size: 1904 bytes --]

This patch adds the ARMv8.6 ACLE intrinsics for vmmla, vfmab and vfmat 
as part of the BFloat16 extension.
(https://developer.arm.com/docs/101028/latest.)
The intrinsics are declared in arm_neon.h and the RTL patterns are 
defined in neon.md.
Three new tests are added to check assembler output and lane indices.

This patch depends on the Arm back-end patch.
(https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)

Tested for regression on arm-none-eabi and armeb-none-eabi. I don't have 
commit rights, so if this is ok can someone please commit it for me?

gcc/ChangeLog:

2019-11-12  Delia Burduv  <delia.burduv@arm.com>

	* config/arm/arm_neon.h (vbfmmlaq_f32): New.
	  (vbfmlalbq_f32): New.
	  (vbfmlaltq_f32): New.
	  (vbfmlalbq_lane_f32): New.
	  (vbfmlaltq_lane_f32): New.
   	  (vbfmlalbq_laneq_f32): New.
	  (vbfmlaltq_laneq_f32): New.
	* config/arm/arm_neon_builtins.def (vbfmmla): New.
           (vbfmab): New.
           (vbfmat): New.
           (vbfmab_lane): New.
           (vbfmat_lane): New.
           (vbfmab_laneq): New.
           (vbfmat_laneq): New.
  	* config/arm/iterators.md (BF_MA): New int iterator.
           (bt): New int attribute.
           (VQXBF): Copy of VQX with V8BF.
           (V_HALF): Added V8BF.
   	* config/arm/neon.md (neon_vbfmmlav8bf): New insn.
           (neon_vbfma<bt>v8bf): New insn.
           (neon_vbfma<bt>_lanev8bf): New insn.
           (neon_vbfma<bt>_laneqv8bf): New expand.
           (neon_vget_high<mode>): Changed iterator to VQXBF.
	* config/arm/unspecs.md (UNSPEC_BFMMLA): New UNSPEC.
           (UNSPEC_BFMAB): New UNSPEC.
           (UNSPEC_BFMAT): New UNSPEC.

gcc/testsuite/ChangeLog:

2019-11-12  Delia Burduv  <delia.burduv@arm.com>

         * gcc.target/arm/simd/bf16_ma_1.c: New test.
         * gcc.target/arm/simd/bf16_ma_2.c: New test.
         * gcc.target/arm/simd/bf16_mmla_1.c: New test.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: rb12263.patch --]
[-- Type: text/x-patch; name="rb12263.patch", Size: 12794 bytes --]

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 71e7568e4315a9354062dee5442ca4af9d9660a9..097d7bb30ad0109ca2f41885206b1cfb2ce962dc 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -91,6 +91,60 @@ typedef float float32_t;
 #ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
 typedef __simd128_bfloat16_t bfloat16x8_t;
 typedef __simd64_bfloat16_t bfloat16x4_t;
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmmlaq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfmmlav8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfmabv8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfmatv8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+		    const int __index)
+{
+  return __builtin_neon_vbfmab_lanev8bf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+		    const int __index)
+{
+  return __builtin_neon_vbfmat_lanev8bf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+		     const int __index)
+{
+  return __builtin_neon_vbfmab_laneqv8bf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+		     const int __index)
+{
+  return __builtin_neon_vbfmat_laneqv8bf (__r, __a, __b, __index);
+}
+
 #endif
 #pragma GCC pop_options
 #pragma GCC pop_options
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def
index bcccf93f7fa2750e9006e5856efecbec0fb331b9..169781fa9a07930eb755165019427be055dc36ef 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -373,3 +373,12 @@ VAR2 (MAC_LANE_PAIR, vcmlaq_lane0, v4sf, v8hf)
 VAR2 (MAC_LANE_PAIR, vcmlaq_lane90, v4sf, v8hf)
 VAR2 (MAC_LANE_PAIR, vcmlaq_lane180, v4sf, v8hf)
 VAR2 (MAC_LANE_PAIR, vcmlaq_lane270, v4sf, v8hf)
+
+VAR1 (TERNOP, vbfmmla, v8bf)
+
+VAR1 (TERNOP, vbfmab, v8bf)
+VAR1 (TERNOP, vbfmat, v8bf)
+VAR1 (MAC_LANE, vbfmab_lane, v8bf)
+VAR1 (MAC_LANE, vbfmat_lane, v8bf)
+VAR1 (MAC_LANE, vbfmab_laneq, v8bf)
+VAR1 (MAC_LANE, vbfmat_laneq, v8bf)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 439021fa0733ac31706287c4f98d62b080afc3a1..b31f54ffe8957d3dad0a7e3d3fedc48911e7b2c4 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -108,6 +108,9 @@
 ;; Quad-width vector modes plus 64-bit elements.
 (define_mode_iterator VQX [V16QI V8HI V8HF V4SI V4SF V2DI])
 
+;; Quad-width vector modes plus 64-bit elements and V8BF.
+(define_mode_iterator VQXBF [V16QI V8HI V8HF (V8BF "TARGET_BF16_SIMD") V4SI V4SF V2DI])
+
 ;; Quad-width vector modes without floating-point elements.
 (define_mode_iterator VQI [V16QI V8HI V4SI])
 
@@ -488,6 +491,8 @@
 (define_int_iterator VCADD [UNSPEC_VCADD90 UNSPEC_VCADD270])
 (define_int_iterator VCMLA [UNSPEC_VCMLA UNSPEC_VCMLA90 UNSPEC_VCMLA180 UNSPEC_VCMLA270])
 
+(define_int_iterator BF_MA [UNSPEC_BFMAB UNSPEC_BFMAT])
+
 ;;----------------------------------------------------------------------------
 ;; Mode attributes
 ;;----------------------------------------------------------------------------
@@ -612,7 +617,8 @@
 (define_mode_attr V_HALF [(V16QI "V8QI") (V8HI "V4HI")
 			  (V8HF "V4HF") (V4SI  "V2SI")
 			  (V4SF "V2SF") (V2DF "DF")
-			  (V2DI "DI") (V4HF "HF")])
+			  (V2DI "DI") (V4HF "HF")
+			  (V8BF "V4BF")])
 
 ;; Same, but lower-case.
 (define_mode_attr V_half [(V16QI "v8qi") (V8HI "v4hi")
@@ -1174,4 +1180,7 @@
 (define_int_attr opsuffix [(UNSPEC_DOT_S "s8")
 			   (UNSPEC_DOT_U "u8")])
 
+;; Int attribute giving the "b"/"t" mnemonic suffix of VFMA<bt> for each BF_MA unspec.
+(define_int_attr bt [(UNSPEC_BFMAB "b") (UNSPEC_BFMAT "t")])
+
 (define_int_attr smlaw_op [(UNSPEC_SMLAWB "smlawb") (UNSPEC_SMLAWT "smlawt")])
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index b724aab65f720bf0e48bb828f0874426effd235c..42763de178a96422f9df7f4500e4328adfa81d27 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -3879,7 +3879,7 @@ if (BYTES_BIG_ENDIAN)
 
 (define_expand "neon_vget_high<mode>"
   [(match_operand:<V_HALF> 0 "s_register_operand")
-   (match_operand:VQX 1 "s_register_operand")]
+   (match_operand:VQXBF 1 "s_register_operand")]
   "TARGET_NEON"
 {
   emit_move_insn (operands[0],
@@ -6556,3 +6556,62 @@ if (BYTES_BIG_ENDIAN)
  "vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
  [(set_attr "type" "neon_fp_abd_s<q>")]
 )
+
+(define_insn "neon_vbfmmlav8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+        (plus:V4SF (match_operand:V4SF 1 "register_operand" "0")
+                   (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+                                 (match_operand:V8BF 3 "register_operand" "w")]
+                    UNSPEC_BFMMLA)))]
+  "TARGET_BF16_SIMD"
+  "vmmla.bf16\\t%q0, %q2, %q3"
+  [(set_attr "type" "neon_mla_s_q")]
+)
+
+(define_insn "neon_vbfma<bt>v8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+        (plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
+                    (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+                                  (match_operand:V8BF 3 "register_operand" "w")]
+                     BF_MA)))]
+  "TARGET_BF16_SIMD"
+  "vfma<bt>.bf16\\t%q0, %q2, %q3"
+  [(set_attr "type" "neon_fp_mla_s")]
+)
+
+(define_insn "neon_vbfma<bt>_lanev8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+        (plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
+                    (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+                                  (match_operand:V4BF 3 "register_operand" "x")
+                                  (match_operand:SI 4 "const_int_operand" "n")]
+                     BF_MA)))]
+  "TARGET_BF16_SIMD"
+  "vfma<bt>.bf16\\t%q0, %q2, %P3[%c4]"
+  [(set_attr "type" "neon_fp_mla_s")]
+)
+
+(define_expand "neon_vbfma<bt>_laneqv8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+        (plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
+                    (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+                                  (match_operand:V8BF 3 "register_operand" "x")
+                                  (match_operand:SI 4 "const_int_operand" "n")]
+                     BF_MA)))]
+  "TARGET_BF16_SIMD"
+  {
+    int lane = INTVAL (operands[4]);
+    gcc_assert (lane >=0 && lane <=7);
+    if (lane < 4)
+	emit_insn (gen_neon_vbfma<bt>_lanev8bf (operands[0], operands[1], operands[2], operands[3], operands[4]));
+    else
+      {
+	rtx op_highpart = gen_reg_rtx (V4BFmode);
+	emit_insn (gen_neon_vget_highv8bf (op_highpart, operands[3]));
+	operands[4] = GEN_INT (lane - 4);
+	emit_insn (gen_neon_vbfma<bt>_lanev8bf (operands[0], operands[1], operands[2], op_highpart, operands[4]));
+      }
+    DONE;
+  }
+  [(set_attr "type" "neon_fp_mla_s")]
+)
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index b4196b0e5cd939c3ee5e3f9bd19622fcc963adae..f452082b4bdb3a22a8e3b62113bb7f9470279e93 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -493,4 +493,7 @@
   UNSPEC_VCMLA90
   UNSPEC_VCMLA180
   UNSPEC_VCMLA270
+  UNSPEC_BFMMLA
+  UNSPEC_BFMAB
+  UNSPEC_BFMAT
 ])
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..ead3e9d569f45f5507985e5d7cb12e0541349dd1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c
@@ -0,0 +1,84 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps" }  */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "arm_neon.h"
+
+/*
+**test_vbfmlalbq_f32:
+**      ...
+**      vfmab.bf16\tq[0-9]+, q[0-9]+, q[0-9]+
+**      ...
+*/
+float32x4_t
+test_vbfmlalbq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlalbq_f32 (r, a, b);
+}
+
+/*
+**test_vbfmlaltq_f32:
+**      ...
+**      vfmat.bf16\tq[0-9]+, q[0-9]+, q[0-9]+
+**      ...
+*/
+float32x4_t
+test_vbfmlaltq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlaltq_f32 (r, a, b);
+}
+
+/*
+**test_vbfmlalbq_lane_f32:
+**      ...
+**      vfmab.bf16\tq[0-9]+, q[0-9]+, d[0-9]+\[0\]
+**      ...
+*/
+float32x4_t
+test_vbfmlalbq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return vbfmlalbq_lane_f32 (r, a, b, 0);
+}
+
+/*
+**test_vbfmlaltq_lane_f32:
+**      ...
+**      vfmat.bf16\tq[0-9]+, q[0-9]+, d[0-9]+\[2\]
+**      ...
+*/
+float32x4_t
+test_vbfmlaltq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return vbfmlaltq_lane_f32 (r, a, b, 2);
+}
+
+/*
+**test_vbfmlalbq_laneq_f32:
+**      ...
+**      vfmab.bf16\tq[0-9]+, q[0-9]+, d[0-9]+\[1\]
+**      ...
+*/
+float32x4_t
+test_vbfmlalbq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlalbq_laneq_f32 (r, a, b, 5);
+}
+
+/*
+**test_vbfmlaltq_laneq_f32:
+**      ...
+**      vfmat.bf16\tq[0-9]+, q[0-9]+, d[0-9]+\[3\]
+**      ...
+*/
+float32x4_t
+test_vbfmlaltq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlaltq_laneq_f32 (r, a, b, 7);
+}
+
+int main()
+{
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..226ed7e1d8e4747d73b0518c809aaf0e3c5bc78d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c
@@ -0,0 +1,31 @@
+/* { dg-do compile { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include "arm_neon.h"
+
+/* Test lane index limits for vbfmlalbq_lane_f32  */
+float32x4_t
+test_vbfmlalbq_lane_f32_low (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return __builtin_neon_vbfmab_lanev8bf (r, a, b, -1); /* { dg-error {lane -1 out of range 0 - 3} } */
+}
+
+float32x4_t
+test_vbfmlalbq_lane_f32_high (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return __builtin_neon_vbfmab_lanev8bf (r, a, b, 4); /* { dg-error {lane 4 out of range 0 - 3} } */
+}
+
+/* Test lane index limits for vbfmlaltq_lane_f32  */
+float32x4_t
+test_vbfmlaltq_lane_f32_low (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return __builtin_neon_vbfmat_lanev8bf (r, a, b, -1); /* { dg-error {lane -1 out of range 0 - 3} } */
+}
+
+float32x4_t
+test_vbfmlaltq_lane_f32_high (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return __builtin_neon_vbfmat_lanev8bf (r, a, b, 4); /* { dg-error {lane 4 out of range 0 - 3} } */
+}
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..0c7422b78c385850eaa53492af0da8826e8b3b4a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c
@@ -0,0 +1,24 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "arm_neon.h"
+
+/*
+**test_vbfmmlaq_f32:
+**	...
+**	vmmla.bf16\tq[0-9]+, q[0-9]+, q[0-9]+
+**	...
+*/
+float32x4_t
+test_vbfmmlaq_f32 (float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+  return vbfmmlaq_f32 (r, x, y);
+}
+
+int main()
+{
+  return 0;
+}

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla and vfma<b/t> for AArch32 AdvSIMD
  2019-12-20 18:46 [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla and vfma<b/t> for AArch32 AdvSIMD Delia Burduv
@ 2020-01-22 17:45 ` Delia Burduv
  2020-01-28 16:52   ` Delia Burduv
  0 siblings, 1 reply; 10+ messages in thread
From: Delia Burduv @ 2020-01-22 17:45 UTC (permalink / raw)
  To: gcc-patches; +Cc: nickc, Richard Earnshaw, Ramana Radhakrishnan, Kyrylo Tkachov

Ping.

I have read Richard Sandiford's comments on the AArch64 patches and I 
will apply what is relevant to this patch as well. Particularly, I will 
change the tests to use the exact input and output registers and I will 
change the types of the rtl patterns.

On 12/20/19 6:44 PM, Delia Burduv wrote:
> This patch adds the ARMv8.6 ACLE intrinsics for vmmla, vfmab and vfmat 
> as part of the BFloat16 extension.
> (https://developer.arm.com/docs/101028/latest.)
> The intrinsics are declared in arm_neon.h and the RTL patterns are 
> defined in neon.md.
> Two new tests are added to check assembler output and lane indices.
> 
> This patch depends on the Arm back-end patche. 
> (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)
> 
> Tested for regression on arm-none-eabi and armeb-none-eabi. I don't have 
> commit rights, so if this is ok can someone please commit it for me?
> 
> gcc/ChangeLog:
> 
> 2019-11-12  Delia Burduv  <delia.burduv@arm.com>
> 
>      * config/arm/arm_neon.h (vbfmmlaq_f32): New.
>        (vbfmlalbq_f32): New.
>        (vbfmlaltq_f32): New.
>        (vbfmlalbq_lane_f32): New.
>        (vbfmlaltq_lane_f32): New.
>          (vbfmlalbq_laneq_f32): New.
>        (vbfmlaltq_laneq_f32): New.
>      * config/arm/arm_neon_builtins.def (vbfmmla): New.
>            (vbfmab): New.
>            (vbfmat): New.
>            (vbfmab_lane): New.
>            (vbfmat_lane): New.
>            (vbfmab_laneq): New.
>            (vbfmat_laneq): New.
>       * config/arm/iterators.md (BF_MA): New int iterator.
>            (bt): New int attribute.
>            (VQXBF): Copy of VQX with V8BF.
>            (V_HALF): Added V8BF.
>        * config/arm/neon.md (neon_vbfmmlav8hi): New insn.
>            (neon_vbfma<bt>v8hi): New insn.
>            (neon_vbfma<bt>_lanev8hi): New insn.
>            (neon_vbfma<bt>_laneqv8hi): New expand.
>            (neon_vget_high<mode>): Changed iterator to VQXBF.
>      * config/arm/unspecs.md (UNSPEC_BFMMLA): New UNSPEC.
>            (UNSPEC_BFMAB): New UNSPEC.
>            (UNSPEC_BFMAT): New UNSPEC.
> 
> 2019-11-12  Delia Burduv  <delia.burduv@arm.com>
> 
>          * gcc.target/arm/simd/bf16_ma_1.c: New test.
>          * gcc.target/arm/simd/bf16_ma_2.c: New test.
>          * gcc.target/arm/simd/bf16_mmla_1.c: New test.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla and vfma<b/t> for AArch32 AdvSIMD
  2020-01-22 17:45 ` Delia Burduv
@ 2020-01-28 16:52   ` Delia Burduv
  2020-01-30 15:55     ` Kyrill Tkachov
  0 siblings, 1 reply; 10+ messages in thread
From: Delia Burduv @ 2020-01-28 16:52 UTC (permalink / raw)
  To: gcc-patches; +Cc: nickc, Richard Earnshaw, Ramana Radhakrishnan, Kyrylo Tkachov

Ping.
________________________________
From: Delia Burduv <delia.burduv@arm.com>
Sent: 22 January 2020 17:26
To: gcc-patches@gcc.gnu.org <gcc-patches@gcc.gnu.org>
Cc: nickc@redhat.com <nickc@redhat.com>; Richard Earnshaw <Richard.Earnshaw@arm.com>; Ramana Radhakrishnan <Ramana.Radhakrishnan@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
Subject: Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla and vfma<b/t> for AArch32 AdvSIMD

Ping.

I have read Richard Sandiford's comments on the AArch64 patches and I
will apply what is relevant to this patch as well. Particularly, I will
change the tests to use the exact input and output registers and I will
change the types of the rtl patterns.

On 12/20/19 6:44 PM, Delia Burduv wrote:
> This patch adds the ARMv8.6 ACLE intrinsics for vmmla, vfmab and vfmat
> as part of the BFloat16 extension.
> (https://developer.arm.com/docs/101028/latest.)
> The intrinsics are declared in arm_neon.h and the RTL patterns are
> defined in neon.md.
> Two new tests are added to check assembler output and lane indices.
>
> This patch depends on the Arm back-end patche.
> (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)
>
> Tested for regression on arm-none-eabi and armeb-none-eabi. I don't have
> commit rights, so if this is ok can someone please commit it for me?
>
> gcc/ChangeLog:
>
> 2019-11-12  Delia Burduv  <delia.burduv@arm.com>
>
>      * config/arm/arm_neon.h (vbfmmlaq_f32): New.
>        (vbfmlalbq_f32): New.
>        (vbfmlaltq_f32): New.
>        (vbfmlalbq_lane_f32): New.
>        (vbfmlaltq_lane_f32): New.
>          (vbfmlalbq_laneq_f32): New.
>        (vbfmlaltq_laneq_f32): New.
>      * config/arm/arm_neon_builtins.def (vbfmmla): New.
>            (vbfmab): New.
>            (vbfmat): New.
>            (vbfmab_lane): New.
>            (vbfmat_lane): New.
>            (vbfmab_laneq): New.
>            (vbfmat_laneq): New.
>       * config/arm/iterators.md (BF_MA): New int iterator.
>            (bt): New int attribute.
>            (VQXBF): Copy of VQX with V8BF.
>            (V_HALF): Added V8BF.
>        * config/arm/neon.md (neon_vbfmmlav8hi): New insn.
>            (neon_vbfma<bt>v8hi): New insn.
>            (neon_vbfma<bt>_lanev8hi): New insn.
>            (neon_vbfma<bt>_laneqv8hi): New expand.
>            (neon_vget_high<mode>): Changed iterator to VQXBF.
>      * config/arm/unspecs.md (UNSPEC_BFMMLA): New UNSPEC.
>            (UNSPEC_BFMAB): New UNSPEC.
>            (UNSPEC_BFMAT): New UNSPEC.
>
> 2019-11-12  Delia Burduv  <delia.burduv@arm.com>
>
>          * gcc.target/arm/simd/bf16_ma_1.c: New test.
>          * gcc.target/arm/simd/bf16_ma_2.c: New test.
>          * gcc.target/arm/simd/bf16_mmla_1.c: New test.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla and vfma<b/t> for AArch32 AdvSIMD
  2020-01-28 16:52   ` Delia Burduv
@ 2020-01-30 15:55     ` Kyrill Tkachov
  2020-01-31 16:21       ` Delia Burduv
  0 siblings, 1 reply; 10+ messages in thread
From: Kyrill Tkachov @ 2020-01-30 15:55 UTC (permalink / raw)
  To: Delia Burduv, gcc-patches; +Cc: nickc, Richard Earnshaw, Ramana Radhakrishnan

Hi Delia,


On 1/28/20 4:44 PM, Delia Burduv wrote:
> Ping.
> ------------------------------------------------------------------------
> *From:* Delia Burduv <delia.burduv@arm.com>
> *Sent:* 22 January 2020 17:26
> *To:* gcc-patches@gcc.gnu.org <gcc-patches@gcc.gnu.org>
> *Cc:* nickc@redhat.com <nickc@redhat.com>; Richard Earnshaw 
> <Richard.Earnshaw@arm.com>; Ramana Radhakrishnan 
> <Ramana.Radhakrishnan@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> *Subject:* Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla 
> and vfma<b/t> for AArch32 AdvSIMD
> Ping.
>
> I have read Richard Sandiford's comments on the AArch64 patches and I
> will apply what is relevant to this patch as well. Particularly, I will
> change the tests to use the exact input and output registers and I will
> change the types of the rtl patterns.


Please send the updated patches so that someone can commit them for you 
once they're reviewed.

Thanks,

Kyrill


>
> On 12/20/19 6:44 PM, Delia Burduv wrote:
> > This patch adds the ARMv8.6 ACLE intrinsics for vmmla, vfmab and vfmat
> > as part of the BFloat16 extension.
> > (https://developer.arm.com/docs/101028/latest.)
> > The intrinsics are declared in arm_neon.h and the RTL patterns are
> > defined in neon.md.
> > Two new tests are added to check assembler output and lane indices.
> >
> > This patch depends on the Arm back-end patche.
> > (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)
> >
> > Tested for regression on arm-none-eabi and armeb-none-eabi. I don't 
> have
> > commit rights, so if this is ok can someone please commit it for me?
> >
> > gcc/ChangeLog:
> >
> > 2019-11-12  Delia Burduv <delia.burduv@arm.com>
> >
> >      * config/arm/arm_neon.h (vbfmmlaq_f32): New.
> >        (vbfmlalbq_f32): New.
> >        (vbfmlaltq_f32): New.
> >        (vbfmlalbq_lane_f32): New.
> >        (vbfmlaltq_lane_f32): New.
> >          (vbfmlalbq_laneq_f32): New.
> >        (vbfmlaltq_laneq_f32): New.
> >      * config/arm/arm_neon_builtins.def (vbfmmla): New.
> >            (vbfmab): New.
> >            (vbfmat): New.
> >            (vbfmab_lane): New.
> >            (vbfmat_lane): New.
> >            (vbfmab_laneq): New.
> >            (vbfmat_laneq): New.
> >       * config/arm/iterators.md (BF_MA): New int iterator.
> >            (bt): New int attribute.
> >            (VQXBF): Copy of VQX with V8BF.
> >            (V_HALF): Added V8BF.
> >        * config/arm/neon.md (neon_vbfmmlav8hi): New insn.
> >            (neon_vbfma<bt>v8hi): New insn.
> >            (neon_vbfma<bt>_lanev8hi): New insn.
> >            (neon_vbfma<bt>_laneqv8hi): New expand.
> >            (neon_vget_high<mode>): Changed iterator to VQXBF.
> >      * config/arm/unspecs.md (UNSPEC_BFMMLA): New UNSPEC.
> >            (UNSPEC_BFMAB): New UNSPEC.
> >            (UNSPEC_BFMAT): New UNSPEC.
> >
> > 2019-11-12  Delia Burduv <delia.burduv@arm.com>
> >
> >          * gcc.target/arm/simd/bf16_ma_1.c: New test.
> >          * gcc.target/arm/simd/bf16_ma_2.c: New test.
> >          * gcc.target/arm/simd/bf16_mmla_1.c: New test.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla and vfma<b/t> for AArch32 AdvSIMD
  2020-01-30 15:55     ` Kyrill Tkachov
@ 2020-01-31 16:21       ` Delia Burduv
  2020-02-19 17:23         ` Delia Burduv
  0 siblings, 1 reply; 10+ messages in thread
From: Delia Burduv @ 2020-01-31 16:21 UTC (permalink / raw)
  To: Kyrill Tkachov, gcc-patches; +Cc: nickc, Richard Earnshaw, Ramana Radhakrishnan

[-- Attachment #1: Type: text/plain, Size: 3453 bytes --]

Here is the updated patch. The changes are minor, so let me know if 
there is anything else to fix or if it can be committed.

Thank you,
Delia

On 1/30/20 2:55 PM, Kyrill Tkachov wrote:
> Hi Delia,
> 
> 
> On 1/28/20 4:44 PM, Delia Burduv wrote:
>> Ping.
>> ------------------------------------------------------------------------
>> *From:* Delia Burduv <delia.burduv@arm.com>
>> *Sent:* 22 January 2020 17:26
>> *To:* gcc-patches@gcc.gnu.org <gcc-patches@gcc.gnu.org>
>> *Cc:* nickc@redhat.com <nickc@redhat.com>; Richard Earnshaw 
>> <Richard.Earnshaw@arm.com>; Ramana Radhakrishnan 
>> <Ramana.Radhakrishnan@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
>> *Subject:* Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla 
>> and vfma<b/t> for AArch32 AdvSIMD
>> Ping.
>>
>> I have read Richard Sandiford's comments on the AArch64 patches and I
>> will apply what is relevant to this patch as well. Particularly, I will
>> change the tests to use the exact input and output registers and I will
>> change the types of the rtl patterns.
> 
> 
> Please send the updated patches so that someone can commit them for you 
> once they're reviewed.
> 
> Thanks,
> 
> Kyrill
> 
> 
>>
>> On 12/20/19 6:44 PM, Delia Burduv wrote:
>> > This patch adds the ARMv8.6 ACLE intrinsics for vmmla, vfmab and vfmat
>> > as part of the BFloat16 extension.
>> > (https://developer.arm.com/docs/101028/latest.)
>> > The intrinsics are declared in arm_neon.h and the RTL patterns are
>> > defined in neon.md.
>> > Two new tests are added to check assembler output and lane indices.
>> >
>> > This patch depends on the Arm back-end patche.
>> > (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)
>> >
>> > Tested for regression on arm-none-eabi and armeb-none-eabi. I don't 
>> have
>> > commit rights, so if this is ok can someone please commit it for me?
>> >
>> > gcc/ChangeLog:
>> >
>> > 2019-11-12  Delia Burduv <delia.burduv@arm.com>
>> >
>> >      * config/arm/arm_neon.h (vbfmmlaq_f32): New.
>> >        (vbfmlalbq_f32): New.
>> >        (vbfmlaltq_f32): New.
>> >        (vbfmlalbq_lane_f32): New.
>> >        (vbfmlaltq_lane_f32): New.
>> >          (vbfmlalbq_laneq_f32): New.
>> >        (vbfmlaltq_laneq_f32): New.
>> >      * config/arm/arm_neon_builtins.def (vbfmmla): New.
>> >            (vbfmab): New.
>> >            (vbfmat): New.
>> >            (vbfmab_lane): New.
>> >            (vbfmat_lane): New.
>> >            (vbfmab_laneq): New.
>> >            (vbfmat_laneq): New.
>> >       * config/arm/iterators.md (BF_MA): New int iterator.
>> >            (bt): New int attribute.
>> >            (VQXBF): Copy of VQX with V8BF.
>> >            (V_HALF): Added V8BF.
>> >        * config/arm/neon.md (neon_vbfmmlav8hi): New insn.
>> >            (neon_vbfma<bt>v8hi): New insn.
>> >            (neon_vbfma<bt>_lanev8hi): New insn.
>> >            (neon_vbfma<bt>_laneqv8hi): New expand.
>> >            (neon_vget_high<mode>): Changed iterator to VQXBF.
>> >      * config/arm/unspecs.md (UNSPEC_BFMMLA): New UNSPEC.
>> >            (UNSPEC_BFMAB): New UNSPEC.
>> >            (UNSPEC_BFMAT): New UNSPEC.
>> >
>> > 2019-11-12  Delia Burduv <delia.burduv@arm.com>
>> >
>> >          * gcc.target/arm/simd/bf16_ma_1.c: New test.
>> >          * gcc.target/arm/simd/bf16_ma_2.c: New test.
>> >          * gcc.target/arm/simd/bf16_mmla_1.c: New test.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: rb12263.patch --]
[-- Type: text/x-patch; name="rb12263.patch", Size: 12772 bytes --]

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 3c78f435009ab027f92693d00ab5b40960d5419d..81f8008ea6a5fb11eb09f6685ba24bb0c54fb248 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -18742,6 +18742,64 @@ vcmlaq_rot270_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b,
   return __builtin_neon_vcmla_lane270v4sf (__r, __a, __b, __index);
 }
 
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmmlaq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfmmlav8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfmabv8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfmatv8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+		   const int __index)
+{
+  return __builtin_neon_vbfmab_lanev8bf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+		   const int __index)
+{
+  return __builtin_neon_vbfmat_lanev8bf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+		    const int __index)
+{
+  return __builtin_neon_vbfmab_laneqv8bf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+		    const int __index)
+{
+  return __builtin_neon_vbfmat_laneqv8bf (__r, __a, __b, __index);
+}
+
+#pragma GCC pop_options
+
 #pragma GCC pop_options
 #endif
 
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def
index e9ff4e501cbb5d16b9211f5bc96db376ddf21afc..cc06783daf393f7166fd922f86b3db79c02ba188 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -373,3 +373,12 @@ VAR2 (MAC_LANE_PAIR, vcmlaq_lane0, v4sf, v8hf)
 VAR2 (MAC_LANE_PAIR, vcmlaq_lane90, v4sf, v8hf)
 VAR2 (MAC_LANE_PAIR, vcmlaq_lane180, v4sf, v8hf)
 VAR2 (MAC_LANE_PAIR, vcmlaq_lane270, v4sf, v8hf)
+
+VAR1 (TERNOP, vbfmmla, v8bf)
+
+VAR1 (TERNOP, vbfmab, v8bf)
+VAR1 (TERNOP, vbfmat, v8bf)
+VAR1 (MAC_LANE, vbfmab_lane, v8bf)
+VAR1 (MAC_LANE, vbfmat_lane, v8bf)
+VAR1 (MAC_LANE, vbfmab_laneq, v8bf)
+VAR1 (MAC_LANE, vbfmat_laneq, v8bf)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 33e29509f00a89fa23d0546687c0e4643f0b32d2..72b8ce0bb26dcd520603b907b4f86a74d0164332 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -106,6 +106,9 @@
 ;; Quad-width vector modes plus 64-bit elements.
 (define_mode_iterator VQX [V16QI V8HI V8HF V4SI V4SF V2DI])
 
+;; Quad-width vector modes plus 64-bit elements and V8BF.
+(define_mode_iterator VQXBF [V16QI V8HI V8HF (V8BF "TARGET_BF16_SIMD") V4SI V4SF V2DI])
+
 ;; Quad-width vector modes without floating-point elements.
 (define_mode_iterator VQI [V16QI V8HI V4SI])
 
@@ -485,6 +488,8 @@
 (define_int_iterator VCADD [UNSPEC_VCADD90 UNSPEC_VCADD270])
 (define_int_iterator VCMLA [UNSPEC_VCMLA UNSPEC_VCMLA90 UNSPEC_VCMLA180 UNSPEC_VCMLA270])
 
+(define_int_iterator BF_MA [UNSPEC_BFMAB UNSPEC_BFMAT])
+
 ;;----------------------------------------------------------------------------
 ;; Mode attributes
 ;;----------------------------------------------------------------------------
@@ -609,7 +614,8 @@
 (define_mode_attr V_HALF [(V16QI "V8QI") (V8HI "V4HI")
 			  (V8HF "V4HF") (V4SI  "V2SI")
 			  (V4SF "V2SF") (V2DF "DF")
-			  (V2DI "DI") (V4HF "HF")])
+			  (V2DI "DI") (V4HF "HF")
+			  (V8BF "V4BF")])
 
 ;; Same, but lower-case.
 (define_mode_attr V_half [(V16QI "v8qi") (V8HI "v4hi")
@@ -1171,4 +1177,7 @@
 (define_int_attr opsuffix [(UNSPEC_DOT_S "s8")
 			   (UNSPEC_DOT_U "u8")])
 
+;; An iterator for VFMA<bt>
+(define_int_attr bt [(UNSPEC_BFMAB "b") (UNSPEC_BFMAT "t")])
+
 (define_int_attr smlaw_op [(UNSPEC_SMLAWB "smlawb") (UNSPEC_SMLAWT "smlawt")])
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 6087ca6f2badde6a492bb515a2cb5846f3d4ad8e..4e0d0b5c317a81839de9dee581c5e351d3193dfa 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -3875,7 +3875,7 @@ if (BYTES_BIG_ENDIAN)
 
 (define_expand "neon_vget_high<mode>"
   [(match_operand:<V_HALF> 0 "s_register_operand")
-   (match_operand:VQX 1 "s_register_operand")]
+   (match_operand:VQXBF 1 "s_register_operand")]
   "TARGET_NEON"
 {
   emit_move_insn (operands[0],
@@ -6552,3 +6552,64 @@ if (BYTES_BIG_ENDIAN)
  "vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
  [(set_attr "type" "neon_fp_abd_s<q>")]
 )
+
+(define_insn "neon_vbfmmlav8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+        (plus:V4SF (match_operand:V4SF 1 "register_operand" "0")
+                   (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+                                 (match_operand:V8BF 3 "register_operand" "w")]
+                    UNSPEC_BFMMLA)))]
+  "TARGET_BF16_SIMD"
+  "vmmla.bf16\\t%q0, %q2, %q3"
+  [(set_attr "type" "neon_fp_mla_s_q")]
+)
+
+(define_insn "neon_vbfma<bt>v8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+        (plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
+                    (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+                                  (match_operand:V8BF 3 "register_operand" "w")]
+                     BF_MA)))]
+  "TARGET_BF16_SIMD"
+  "vfma<bt>.bf16\\t%q0, %q2, %q3"
+  [(set_attr "type" "neon_fp_mla_s_q")]
+)
+
+(define_insn "neon_vbfma<bt>_lanev8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+        (plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
+                    (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+                                  (match_operand:V4BF 3 "register_operand" "x")
+                                  (match_operand:SI 4 "const_int_operand" "n")]
+                     BF_MA)))]
+  "TARGET_BF16_SIMD"
+  "vfma<bt>.bf16\\t%q0, %q2, %P3[%c4]"
+  [(set_attr "type" "neon_fp_mla_s_scalar_q")]
+)
+
+(define_expand "neon_vbfma<bt>_laneqv8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+        (plus:V4SF (match_operand:V4SF 1 "register_operand" "0")
+                   (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+                                 (match_operand:V8BF 3 "register_operand" "x")
+                                 (match_operand:SI 4 "const_int_operand" "n")]
+                    BF_MA)))]
+  "TARGET_BF16_SIMD"
+  {
+    int lane = INTVAL (operands[4]);
+    gcc_assert (lane >= 0 && lane <= 7);
+    /* The lane pattern only accepts a V4BF operand 3, so always narrow the
+       V8BF input to the 64-bit half holding the lane and rebase the index.  */
+    rtx half = gen_reg_rtx (V4BFmode);
+    if (lane >= 4)
+      emit_insn (gen_neon_vget_highv8bf (half, operands[3]));
+    else
+      emit_move_insn (half, simplify_gen_subreg (V4BFmode, operands[3],
+						 V8BFmode, 0));
+    operands[4] = GEN_INT (lane % 4);
+    emit_insn (gen_neon_vbfma<bt>_lanev8bf (operands[0], operands[1],
+					    operands[2], half, operands[4]));
+    DONE;
+  }
+  [(set_attr "type" "neon_fp_mla_s_scalar_q")]
+)
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index 8f4a705f43efdb6baf03b39cee589cf728620687..97f08abec0a089b5cd95840da12ae22f7c960b28 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -501,4 +501,7 @@
   UNSPEC_VCMLA90
   UNSPEC_VCMLA180
   UNSPEC_VCMLA270
+  UNSPEC_BFMMLA
+  UNSPEC_BFMAB
+  UNSPEC_BFMAT
 ])
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..855e86b91fc69904f488dea1b277de6cc4ecba7e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c
@@ -0,0 +1,79 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps" }  */
+/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
+
+#include "arm_neon.h"
+
+/*
+**test_vbfmlalbq_f32:
+**      ...
+**      vfmab.bf16\tq0, q1, q2
+**      bx\tlr
+*/
+float32x4_t
+test_vbfmlalbq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlalbq_f32 (r, a, b);
+}
+
+/*
+**test_vbfmlaltq_f32:
+**      ...
+**      vfmat.bf16\tq0, q1, q2
+**      bx\tlr
+*/
+float32x4_t
+test_vbfmlaltq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlaltq_f32 (r, a, b);
+}
+
+/*
+**test_vbfmlalbq_lane_f32:
+**      ...
+**      vfmab.bf16\tq0, q1, d4[0]
+**      bx\tlr
+*/
+float32x4_t
+test_vbfmlalbq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return vbfmlalbq_lane_f32 (r, a, b, 0);
+}
+
+/*
+**test_vbfmlaltq_lane_f32:
+**      ...
+**      vfmat.bf16\tq0, q1, d4[2]
+**      bx\tlr
+*/
+float32x4_t
+test_vbfmlaltq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return vbfmlaltq_lane_f32 (r, a, b, 2);
+}
+
+/*
+**test_vbfmlalbq_laneq_f32:
+**      ...
+**      vfmab.bf16\tq0, q1, d5[1]
+**      bx\tlr
+*/
+float32x4_t
+test_vbfmlalbq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlalbq_laneq_f32 (r, a, b, 5);
+}
+
+/*
+**test_vbfmlaltq_laneq_f32:
+**      ...
+**      vfmat.bf16\tq0, q1, d5[3]
+**      bx\tlr
+*/
+float32x4_t
+test_vbfmlaltq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlaltq_laneq_f32 (r, a, b, 7);
+}
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..226ed7e1d8e4747d73b0518c809aaf0e3c5bc78d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c
@@ -0,0 +1,31 @@
+/* { dg-do compile { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include "arm_neon.h"
+
+/* Test lane index limits for vbfmlalbq_lane_f32  */
+float32x4_t
+test_vbfmlalbq_lane_f32_low (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return __builtin_neon_vbfmab_lanev8bf (r, a, b, -1); /* { dg-error {lane -1 out of range 0 - 3} } */
+}
+
+float32x4_t
+test_vbfmlalbq_lane_f32_high (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return __builtin_neon_vbfmab_lanev8bf (r, a, b, 4); /* { dg-error {lane 4 out of range 0 - 3} } */
+}
+
+/* Test lane index limits for vbfmlaltq_lane_f32  */
+float32x4_t
+test_vbfmlaltq_lane_f32_low (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return __builtin_neon_vbfmat_lanev8bf (r, a, b, -1); /* { dg-error {lane -1 out of range 0 - 3} } */
+}
+
+float32x4_t
+test_vbfmlaltq_lane_f32_high (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return __builtin_neon_vbfmat_lanev8bf (r, a, b, 4); /* { dg-error {lane 4 out of range 0 - 3} } */
+}
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..9370e4d945a353ac7329929d27920b3a0aa08281
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c
@@ -0,0 +1,19 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps" } */
+/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
+
+#include <arm_neon.h>
+
+/*
+**test_vbfmmlaq_f32:
+**        ...
+**        vmmla.bf16\tq0, q1, q2
+**        bx\tlr
+*/
+float32x4_t
+test_vbfmmlaq_f32 (float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+  return vbfmmlaq_f32 (r, x, y);
+}

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla and vfma<b/t> for AArch32 AdvSIMD
  2020-01-31 16:21       ` Delia Burduv
@ 2020-02-19 17:23         ` Delia Burduv
  2020-02-21 11:41           ` Kyrill Tkachov
  0 siblings, 1 reply; 10+ messages in thread
From: Delia Burduv @ 2020-02-19 17:23 UTC (permalink / raw)
  To: Kyrill Tkachov, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 4863 bytes --]

Hi,

Here is the latest version of the patch. It just has some minor 
formatting changes that were brought up by Richard Sandiford in the 
AArch64 patches.

Thanks,
Delia

On 1/31/20 3:23 PM, Delia Burduv wrote:
> Here is the updated patch. The changes are minor, so let me know if 
> there is anything else to fix or if it can be committed.
> 
> Thank you,
> Delia
> 
> On 1/30/20 2:55 PM, Kyrill Tkachov wrote:
>> Hi Delia,
>>
>>
>> On 1/28/20 4:44 PM, Delia Burduv wrote:
>>> Ping.
>>> ------------------------------------------------------------------------
>>> *From:* Delia Burduv <delia.burduv@arm.com>
>>> *Sent:* 22 January 2020 17:26
>>> *To:* gcc-patches@gcc.gnu.org <gcc-patches@gcc.gnu.org>
>>> *Cc:* nickc@redhat.com <nickc@redhat.com>; Richard Earnshaw 
>>> <Richard.Earnshaw@arm.com>; Ramana Radhakrishnan 
>>> <Ramana.Radhakrishnan@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
>>> *Subject:* Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla 
>>> and vfma<b/t> for AArch32 AdvSIMD
>>> Ping.
>>>
>>> I have read Richard Sandiford's comments on the AArch64 patches and I
>>> will apply what is relevant to this patch as well. Particularly, I will
>>> change the tests to use the exact input and output registers and I will
>>> change the types of the rtl patterns.
>>
>>
>> Please send the updated patches so that someone can commit them for 
>> you once they're reviewed.
>>
>> Thanks,
>>
>> Kyrill
>>
>>
>>>
>>> On 12/20/19 6:44 PM, Delia Burduv wrote:
>>> > This patch adds the ARMv8.6 ACLE intrinsics for vmmla, vfmab and vfmat
>>> > as part of the BFloat16 extension.
>>> > (https://developer.arm.com/docs/101028/latest.)
>>> > The intrinsics are declared in arm_neon.h and the RTL patterns are
>>> > defined in neon.md.
>>> > Two new tests are added to check assembler output and lane indices.
>>> >
>>> > This patch depends on the Arm back-end patch.
>>> > (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)
>>> >
>>> > Tested for regression on arm-none-eabi and armeb-none-eabi. I don't 
>>> have
>>> > commit rights, so if this is ok can someone please commit it for me?
>>> >
>>> > gcc/ChangeLog:
>>> >
>>> > 2019-11-12  Delia Burduv <delia.burduv@arm.com>
>>> >
>>> >      * config/arm/arm_neon.h (vbfmmlaq_f32): New.
>>> >        (vbfmlalbq_f32): New.
>>> >        (vbfmlaltq_f32): New.
>>> >        (vbfmlalbq_lane_f32): New.
>>> >        (vbfmlaltq_lane_f32): New.
>>> >          (vbfmlalbq_laneq_f32): New.
>>> >        (vbfmlaltq_laneq_f32): New.
>>> >      * config/arm/arm_neon_builtins.def (vbfmmla): New.
>>> >            (vbfmab): New.
>>> >            (vbfmat): New.
>>> >            (vbfmab_lane): New.
>>> >            (vbfmat_lane): New.
>>> >            (vbfmab_laneq): New.
>>> >            (vbfmat_laneq): New.
>>> >       * config/arm/iterators.md (BF_MA): New int iterator.
>>> >            (bt): New int attribute.
>>> >            (VQXBF): Copy of VQX with V8BF.
>>> >            (V_HALF): Added V8BF.
>>> >        * config/arm/neon.md (neon_vbfmmlav8hi): New insn.
>>> >            (neon_vbfma<bt>v8hi): New insn.
>>> >            (neon_vbfma<bt>_lanev8hi): New insn.
>>> >            (neon_vbfma<bt>_laneqv8hi): New expand.
>>> >            (neon_vget_high<mode>): Changed iterator to VQXBF.
>>> >      * config/arm/unspecs.md (UNSPEC_BFMMLA): New UNSPEC.
>>> >            (UNSPEC_BFMAB): New UNSPEC.
>>> >            (UNSPEC_BFMAT): New UNSPEC.
>>> >
>>> > 2019-11-12  Delia Burduv <delia.burduv@arm.com>
>>> >
>>> >          * gcc.target/arm/simd/bf16_ma_1.c: New test.
>>> >          * gcc.target/arm/simd/bf16_ma_2.c: New test.
>>> >          * gcc.target/arm/simd/bf16_mmla_1.c: New test.

[-- Attachment #2: rb12263.patch --]
[-- Type: text/x-patch, Size: 12395 bytes --]

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 3c78f435009ab027f92693d00ab5b40960d5419d..81f8008ea6a5fb11eb09f6685ba24bb0c54fb248 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -18742,6 +18742,64 @@ vcmlaq_rot270_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b,
   return __builtin_neon_vcmla_lane270v4sf (__r, __a, __b, __index);
 }
 
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmmlaq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfmmlav8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfmabv8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfmatv8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+		   const int __index)
+{
+  return __builtin_neon_vbfmab_lanev8bf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+		   const int __index)
+{
+  return __builtin_neon_vbfmat_lanev8bf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+		    const int __index)
+{
+  return __builtin_neon_vbfmab_laneqv8bf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+		    const int __index)
+{
+  return __builtin_neon_vbfmat_laneqv8bf (__r, __a, __b, __index);
+}
+
+#pragma GCC pop_options
+
 #pragma GCC pop_options
 #endif
 
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def
index e9ff4e501cbb5d16b9211f5bc96db376ddf21afc..cc06783daf393f7166fd922f86b3db79c02ba188 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -373,3 +373,12 @@ VAR2 (MAC_LANE_PAIR, vcmlaq_lane0, v4sf, v8hf)
 VAR2 (MAC_LANE_PAIR, vcmlaq_lane90, v4sf, v8hf)
 VAR2 (MAC_LANE_PAIR, vcmlaq_lane180, v4sf, v8hf)
 VAR2 (MAC_LANE_PAIR, vcmlaq_lane270, v4sf, v8hf)
+
+VAR1 (TERNOP, vbfmmla, v8bf)
+
+VAR1 (TERNOP, vbfmab, v8bf)
+VAR1 (TERNOP, vbfmat, v8bf)
+VAR1 (MAC_LANE, vbfmab_lane, v8bf)
+VAR1 (MAC_LANE, vbfmat_lane, v8bf)
+VAR1 (MAC_LANE, vbfmab_laneq, v8bf)
+VAR1 (MAC_LANE, vbfmat_laneq, v8bf)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 33e29509f00a89fa23d0546687c0e4643f0b32d2..72b8ce0bb26dcd520603b907b4f86a74d0164332 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -106,6 +106,9 @@
 ;; Quad-width vector modes plus 64-bit elements.
 (define_mode_iterator VQX [V16QI V8HI V8HF V4SI V4SF V2DI])
 
+;; Quad-width vector modes plus 64-bit elements and V8BF.
+(define_mode_iterator VQXBF [V16QI V8HI V8HF (V8BF "TARGET_BF16_SIMD") V4SI V4SF V2DI])
+
 ;; Quad-width vector modes without floating-point elements.
 (define_mode_iterator VQI [V16QI V8HI V4SI])
 
@@ -485,6 +488,8 @@
 (define_int_iterator VCADD [UNSPEC_VCADD90 UNSPEC_VCADD270])
 (define_int_iterator VCMLA [UNSPEC_VCMLA UNSPEC_VCMLA90 UNSPEC_VCMLA180 UNSPEC_VCMLA270])
 
+(define_int_iterator BF_MA [UNSPEC_BFMAB UNSPEC_BFMAT])
+
 ;;----------------------------------------------------------------------------
 ;; Mode attributes
 ;;----------------------------------------------------------------------------
@@ -609,7 +614,8 @@
 (define_mode_attr V_HALF [(V16QI "V8QI") (V8HI "V4HI")
 			  (V8HF "V4HF") (V4SI  "V2SI")
 			  (V4SF "V2SF") (V2DF "DF")
-			  (V2DI "DI") (V4HF "HF")])
+			  (V2DI "DI") (V4HF "HF")
+			  (V8BF "V4BF")])
 
 ;; Same, but lower-case.
 (define_mode_attr V_half [(V16QI "v8qi") (V8HI "v4hi")
@@ -1171,4 +1177,7 @@
 (define_int_attr opsuffix [(UNSPEC_DOT_S "s8")
 			   (UNSPEC_DOT_U "u8")])
 
+;; An iterator for VFMA<bt>
+(define_int_attr bt [(UNSPEC_BFMAB "b") (UNSPEC_BFMAT "t")])
+
 (define_int_attr smlaw_op [(UNSPEC_SMLAWB "smlawb") (UNSPEC_SMLAWT "smlawt")])
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 6087ca6f2badde6a492bb515a2cb5846f3d4ad8e..4e0d0b5c317a81839de9dee581c5e351d3193dfa 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -3875,7 +3875,7 @@ if (BYTES_BIG_ENDIAN)
 
 (define_expand "neon_vget_high<mode>"
   [(match_operand:<V_HALF> 0 "s_register_operand")
-   (match_operand:VQX 1 "s_register_operand")]
+   (match_operand:VQXBF 1 "s_register_operand")]
   "TARGET_NEON"
 {
   emit_move_insn (operands[0],
@@ -6552,3 +6552,64 @@ if (BYTES_BIG_ENDIAN)
  "vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
  [(set_attr "type" "neon_fp_abd_s<q>")]
 )
+
+(define_insn "neon_vbfmmlav8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+        (plus:V4SF (match_operand:V4SF 1 "register_operand" "0")
+                   (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+                                 (match_operand:V8BF 3 "register_operand" "w")]
+                    UNSPEC_BFMMLA)))]
+  "TARGET_BF16_SIMD"
+  "vmmla.bf16\\t%q0, %q2, %q3"
+  [(set_attr "type" "neon_fp_mla_s_q")]
+)
+
+(define_insn "neon_vbfma<bt>v8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+        (plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
+                    (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+                                  (match_operand:V8BF 3 "register_operand" "w")]
+                     BF_MA)))]
+  "TARGET_BF16_SIMD"
+  "vfma<bt>.bf16\\t%q0, %q2, %q3"
+  [(set_attr "type" "neon_fp_mla_s_q")]
+)
+
+(define_insn "neon_vbfma<bt>_lanev8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+        (plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
+                    (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+                                  (match_operand:V4BF 3 "register_operand" "x")
+                                  (match_operand:SI 4 "const_int_operand" "n")]
+                     BF_MA)))]
+  "TARGET_BF16_SIMD"
+  "vfma<bt>.bf16\\t%q0, %q2, %P3[%c4]"
+  [(set_attr "type" "neon_fp_mla_s_scalar_q")]
+)
+
+(define_expand "neon_vbfma<bt>_laneqv8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+        (plus:V4SF (match_operand:V4SF 1 "register_operand" "0")
+                   (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+                                 (match_operand:V8BF 3 "register_operand" "x")
+                                 (match_operand:SI 4 "const_int_operand" "n")]
+                    BF_MA)))]
+  "TARGET_BF16_SIMD"
+  {
+    int lane = INTVAL (operands[4]);
+    gcc_assert (lane >= 0 && lane <= 7);
+    /* The lane pattern only accepts a V4BF operand 3, so always narrow the
+       V8BF input to the 64-bit half holding the lane and rebase the index.  */
+    rtx half = gen_reg_rtx (V4BFmode);
+    if (lane >= 4)
+      emit_insn (gen_neon_vget_highv8bf (half, operands[3]));
+    else
+      emit_move_insn (half, simplify_gen_subreg (V4BFmode, operands[3],
+						 V8BFmode, 0));
+    operands[4] = GEN_INT (lane % 4);
+    emit_insn (gen_neon_vbfma<bt>_lanev8bf (operands[0], operands[1],
+					    operands[2], half, operands[4]));
+    DONE;
+  }
+  [(set_attr "type" "neon_fp_mla_s_scalar_q")]
+)
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index 8f4a705f43efdb6baf03b39cee589cf728620687..97f08abec0a089b5cd95840da12ae22f7c960b28 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -501,4 +501,7 @@
   UNSPEC_VCMLA90
   UNSPEC_VCMLA180
   UNSPEC_VCMLA270
+  UNSPEC_BFMMLA
+  UNSPEC_BFMAB
+  UNSPEC_BFMAT
 ])
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..7602db9597a955b2a303f2dc55b9ff80f81b3b6f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c
@@ -0,0 +1,79 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps" }  */
+/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
+
+#include "arm_neon.h"
+
+/*
+**test_vbfmlalbq_f32:
+**      ...
+**      vfmab.bf16	q0, q1, q2
+**      bx	lr
+*/
+float32x4_t
+test_vbfmlalbq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlalbq_f32 (r, a, b);
+}
+
+/*
+**test_vbfmlaltq_f32:
+**      ...
+**      vfmat.bf16	q0, q1, q2
+**      bx	lr
+*/
+float32x4_t
+test_vbfmlaltq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlaltq_f32 (r, a, b);
+}
+
+/*
+**test_vbfmlalbq_lane_f32:
+**      ...
+**      vfmab.bf16	q0, q1, d4[0]
+**      bx	lr
+*/
+float32x4_t
+test_vbfmlalbq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return vbfmlalbq_lane_f32 (r, a, b, 0);
+}
+
+/*
+**test_vbfmlaltq_lane_f32:
+**      ...
+**      vfmat.bf16	q0, q1, d4[2]
+**      bx	lr
+*/
+float32x4_t
+test_vbfmlaltq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return vbfmlaltq_lane_f32 (r, a, b, 2);
+}
+
+/*
+**test_vbfmlalbq_laneq_f32:
+**      ...
+**      vfmab.bf16	q0, q1, d5[1]
+**      bx	lr
+*/
+float32x4_t
+test_vbfmlalbq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlalbq_laneq_f32 (r, a, b, 5);
+}
+
+/*
+**test_vbfmlaltq_laneq_f32:
+**      ...
+**      vfmat.bf16	q0, q1, d5[3]
+**      bx	lr
+*/
+float32x4_t
+test_vbfmlaltq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlaltq_laneq_f32 (r, a, b, 7);
+}
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..226ed7e1d8e4747d73b0518c809aaf0e3c5bc78d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c
@@ -0,0 +1,31 @@
+/* { dg-do compile { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include "arm_neon.h"
+
+/* Test lane index limits for vbfmlalbq_lane_f32  */
+float32x4_t
+test_vbfmlalbq_lane_f32_low (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return __builtin_neon_vbfmab_lanev8bf (r, a, b, -1); /* { dg-error {lane -1 out of range 0 - 3} } */
+}
+
+float32x4_t
+test_vbfmlalbq_lane_f32_high (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return __builtin_neon_vbfmab_lanev8bf (r, a, b, 4); /* { dg-error {lane 4 out of range 0 - 3} } */
+}
+
+/* Test lane index limits for vbfmlaltq_lane_f32  */
+float32x4_t
+test_vbfmlaltq_lane_f32_low (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return __builtin_neon_vbfmat_lanev8bf (r, a, b, -1); /* { dg-error {lane -1 out of range 0 - 3} } */
+}
+
+float32x4_t
+test_vbfmlaltq_lane_f32_high (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return __builtin_neon_vbfmat_lanev8bf (r, a, b, 4); /* { dg-error {lane 4 out of range 0 - 3} } */
+}
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..d8118a7111a359464f1508e92ac6183ea1f4eeed
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c
@@ -0,0 +1,19 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps" } */
+/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
+
+#include <arm_neon.h>
+
+/*
+**test_vbfmmlaq_f32:
+**        ...
+**        vmmla.bf16	q0, q1, q2
+**        bx	lr
+*/
+float32x4_t
+test_vbfmmlaq_f32 (float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+  return vbfmmlaq_f32 (r, x, y);
+}

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla and vfma<b/t> for AArch32 AdvSIMD
  2020-02-19 17:23         ` Delia Burduv
@ 2020-02-21 11:41           ` Kyrill Tkachov
  2020-03-04 17:21             ` Delia Burduv
  0 siblings, 1 reply; 10+ messages in thread
From: Kyrill Tkachov @ 2020-02-21 11:41 UTC (permalink / raw)
  To: Delia Burduv, gcc-patches

Hi Delia,

On 2/19/20 5:23 PM, Delia Burduv wrote:
> Hi,
>
> Here is the latest version of the patch. It just has some minor 
> formatting changes that were brought up by Richard Sandiford in the 
> AArch64 patches
>
> Thanks,
> Delia
>
> On 1/31/20 3:23 PM, Delia Burduv wrote:
>> Here is the updated patch. The changes are minor, so let me know if 
>> there is anything else to fix or if it can be committed.
>>
>> Thank you,
>> Delia
>>
>> On 1/30/20 2:55 PM, Kyrill Tkachov wrote:
>>> Hi Delia,
>>>
>>>
>>> On 1/28/20 4:44 PM, Delia Burduv wrote:
>>>> Ping.
>>>> ------------------------------------------------------------------------ 
>>>>
>>>> *From:* Delia Burduv <delia.burduv@arm.com>
>>>> *Sent:* 22 January 2020 17:26
>>>> *To:* gcc-patches@gcc.gnu.org <gcc-patches@gcc.gnu.org>
>>>> *Cc:* nickc@redhat.com <nickc@redhat.com>; Richard Earnshaw 
>>>> <Richard.Earnshaw@arm.com>; Ramana Radhakrishnan 
>>>> <Ramana.Radhakrishnan@arm.com>; Kyrylo Tkachov 
>>>> <Kyrylo.Tkachov@arm.com>
>>>> *Subject:* Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla 
>>>> and vfma<b/t> for AArch32 AdvSIMD
>>>> Ping.
>>>>
>>>> I have read Richard Sandiford's comments on the AArch64 patches and I
>>>> will apply what is relevant to this patch as well. Particularly, I 
>>>> will
>>>> change the tests to use the exact input and output registers and I 
>>>> will
>>>> change the types of the rtl patterns.
>>>
>>>
>>> Please send the updated patches so that someone can commit them for 
>>> you once they're reviewed.
>>>
>>> Thanks,
>>>
>>> Kyrill
>>>
>>>
>>>>
>>>> On 12/20/19 6:44 PM, Delia Burduv wrote:
>>>> > This patch adds the ARMv8.6 ACLE intrinsics for vmmla, vfmab and 
>>>> vfmat
>>>> > as part of the BFloat16 extension.
>>>> > (https://developer.arm.com/docs/101028/latest.)
>>>> > The intrinsics are declared in arm_neon.h and the RTL patterns are
>>>> > defined in neon.md.
>>>> > Two new tests are added to check assembler output and lane indices.
>>>> >
>>>> > This patch depends on the Arm back-end patch.
>>>> > (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)
>>>> >
>>>> > Tested for regression on arm-none-eabi and armeb-none-eabi. I 
>>>> don't have
>>>> > commit rights, so if this is ok can someone please commit it for me?
>>>> >
>>>> > gcc/ChangeLog:
>>>> >
>>>> > 2019-11-12  Delia Burduv <delia.burduv@arm.com>
>>>> >
>>>> >      * config/arm/arm_neon.h (vbfmmlaq_f32): New.
>>>> >        (vbfmlalbq_f32): New.
>>>> >        (vbfmlaltq_f32): New.
>>>> >        (vbfmlalbq_lane_f32): New.
>>>> >        (vbfmlaltq_lane_f32): New.
>>>> >          (vbfmlalbq_laneq_f32): New.
>>>> >        (vbfmlaltq_laneq_f32): New.
>>>> >      * config/arm/arm_neon_builtins.def (vbfmmla): New.
>>>> >            (vbfmab): New.
>>>> >            (vbfmat): New.
>>>> >            (vbfmab_lane): New.
>>>> >            (vbfmat_lane): New.
>>>> >            (vbfmab_laneq): New.
>>>> >            (vbfmat_laneq): New.
>>>> >       * config/arm/iterators.md (BF_MA): New int iterator.
>>>> >            (bt): New int attribute.
>>>> >            (VQXBF): Copy of VQX with V8BF.
>>>> >            (V_HALF): Added V8BF.
>>>> >        * config/arm/neon.md (neon_vbfmmlav8hi): New 
>>>> insn.
>>>> >            (neon_vbfma<bt>v8hi): New insn.
>>>> >            (neon_vbfma<bt>_lanev8hi): New insn.
>>>> >            (neon_vbfma<bt>_laneqv8hi): New 
>>>> expand.
>>>> >            (neon_vget_high<mode>): Changed 
>>>> iterator to VQXBF.
>>>> >      * config/arm/unspecs.md (UNSPEC_BFMMLA): New UNSPEC.
>>>> >            (UNSPEC_BFMAB): New UNSPEC.
>>>> >            (UNSPEC_BFMAT): New UNSPEC.
>>>> >
>>>> > 2019-11-12  Delia Burduv <delia.burduv@arm.com>
>>>> >
>>>> >          * gcc.target/arm/simd/bf16_ma_1.c: New 
>>>> test.
>>>> >          * gcc.target/arm/simd/bf16_ma_2.c: New 
>>>> test.
>>>> >          * gcc.target/arm/simd/bf16_mmla_1.c: New 
>>>> test.

This looks good, a few minor things though...


diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 3c78f435009ab027f92693d00ab5b40960d5419d..81f8008ea6a5fb11eb09f6685ba24bb0c54fb248 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -18742,6 +18742,64 @@ vcmlaq_rot270_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b,
    return __builtin_neon_vcmla_lane270v4sf (__r, __a, __b, __index);
  }
  
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmmlaq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfmmlav8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfmabv8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfmatv8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+		   const int __index)
+{
+  return __builtin_neon_vbfmab_lanev8bf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+		   const int __index)
+{
+  return __builtin_neon_vbfmat_lanev8bf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+		    const int __index)
+{
+  return __builtin_neon_vbfmab_laneqv8bf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+		    const int __index)
+{
+  return __builtin_neon_vbfmat_laneqv8bf (__r, __a, __b, __index);
+}
+
+#pragma GCC pop_options
+
  #pragma GCC pop_options
  #endif
  
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def
index e9ff4e501cbb5d16b9211f5bc96db376ddf21afc..cc06783daf393f7166fd922f86b3db79c02ba188 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -373,3 +373,12 @@ VAR2 (MAC_LANE_PAIR, vcmlaq_lane0, v4sf, v8hf)
  VAR2 (MAC_LANE_PAIR, vcmlaq_lane90, v4sf, v8hf)
  VAR2 (MAC_LANE_PAIR, vcmlaq_lane180, v4sf, v8hf)
  VAR2 (MAC_LANE_PAIR, vcmlaq_lane270, v4sf, v8hf)
+
+VAR1 (TERNOP, vbfmmla, v8bf)
+
+VAR1 (TERNOP, vbfmab, v8bf)
+VAR1 (TERNOP, vbfmat, v8bf)
+VAR1 (MAC_LANE, vbfmab_lane, v8bf)
+VAR1 (MAC_LANE, vbfmat_lane, v8bf)
+VAR1 (MAC_LANE, vbfmab_laneq, v8bf)
+VAR1 (MAC_LANE, vbfmat_laneq, v8bf)

The instructions produced from these intrinsics have the form vmmla, vfmab, vfmat. Let's use those names here rather than the "vbf*" ones to avoid confusion in the future.

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 33e29509f00a89fa23d0546687c0e4643f0b32d2..72b8ce0bb26dcd520603b907b4f86a74d0164332 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -106,6 +106,9 @@
  ;; Quad-width vector modes plus 64-bit elements.
  (define_mode_iterator VQX [V16QI V8HI V8HF V4SI V4SF V2DI])
  
+;; Quad-width vector modes plus 64-bit elements and V8BF.
+(define_mode_iterator VQXBF [V16QI V8HI V8HF (V8BF "TARGET_BF16_SIMD") V4SI V4SF V2DI])
+
  ;; Quad-width vector modes without floating-point elements.
  (define_mode_iterator VQI [V16QI V8HI V4SI])
  
@@ -485,6 +488,8 @@
  (define_int_iterator VCADD [UNSPEC_VCADD90 UNSPEC_VCADD270])
  (define_int_iterator VCMLA [UNSPEC_VCMLA UNSPEC_VCMLA90 UNSPEC_VCMLA180 UNSPEC_VCMLA270])
  
+(define_int_iterator BF_MA [UNSPEC_BFMAB UNSPEC_BFMAT])
+
  ;;----------------------------------------------------------------------------
  ;; Mode attributes
  ;;----------------------------------------------------------------------------
@@ -609,7 +614,8 @@
  (define_mode_attr V_HALF [(V16QI "V8QI") (V8HI "V4HI")
  			  (V8HF "V4HF") (V4SI  "V2SI")
  			  (V4SF "V2SF") (V2DF "DF")
-			  (V2DI "DI") (V4HF "HF")])
+			  (V2DI "DI") (V4HF "HF")
+			  (V8BF "V4BF")])
  
  ;; Same, but lower-case.
  (define_mode_attr V_half [(V16QI "v8qi") (V8HI "v4hi")
@@ -1171,4 +1177,7 @@
  (define_int_attr opsuffix [(UNSPEC_DOT_S "s8")
  			   (UNSPEC_DOT_U "u8")])
  
+;; An iterator for VFMA<bt>
+(define_int_attr bt [(UNSPEC_BFMAB "b") (UNSPEC_BFMAT "t")])
+
  (define_int_attr smlaw_op [(UNSPEC_SMLAWB "smlawb") (UNSPEC_SMLAWT "smlawt")])
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 6087ca6f2badde6a492bb515a2cb5846f3d4ad8e..4e0d0b5c317a81839de9dee581c5e351d3193dfa 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -3875,7 +3875,7 @@ if (BYTES_BIG_ENDIAN)
  
  (define_expand "neon_vget_high<mode>"
    [(match_operand:<V_HALF> 0 "s_register_operand")
-   (match_operand:VQX 1 "s_register_operand")]
+   (match_operand:VQXBF 1 "s_register_operand")]
    "TARGET_NEON"
  {
    emit_move_insn (operands[0],
@@ -6552,3 +6552,64 @@ if (BYTES_BIG_ENDIAN)
   "vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
   [(set_attr "type" "neon_fp_abd_s<q>")]
  )
+
+(define_insn "neon_vbfmmlav8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+        (plus:V4SF (match_operand:V4SF 1 "register_operand" "0")
+                   (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+                                 (match_operand:V8BF 3 "register_operand" "w")]
+                    UNSPEC_BFMMLA)))]
+  "TARGET_BF16_SIMD"
+  "vmmla.bf16\\t%q0, %q2, %q3"
+  [(set_attr "type" "neon_fp_mla_s_q")]
+)
+
+(define_insn "neon_vbfma<bt>v8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+        (plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
+                    (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+                                  (match_operand:V8BF 3 "register_operand" "w")]
+                     BF_MA)))]
+  "TARGET_BF16_SIMD"
+  "vfma<bt>.bf16\\t%q0, %q2, %q3"
+  [(set_attr "type" "neon_fp_mla_s_q")]
+)
+
+(define_insn "neon_vbfma<bt>_lanev8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+        (plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
+                    (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+                                  (match_operand:V4BF 3 "register_operand" "x")
+                                  (match_operand:SI 4 "const_int_operand" "n")]
+                     BF_MA)))]
+  "TARGET_BF16_SIMD"
+  "vfma<bt>.bf16\\t%q0, %q2, %P3[%c4]"
+  [(set_attr "type" "neon_fp_mla_s_scalar_q")]
+)
+
+(define_expand "neon_vbfma<bt>_laneqv8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+        (plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
+                    (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+                                  (match_operand:V8BF 3 "register_operand" "x")
+                                  (match_operand:SI 4 "const_int_operand" "n")]
+                     BF_MA)))]
+  "TARGET_BF16_SIMD"
+  {
+    int lane = INTVAL (operands[4]);
+    gcc_assert (lane >=0 && lane <=7);

Let's use the IN_RANGE macro to assert this.

+    if (lane < 4)
+    {
+	emit_insn (gen_neon_vbfma<bt>_lanev8bf (operands[0], operands[1], operands[2], operands[3], operands[4]));
+    }
+    else
+      {
+	rtx op_highpart = gen_reg_rtx (V4BFmode);
+	emit_insn (gen_neon_vget_highv8bf (op_highpart, operands[3]));
+	operands[4] = GEN_INT (lane - 4);
+	emit_insn (gen_neon_vbfma<bt>_lanev8bf (operands[0], operands[1], operands[2], op_highpart, operands[4]));
+      }
+    DONE;
+  }
+  [(set_attr "type" "neon_fp_mla_s_scalar_q")]
+)
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index 8f4a705f43efdb6baf03b39cee589cf728620687..97f08abec0a089b5cd95840da12ae22f7c960b28 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -501,4 +501,7 @@
    UNSPEC_VCMLA90
    UNSPEC_VCMLA180
    UNSPEC_VCMLA270
+  UNSPEC_BFMMLA
+  UNSPEC_BFMAB
+  UNSPEC_BFMAT
  ])
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..7602db9597a955b2a303f2dc55b9ff80f81b3b6f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c
@@ -0,0 +1,79 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps" }  */
+/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
+
+#include "arm_neon.h"
+
+/*
+**test_vbfmlalbq_f32:
+**      ...
+**      vfmab.bf16	q0, q1, q2
+**      bx	lr
+*/
+float32x4_t
+test_vbfmlalbq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlalbq_f32 (r, a, b);
+}
+
+/*
+**test_vbfmlaltq_f32:
+**      ...
+**      vfmat.bf16	q0, q1, q2
+**      bx	lr
+*/
+float32x4_t
+test_vbfmlaltq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlaltq_f32 (r, a, b);
+}
+
+/*
+**test_vbfmlalbq_lane_f32:
+**      ...
+**      vfmab.bf16	q0, q1, d4[0]
+**      bx	lr
+*/
+float32x4_t
+test_vbfmlalbq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return vbfmlalbq_lane_f32 (r, a, b, 0);
+}
+
+/*
+**test_vbfmlaltq_lane_f32:
+**      ...
+**      vfmat.bf16	q0, q1, d4[2]
+**      bx	lr
+*/
+float32x4_t
+test_vbfmlaltq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return vbfmlaltq_lane_f32 (r, a, b, 2);
+}
+
+/*
+**test_vbfmlalbq_laneq_f32:
+**      ...
+**      vfmab.bf16	q0, q1, d5[1]
+**      bx	lr
+*/
+float32x4_t
+test_vbfmlalbq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlalbq_laneq_f32 (r, a, b, 5);
+}
+
+/*
+**test_vbfmlaltq_laneq_f32:
+**      ...
+**      vfmat.bf16	q0, q1, d5[3]
+**      bx	lr
+*/
+float32x4_t
+test_vbfmlaltq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlaltq_laneq_f32 (r, a, b, 7);
+}
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..226ed7e1d8e4747d73b0518c809aaf0e3c5bc78d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c
@@ -0,0 +1,31 @@
+/* { dg-do compile { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include "arm_neon.h"
+
+/* Test lane index limits for vbfmlalbq_lane_f32  */
+float32x4_t
+test_vbfmlalbq_lane_f32_low (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return __builtin_neon_vbfmab_lanev8bf (r, a, b, -1); /* { dg-error {lane -1 out of range 0 - 3} } */
+}
+
+float32x4_t
+test_vbfmlalbq_lane_f32_high (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return __builtin_neon_vbfmab_lanev8bf (r, a, b, 4); /* { dg-error {lane 4 out of range 0 - 3} } */
+}
+
+/* Test lane index limits for vbfmlaltq_lane_f32  */
+float32x4_t
+test_vbfmlaltq_lane_f32_low (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return __builtin_neon_vbfmat_lanev8bf (r, a, b, -1); /* { dg-error {lane -1 out of range 0 - 3} } */
+}
+
+float32x4_t
+test_vbfmlaltq_lane_f32_high (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return __builtin_neon_vbfmat_lanev8bf (r, a, b, 4); /* { dg-error {lane 4 out of range 0 - 3} } */
+}


We want to be testing the ACLE intrinsics here rather than the __builtin_neon* builtins directly. The builtins are an implementation detail that the user should not rely on.

Ok with these changes.
Thanks,
Kyrill



diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..d8118a7111a359464f1508e92ac6183ea1f4eeed
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c
@@ -0,0 +1,18 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps" } */
+/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
+
+#include <arm_neon.h>
+
+/*test_vbfmmlaq_f32:
+**        ...
+**        vmmla.bf16	q0, q1, q2
+**        bx	lr
+*/
+float32x4_t
+test_vbfmmlaq_f32 (float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+  return vbfmmlaq_f32 (r, x, y);
+}

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla and vfma<b/t> for AArch32 AdvSIMD
  2020-02-21 11:41           ` Kyrill Tkachov
@ 2020-03-04 17:21             ` Delia Burduv
  2020-03-05 11:22               ` Kyrill Tkachov
  0 siblings, 1 reply; 10+ messages in thread
From: Delia Burduv @ 2020-03-04 17:21 UTC (permalink / raw)
  To: Kyrill Tkachov, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 23207 bytes --]

Hi,

This is the latest version of the patch.

Thanks,
Delia

On 2/21/20 11:41 AM, Kyrill Tkachov wrote:
> Hi Delia,
> 
> On 2/19/20 5:23 PM, Delia Burduv wrote:
>> Hi,
>>
>> Here is the latest version of the patch. It just has some minor 
>> formatting changes that were brought up by Richard Sandiford in the 
>> AArch64 patches
>>
>> Thanks,
>> Delia
>>
>> On 1/31/20 3:23 PM, Delia Burduv wrote:
>>> Here is the updated patch. The changes are minor, so let me know if 
>>> there is anything else to fix or if it can be committed.
>>>
>>> Thank you,
>>> Delia
>>>
>>> On 1/30/20 2:55 PM, Kyrill Tkachov wrote:
>>>> Hi Delia,
>>>>
>>>>
>>>> On 1/28/20 4:44 PM, Delia Burduv wrote:
>>>>> Ping.
>>>>> ------------------------------------------------------------------------ 
>>>>>
>>>>> *From:* Delia Burduv <delia.burduv@arm.com>
>>>>> *Sent:* 22 January 2020 17:26
>>>>> *To:* gcc-patches@gcc.gnu.org <gcc-patches@gcc.gnu.org>
>>>>> *Cc:* nickc@redhat.com <nickc@redhat.com>; Richard Earnshaw 
>>>>> <Richard.Earnshaw@arm.com>; Ramana Radhakrishnan 
>>>>> <Ramana.Radhakrishnan@arm.com>; Kyrylo Tkachov 
>>>>> <Kyrylo.Tkachov@arm.com>
>>>>> *Subject:* Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla 
>>>>> and vfma<b/t> for AArch32 AdvSIMD
>>>>> Ping.
>>>>>
>>>>> I have read Richard Sandiford's comments on the AArch64 patches and I
>>>>> will apply what is relevant to this patch as well. Particularly, I 
>>>>> will
>>>>> change the tests to use the exact input and output registers and I 
>>>>> will
>>>>> change the types of the rtl patterns.
>>>>
>>>>
>>>> Please send the updated patches so that someone can commit them for 
>>>> you once they're reviewed.
>>>>
>>>> Thanks,
>>>>
>>>> Kyrill
>>>>
>>>>
>>>>>
>>>>> On 12/20/19 6:44 PM, Delia Burduv wrote:
>>>>> > This patch adds the ARMv8.6 ACLE intrinsics for vmmla, vfmab and 
>>>>> vfmat
>>>>> > as part of the BFloat16 extension.
>>>>> > (https://developer.arm.com/docs/101028/latest.)
>>>>> > The intrinsics are declared in arm_neon.h and the RTL patterns are
>>>>> > defined in neon.md.
>>>>> > Two new tests are added to check assembler output and lane indices.
>>>>> >
>>>>> > This patch depends on the Arm back-end patche.
>>>>> > (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)
>>>>> >
>>>>> > Tested for regression on arm-none-eabi and armeb-none-eabi. I 
>>>>> don't have
>>>>> > commit rights, so if this is ok can someone please commit it for me?
>>>>> >
>>>>> > gcc/ChangeLog:
>>>>> >
>>>>> > 2019-11-12  Delia Burduv <delia.burduv@arm.com>
>>>>> >
>>>>> >      * config/arm/arm_neon.h (vbfmmlaq_f32): New.
>>>>> >        (vbfmlalbq_f32): New.
>>>>> >        (vbfmlaltq_f32): New.
>>>>> >        (vbfmlalbq_lane_f32): New.
>>>>> >        (vbfmlaltq_lane_f32): New.
>>>>> >          (vbfmlalbq_laneq_f32): New.
>>>>> >        (vbfmlaltq_laneq_f32): New.
>>>>> >      * config/arm/arm_neon_builtins.def (vbfmmla): New.
>>>>> >            (vbfmab): New.
>>>>> >            (vbfmat): New.
>>>>> >            (vbfmab_lane): New.
>>>>> >            (vbfmat_lane): New.
>>>>> >            (vbfmab_laneq): New.
>>>>> >            (vbfmat_laneq): New.
>>>>> >       * config/arm/iterators.md (BF_MA): New int iterator.
>>>>> >            (bt): New int attribute.
>>>>> >            (VQXBF): Copy of VQX with V8BF.
>>>>> >            (V_HALF): Added V8BF.
>>>>> >        * config/arm/neon.md (neon_vbfmmlav8hi): New 
>>>>> insn.
>>>>> >            (neon_vbfma<bt>v8hi): New insn.
>>>>> >            (neon_vbfma<bt>_lanev8hi): New insn.
>>>>> >            (neon_vbfma<bt>_laneqv8hi): New 
>>>>> expand.
>>>>> >            (neon_vget_high<mode>): Changed 
>>>>> iterator to VQXBF.
>>>>> >      * config/arm/unspecs.md (UNSPEC_BFMMLA): New UNSPEC.
>>>>> >            (UNSPEC_BFMAB): New UNSPEC.
>>>>> >            (UNSPEC_BFMAT): New UNSPEC.
>>>>> >
>>>>> > 2019-11-12  Delia Burduv <delia.burduv@arm.com>
>>>>> >
>>>>> >          * gcc.target/arm/simd/bf16_ma_1.c: New 
>>>>> test.
>>>>> >          * gcc.target/arm/simd/bf16_ma_2.c: New 
>>>>> test.
>>>>> >          * gcc.target/arm/simd/bf16_mmla_1.c: New 
>>>>> test.
> 
> This looks good, a few minor things though...
> 
> 
> diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
> index 
> 3c78f435009ab027f92693d00ab5b40960d5419d..81f8008ea6a5fb11eb09f6685ba24bb0c54fb248 
> 100644
> --- a/gcc/config/arm/arm_neon.h
> +++ b/gcc/config/arm/arm_neon.h
> @@ -18742,6 +18742,64 @@ vcmlaq_rot270_laneq_f32 (float32x4_t __r, 
> float32x4_t __a, float32x4_t __b,
>     return __builtin_neon_vcmla_lane270v4sf (__r, __a, __b, __index);
>   }
> 
> +#pragma GCC push_options
> +#pragma GCC target ("arch=armv8.2-a+bf16")
> +
> +__extension__ extern __inline float32x4_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vbfmmlaq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
> +{
> +Ã¯Â¿Â½ return __builtin_neon_vbfmmlav8bf (__r, __a, __b);
> +}
> +
> +__extension__ extern __inline float32x4_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vbfmlalbq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
> +{
> +Ã¯Â¿Â½ return __builtin_neon_vbfmabv8bf (__r, __a, __b);
> +}
> +
> +__extension__ extern __inline float32x4_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vbfmlaltq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
> +{
> +Ã¯Â¿Â½ return __builtin_neon_vbfmatv8bf (__r, __a, __b);
> +}
> +
> +__extension__ extern __inline float32x4_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vbfmlalbq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ const int __index)
> +{
> +Ã¯Â¿Â½ return __builtin_neon_vbfmab_lanev8bf (__r, __a, __b, __index);
> +}
> +
> +__extension__ extern __inline float32x4_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vbfmlaltq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ const int __index)
> +{
> +Ã¯Â¿Â½ return __builtin_neon_vbfmat_lanev8bf (__r, __a, __b, __index);
> +}
> +
> +__extension__ extern __inline float32x4_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vbfmlalbq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ const int __index)
> +{
> +Ã¯Â¿Â½ return __builtin_neon_vbfmab_laneqv8bf (__r, __a, __b, __index);
> +}
> +
> +__extension__ extern __inline float32x4_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ const int __index)
> +{
> +Ã¯Â¿Â½ return __builtin_neon_vbfmat_laneqv8bf (__r, __a, __b, __index);
> +}
> +
> +#pragma GCC pop_options
> +
>  Ã¯Â¿Â½#pragma GCC pop_options
>  Ã¯Â¿Â½#endif
> 
> diff --git a/gcc/config/arm/arm_neon_builtins.def 
> b/gcc/config/arm/arm_neon_builtins.def
> index 
> e9ff4e501cbb5d16b9211f5bc96db376ddf21afc..cc06783daf393f7166fd922f86b3db79c02ba188 
> 100644
> --- a/gcc/config/arm/arm_neon_builtins.def
> +++ b/gcc/config/arm/arm_neon_builtins.def
> @@ -373,3 +373,12 @@ VAR2 (MAC_LANE_PAIR, vcmlaq_lane0, v4sf, v8hf)
>   VAR2 (MAC_LANE_PAIR, vcmlaq_lane90, v4sf, v8hf)
>   VAR2 (MAC_LANE_PAIR, vcmlaq_lane180, v4sf, v8hf)
>   VAR2 (MAC_LANE_PAIR, vcmlaq_lane270, v4sf, v8hf)
> +
> +VAR1 (TERNOP, vbfmmla, v8bf)
> +
> +VAR1 (TERNOP, vbfmab, v8bf)
> +VAR1 (TERNOP, vbfmat, v8bf)
> +VAR1 (MAC_LANE, vbfmab_lane, v8bf)
> +VAR1 (MAC_LANE, vbfmat_lane, v8bf)
> +VAR1 (MAC_LANE, vbfmab_laneq, v8bf)
> +VAR1 (MAC_LANE, vbfmat_laneq, v8bf)
> 
> The instructions produced from these intrinsics have the form vmmla, 
> vfmab, vfmat. Let's use those names here rather than the "vbf*" ones to 
> avoid confusion in the future.
> 
> diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
> index 
> 33e29509f00a89fa23d0546687c0e4643f0b32d2..72b8ce0bb26dcd520603b907b4f86a74d0164332 
> 100644
> --- a/gcc/config/arm/iterators.md
> +++ b/gcc/config/arm/iterators.md
> @@ -106,6 +106,9 @@
>  Ã¯Â¿Â½;; Quad-width vector modes plus 64-bit elements.
>  Ã¯Â¿Â½(define_mode_iterator VQX [V16QI V8HI V8HF V4SI V4SF V2DI])
> 
> +;; Quad-width vector modes plus 64-bit elements and V8BF.
> +(define_mode_iterator VQXBF [V16QI V8HI V8HF (V8BF "TARGET_BF16_SIMD") 
> V4SI V4SF V2DI])
> +
>  Ã¯Â¿Â½;; Quad-width vector modes without floating-point elements.
>  Ã¯Â¿Â½(define_mode_iterator VQI [V16QI V8HI V4SI])
> 
> @@ -485,6 +488,8 @@
>  Ã¯Â¿Â½(define_int_iterator VCADD [UNSPEC_VCADD90 UNSPEC_VCADD270])
>  Ã¯Â¿Â½(define_int_iterator VCMLA [UNSPEC_VCMLA UNSPEC_VCMLA90 
> UNSPEC_VCMLA180 UNSPEC_VCMLA270])
> 
> +(define_int_iterator BF_MA [UNSPEC_BFMAB UNSPEC_BFMAT])
> +
>  Ã¯Â¿Â½;;----------------------------------------------------------------------------
>  Ã¯Â¿Â½;; Mode attributes
>  Ã¯Â¿Â½;;----------------------------------------------------------------------------
> @@ -609,7 +614,8 @@
>  Ã¯Â¿Â½(define_mode_attr V_HALF [(V16QI "V8QI") (V8HI "V4HI")
>  Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (V8HF "V4HF") (V4SIÃ¯Â¿Â½ "V2SI")
>  Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (V4SF "V2SF") (V2DF "DF")
> -Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (V2DI "DI") (V4HF "HF")])
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (V2DI "DI") (V4HF "HF")
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (V8BF "V4BF")])
> 
>  Ã¯Â¿Â½;; Same, but lower-case.
>  Ã¯Â¿Â½(define_mode_attr V_half [(V16QI "v8qi") (V8HI "v4hi")
> @@ -1171,4 +1177,7 @@
>  Ã¯Â¿Â½(define_int_attr opsuffix [(UNSPEC_DOT_S "s8")
>  Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (UNSPEC_DOT_U "u8")])
> 
> +;; An iterator for VFMA<bt>
> +(define_int_attr bt [(UNSPEC_BFMAB "b") (UNSPEC_BFMAT "t")])
> +
>  Ã¯Â¿Â½(define_int_attr smlaw_op [(UNSPEC_SMLAWB "smlawb") (UNSPEC_SMLAWT 
> "smlawt")])
> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index 
> 6087ca6f2badde6a492bb515a2cb5846f3d4ad8e..4e0d0b5c317a81839de9dee581c5e351d3193dfa 
> 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -3875,7 +3875,7 @@ if (BYTES_BIG_ENDIAN)
> 
>  Ã¯Â¿Â½(define_expand "neon_vget_high<mode>"
>  Ã¯Â¿Â½Ã¯Â¿Â½ [(match_operand:<V_HALF> 0 "s_register_operand")
> -Ã¯Â¿Â½Ã¯Â¿Â½ (match_operand:VQX 1 "s_register_operand")]
> +Ã¯Â¿Â½Ã¯Â¿Â½ (match_operand:VQXBF 1 "s_register_operand")]
>  Ã¯Â¿Â½Ã¯Â¿Â½ "TARGET_NEON"
>  Ã¯Â¿Â½{
>  Ã¯Â¿Â½Ã¯Â¿Â½ emit_move_insn (operands[0],
> @@ -6552,3 +6552,64 @@ if (BYTES_BIG_ENDIAN)
>  Ã¯Â¿Â½ "vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
>  Ã¯Â¿Â½ [(set_attr "type" "neon_fp_abd_s<q>")]
>  Ã¯Â¿Â½)
> +
> +(define_insn "neon_vbfmmlav8bf"
> +Ã¯Â¿Â½ [(set (match_operand:V4SF 0 "register_operand" "=w")
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (plus:V4SF (match_operand:V4SF 1 "register_operand" "0")
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (unspec:V4SF [(match_operand:V8BF 2 
> "register_operand" "w")
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (match_operand:V8BF 3 
> "register_operand" "w")]
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ UNSPEC_BFMMLA)))]
> +Ã¯Â¿Â½ "TARGET_BF16_SIMD"
> +Ã¯Â¿Â½ "vmmla.bf16\\t%q0, %q2, %q3"
> +Ã¯Â¿Â½ [(set_attr "type" "neon_fp_mla_s_q")]
> +)
> +
> +(define_insn "neon_vbfma<bt>v8bf"
> +Ã¯Â¿Â½ [(set (match_operand:V4SF 0 "register_operand" "=w")
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (unspec:V4SF [(match_operand:V8BF 2 
> "register_operand" "w")
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (match_operand:V8BF 3 
> "register_operand" "w")]
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ BF_MA)))]
> +Ã¯Â¿Â½ "TARGET_BF16_SIMD"
> +Ã¯Â¿Â½ "vfma<bt>.bf16\\t%q0, %q2, %q3"
> +Ã¯Â¿Â½ [(set_attr "type" "neon_fp_mla_s_q")]
> +)
> +
> +(define_insn "neon_vbfma<bt>_lanev8bf"
> +Ã¯Â¿Â½ [(set (match_operand:V4SF 0 "register_operand" "=w")
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (unspec:V4SF [(match_operand:V8BF 2 
> "register_operand" "w")
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (match_operand:V4BF 3 
> "register_operand" "x")
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (match_operand:SI 4 
> "const_int_operand" "n")]
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ BF_MA)))]
> +Ã¯Â¿Â½ "TARGET_BF16_SIMD"
> +Ã¯Â¿Â½ "vfma<bt>.bf16\\t%q0, %q2, %P3[%c4]"
> +Ã¯Â¿Â½ [(set_attr "type" "neon_fp_mla_s_scalar_q")]
> +)
> +
> +(define_expand "neon_vbfma<bt>_laneqv8bf"
> +Ã¯Â¿Â½ [(set (match_operand:V4SF 0 "register_operand" "=w")
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (unspec:V4SF [(match_operand:V8BF 2 
> "register_operand" "w")
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (match_operand:V8BF 3 
> "register_operand" "x")
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (match_operand:SI 4 
> "const_int_operand" "n")]
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ BF_MA)))]
> +Ã¯Â¿Â½ "TARGET_BF16_SIMD"
> +Ã¯Â¿Â½ {
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ int lane = INTVAL (operands[4]);
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ gcc_assert (lane >=0 && lane <=7);
> 
> Let's use the IN_RANGE macro to assert this.
> 
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ if (lane < 4)
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ {
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ emit_insn (gen_neon_vbfma<bt>_lanev8bf (operands[0], operands[1], 
> operands[2], operands[3], operands[4]));
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ }
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ else
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ {
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ rtx op_highpart = gen_reg_rtx (V4BFmode);
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ emit_insn (gen_neon_vget_highv8bf (op_highpart, operands[3]));
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ operands[4] = GEN_INT (lane - 4);
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ emit_insn (gen_neon_vbfma<bt>_lanev8bf (operands[0], operands[1], 
> operands[2], op_highpart, operands[4]));
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ }
> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ DONE;
> +Ã¯Â¿Â½ }
> +Ã¯Â¿Â½ [(set_attr "type" "neon_fp_mla_s_scalar_q")]
> +)
> diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
> index 
> 8f4a705f43efdb6baf03b39cee589cf728620687..97f08abec0a089b5cd95840da12ae22f7c960b28 
> 100644
> --- a/gcc/config/arm/unspecs.md
> +++ b/gcc/config/arm/unspecs.md
> @@ -501,4 +501,7 @@
>  Ã¯Â¿Â½Ã¯Â¿Â½ UNSPEC_VCMLA90
>  Ã¯Â¿Â½Ã¯Â¿Â½ UNSPEC_VCMLA180
>  Ã¯Â¿Â½Ã¯Â¿Â½ UNSPEC_VCMLA270
> +Ã¯Â¿Â½ UNSPEC_BFMMLA
> +Ã¯Â¿Â½ UNSPEC_BFMAB
> +Ã¯Â¿Â½ UNSPEC_BFMAT
>  Ã¯Â¿Â½])
> diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c 
> b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..7602db9597a955b2a303f2dc55b9ff80f81b3b6f 
> 
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c
> @@ -0,0 +1,79 @@
> +/* { dg-do assemble } */
> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
> +/* { dg-add-options arm_v8_2a_bf16_neon } */
> +/* { dg-additional-options "-save-temps" }Ã¯Â¿Â½ */
> +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
> +
> +#include "arm_neon.h"
> +
> +/*
> +**test_vbfmlalbq_f32:
> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ ...
> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ vfmab.bf16Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ q0, q1, q2
> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ bxÃ¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ lr
> +*/
> +float32x4_t
> +test_vbfmlalbq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
> +{
> +Ã¯Â¿Â½ return vbfmlalbq_f32 (r, a, b);
> +}
> +
> +/*
> +**test_vbfmlaltq_f32:
> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ ...
> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ vfmat.bf16Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ q0, q1, q2
> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ bxÃ¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ lr
> +*/
> +float32x4_t
> +test_vbfmlaltq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
> +{
> +Ã¯Â¿Â½ return vbfmlaltq_f32 (r, a, b);
> +}
> +
> +/*
> +**test_vbfmlalbq_lane_f32:
> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ ...
> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ vfmab.bf16Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ q0, q1, d4[0]
> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ bxÃ¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ lr
> +*/
> +float32x4_t
> +test_vbfmlalbq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
> +{
> +Ã¯Â¿Â½ return vbfmlalbq_lane_f32 (r, a, b, 0);
> +}
> +
> +/*
> +**test_vbfmlaltq_lane_f32:
> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ ...
> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ vfmat.bf16Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ q0, q1, d4[2]
> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ bxÃ¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ lr
> +*/
> +float32x4_t
> +test_vbfmlaltq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
> +{
> +Ã¯Â¿Â½ return vbfmlaltq_lane_f32 (r, a, b, 2);
> +}
> +
> +/*
> +**test_vbfmlalbq_laneq_f32:
> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ ...
> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ vfmab.bf16Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ q0, q1, d5[1]
> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ bxÃ¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ lr
> +*/
> +float32x4_t
> +test_vbfmlalbq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
> +{
> +Ã¯Â¿Â½ return vbfmlalbq_laneq_f32 (r, a, b, 5);
> +}
> +
> +/*
> +**test_vbfmlaltq_laneq_f32:
> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ ...
> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ vfmat.bf16Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ q0, q1, d5[3]
> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ bxÃ¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ lr
> +*/
> +float32x4_t
> +test_vbfmlaltq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
> +{
> +Ã¯Â¿Â½ return vbfmlaltq_laneq_f32 (r, a, b, 7);
> +}
> diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c 
> b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..226ed7e1d8e4747d73b0518c809aaf0e3c5bc78d 
> 
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c
> @@ -0,0 +1,31 @@
> +/* { dg-do compile { target { arm*-*-* } } } */
> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
> +/* { dg-add-options arm_v8_2a_bf16_neon } */
> +
> +#include "arm_neon.h"
> +
> +/* Test lane index limits for vbfmlalbq_lane_f32Ã¯Â¿Â½ */
> +float32x4_t
> +test_vbfmlalbq_lane_f32_low (float32x4_t r, bfloat16x8_t a, 
> bfloat16x4_t b)
> +{
> +Ã¯Â¿Â½ return __builtin_neon_vbfmab_lanev8bf (r, a, b, -1); /* { dg-error 
> {lane -1 out of range 0 - 3} } */
> +}
> +
> +float32x4_t
> +test_vbfmlalbq_lane_f32_high (float32x4_t r, bfloat16x8_t a, 
> bfloat16x4_t b)
> +{
> +Ã¯Â¿Â½ return __builtin_neon_vbfmab_lanev8bf (r, a, b, 4); /* { dg-error 
> {lane 4 out of range 0 - 3} } */
> +}
> +
> +/* Test lane index limits for vbfmlaltq_lane_f32Ã¯Â¿Â½ */
> +float32x4_t
> +test_vbfmlaltq_lane_f32_low (float32x4_t r, bfloat16x8_t a, 
> bfloat16x4_t b)
> +{
> +Ã¯Â¿Â½ return __builtin_neon_vbfmat_lanev8bf (r, a, b, -1); /* { dg-error 
> {lane -1 out of range 0 - 3} } */
> +}
> +
> +float32x4_t
> +test_vbfmlaltq_lane_f32_high (float32x4_t r, bfloat16x8_t a, 
> bfloat16x4_t b)
> +{
> +Ã¯Â¿Â½ return __builtin_neon_vbfmat_lanev8bf (r, a, b, 4); /* { dg-error 
> {lane 4 out of range 0 - 3} } */
> +}
> 
> 
> We want to be testing the ACLE intrinsics here rather than the 
> __builtin_neon* builtins directly. The builtins are an implementation 
> detail that the user should not rely on.
> 
> Ok with these changes.
> Thanks,
> Kyrill
> 
> 
> 
> diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c 
> b/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..d8118a7111a359464f1508e92ac6183ea1f4eeed 
> 
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c
> @@ -0,0 +1,18 @@
> +/* { dg-do assemble } */
> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
> +/* { dg-add-options arm_v8_2a_bf16_neon } */
> +/* { dg-additional-options "-save-temps" } */
> +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
> +
> +#include <arm_neon.h>
> +
> +/*test_vbfmmlaq_f32:
> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ ...
> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ vmmla.bf16Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ q0, q1, q2
> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ bxÃ¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ lr
> +*/
> +float32x4_t
> +test_vbfmmlaq_f32 (float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
> +{
> +Ã¯Â¿Â½ return vbfmmlaq_f32 (r, x, y);
> +}
> 

[-- Attachment #2: rb12263.patch --]
[-- Type: text/x-patch, Size: 11690 bytes --]

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index a66961d0c513323844dd069b05cdfccc3e432cfc..1974967b171c28b95b21dc27837d7fe69f2d9f64 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -19426,6 +19426,59 @@ vcvtq_high_bf16_f32 (bfloat16x8_t inactive, float32x4_t __a)
   return __builtin_neon_vbfcvtv4sf_highv8bf (inactive, __a);
 }
 
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmmlaq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vmmlav8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vfmabv8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vfmatv8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+		   const int __index)
+{
+  return __builtin_neon_vfmab_lanev8bf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+		   const int __index)
+{
+  return __builtin_neon_vfmat_lanev8bf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+		    const int __index)
+{
+  return __builtin_neon_vfmab_laneqv8bf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+		    const int __index)
+{
+  return __builtin_neon_vfmat_laneqv8bf (__r, __a, __b, __index);
+}
+
 #pragma GCC pop_options
 
 #ifdef __cplusplus
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def
index 48c06c43a1744da7e143f6070ac945e8dd7225b6..38c8bb0b0ebe2c3cc59da629c7630c389ddd8317 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -391,3 +391,12 @@ VAR2 (UNOP, vbfcvt, v4bf, v8bf)
 VAR1 (UNOP, vbfcvt_high, v8bf)
 VAR2 (UNOP, vbfcvtv4sf, v4bf, v8bf)
 VAR1 (BINOP, vbfcvtv4sf_high, v8bf)
+
+VAR1 (TERNOP, vmmla, v8bf)
+
+VAR1 (TERNOP, vfmab, v8bf)
+VAR1 (TERNOP, vfmat, v8bf)
+VAR1 (MAC_LANE, vfmab_lane, v8bf)
+VAR1 (MAC_LANE, vfmat_lane, v8bf)
+VAR1 (MAC_LANE, vfmab_laneq, v8bf)
+VAR1 (MAC_LANE, vfmat_laneq, v8bf)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 5f4e3d1235813ab81c176505f9a98d702359f7ec..831400192280d892835055174d9daab22ab08c92 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -106,6 +106,9 @@
 ;; Quad-width vector modes plus 64-bit elements.
 (define_mode_iterator VQX [V16QI V8HI V8HF V8BF V4SI V4SF V2DI])
 
+;; Quad-width vector modes plus 64-bit elements and V8BF.
+(define_mode_iterator VQXBF [V16QI V8HI V8HF (V8BF "TARGET_BF16_SIMD") V4SI V4SF V2DI])
+
 ;; Quad-width vector modes without floating-point elements.
 (define_mode_iterator VQI [V16QI V8HI V4SI])
 
@@ -493,6 +496,8 @@
 
 (define_int_iterator MATMUL [UNSPEC_MATMUL_S UNSPEC_MATMUL_U UNSPEC_MATMUL_US])
 
+(define_int_iterator BF_MA [UNSPEC_BFMAB UNSPEC_BFMAT])
+
 ;;----------------------------------------------------------------------------
 ;; Mode attributes
 ;;----------------------------------------------------------------------------
@@ -1209,3 +1214,6 @@
 			   ])
 
 (define_int_attr smlaw_op [(UNSPEC_SMLAWB "smlawb") (UNSPEC_SMLAWT "smlawt")])
+
+;; An iterator for VFMA<bt>
+(define_int_attr bt [(UNSPEC_BFMAB "b") (UNSPEC_BFMAT "t")])
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index f5286d9c4b1a309f6ebe864c86596aaceb05c05b..75cc31a0d144724e8e51cb7f05a27e71a77eed25 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -3924,7 +3924,7 @@ if (BYTES_BIG_ENDIAN)
 
 (define_expand "neon_vget_high<mode>"
   [(match_operand:<V_HALF> 0 "s_register_operand")
-   (match_operand:VQX 1 "s_register_operand")]
+   (match_operand:VQXBF 1 "s_register_operand")]
   "TARGET_NEON"
 {
   emit_move_insn (operands[0],
@@ -6737,3 +6737,64 @@ if (BYTES_BIG_ENDIAN)
   "TARGET_BF16_FP"
   ""
 )
+
+(define_insn "neon_vmmlav8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+        (plus:V4SF (match_operand:V4SF 1 "register_operand" "0")
+                   (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+                                 (match_operand:V8BF 3 "register_operand" "w")]
+                    UNSPEC_BFMMLA)))]
+  "TARGET_BF16_SIMD"
+  "vmmla.bf16\\t%q0, %q2, %q3"
+  [(set_attr "type" "neon_fp_mla_s_q")]
+)
+
+(define_insn "neon_vfma<bt>v8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+        (plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
+                    (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+                                  (match_operand:V8BF 3 "register_operand" "w")]
+                     BF_MA)))]
+  "TARGET_BF16_SIMD"
+  "vfma<bt>.bf16\\t%q0, %q2, %q3"
+  [(set_attr "type" "neon_fp_mla_s_q")]
+)
+
+(define_insn "neon_vfma<bt>_lanev8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+        (plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
+                    (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+                                  (match_operand:V4BF 3 "register_operand" "x")
+                                  (match_operand:SI 4 "const_int_operand" "n")]
+                     BF_MA)))]
+  "TARGET_BF16_SIMD"
+  "vfma<bt>.bf16\\t%q0, %q2, %P3[%c4]"
+  [(set_attr "type" "neon_fp_mla_s_scalar_q")]
+)
+
+(define_expand "neon_vfma<bt>_laneqv8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+        (plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
+                    (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+                                  (match_operand:V8BF 3 "register_operand" "x")
+                                  (match_operand:SI 4 "const_int_operand" "n")]
+                     BF_MA)))]
+  "TARGET_BF16_SIMD"
+  {
+    int lane = INTVAL (operands[4]);
+    gcc_assert (IN_RANGE(lane, 0, 7));
+    if (lane < 4)
+    {
+	emit_insn (gen_neon_vfma<bt>_lanev8bf (operands[0], operands[1], operands[2], operands[3], operands[4]));
+    }
+    else
+      {
+	rtx op_highpart = gen_reg_rtx (V4BFmode);
+	emit_insn (gen_neon_vget_highv8bf (op_highpart, operands[3]));
+	operands[4] = GEN_INT (lane - 4);
+	emit_insn (gen_neon_vfma<bt>_lanev8bf (operands[0], operands[1], operands[2], op_highpart, operands[4]));
+      }
+    DONE;
+  }
+  [(set_attr "type" "neon_fp_mla_s_scalar_q")]
+)
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index b36ae512a6ebcf231b46a24e127c62e22e71a34f..f0b1f465de4b63d624510783576700519044717d 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -508,4 +508,7 @@
   UNSPEC_MATMUL_US
   UNSPEC_BFCVT
   UNSPEC_BFCVT_HIGH
+  UNSPEC_BFMMLA
+  UNSPEC_BFMAB
+  UNSPEC_BFMAT
 ])
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..d7a944923cc889bc5f8eaeaa6a4de7672bacb8c3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c
@@ -0,0 +1,79 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps" }  */
+/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
+
+#include "arm_neon.h"
+
+/*
+**test_vfmabq_f32:
+**      ...
+**      vfmab.bf16	q0, q1, q2
+**      bx	lr
+*/
+float32x4_t
+test_vfmabq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlalbq_f32 (r, a, b);
+}
+
+/*
+**test_vfmatq_f32:
+**      ...
+**      vfmat.bf16	q0, q1, q2
+**      bx	lr
+*/
+float32x4_t
+test_vfmatq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlaltq_f32 (r, a, b);
+}
+
+/*
+**test_vfmabq_lane_f32:
+**      ...
+**      vfmab.bf16	q0, q1, d4[0]
+**      bx	lr
+*/
+float32x4_t
+test_vfmabq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return vbfmlalbq_lane_f32 (r, a, b, 0);
+}
+
+/*
+**test_vfmatq_lane_f32:
+**      ...
+**      vfmat.bf16	q0, q1, d4[2]
+**      bx	lr
+*/
+float32x4_t
+test_vfmatq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return vbfmlaltq_lane_f32 (r, a, b, 2);
+}
+
+/*
+**test_vfmabq_laneq_f32:
+**      ...
+**      vfmab.bf16	q0, q1, d5[1]
+**      bx	lr
+*/
+float32x4_t
+test_vfmabq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlalbq_laneq_f32 (r, a, b, 5);
+}
+
+/*
+**test_vfmatq_laneq_f32:
+**      ...
+**      vfmat.bf16	q0, q1, d5[3]
+**      bx	lr
+*/
+float32x4_t
+test_vfmatq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlaltq_laneq_f32 (r, a, b, 7);
+}
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..5a7a2a71791968045b413fc6c1d7daade5cf30f0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c
@@ -0,0 +1,35 @@
+/* { dg-do compile { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include "arm_neon.h"
+
+/* Test lane index limits for vfmabq_lane_f32  */
+float32x4_t
+test_vfmabq_lane_f32_low (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */
+  return vbfmlalbq_lane_f32 (r, a, b, -1);
+}
+
+float32x4_t
+test_vfmabq_lane_f32_high (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */
+  return vbfmlalbq_lane_f32 (r, a, b, 4);
+}
+
+/* Test lane index limits for vfmatq_lane_f32  */
+float32x4_t
+test_vfmatq_lane_f32_low (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  /* { dg-error "lane -2 out of range 0 - 3" "" { target *-*-* } 0 } */
+  return vbfmlaltq_lane_f32 (r, a, b, -2);
+}
+
+float32x4_t
+test_vfmatq_lane_f32_high (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  /* { dg-error "lane 5 out of range 0 - 3" "" { target *-*-* } 0 } */
+  return vbfmlaltq_lane_f32 (r, a, b, 5);
+}
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..0b74e19203bbdbf8668f6c214843870338d27655
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c
@@ -0,0 +1,19 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps" } */
+/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
+
+#include <arm_neon.h>
+
+/*
+**test_vmmlaq_f32:
+**        ...
+**        vmmla.bf16	q0, q1, q2
+**        bx	lr
+*/
+float32x4_t
+test_vmmlaq_f32 (float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+  return vbfmmlaq_f32 (r, x, y);
+}

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla and vfma<b/t> for AArch32 AdvSIMD
  2020-03-04 17:21             ` Delia Burduv
@ 2020-03-05 11:22               ` Kyrill Tkachov
  2020-03-05 17:49                 ` Kyrill Tkachov
  0 siblings, 1 reply; 10+ messages in thread
From: Kyrill Tkachov @ 2020-03-05 11:22 UTC (permalink / raw)
  To: Delia Burduv, gcc-patches

Hi Delia,

On 3/4/20 5:20 PM, Delia Burduv wrote:
> Hi,
>
> This is the latest version of the patch.
>
> Thanks,
> Delia
>
> On 2/21/20 11:41 AM, Kyrill Tkachov wrote:
>> Hi Delia,
>>
>> On 2/19/20 5:23 PM, Delia Burduv wrote:
>>> Hi,
>>>
>>> Here is the latest version of the patch. It just has some minor 
>>> formatting changes that were brought up by Richard Sandiford in the 
>>> AArch64 patches
>>>
>>> Thanks,
>>> Delia
>>>
>>> On 1/31/20 3:23 PM, Delia Burduv wrote:
>>>> Here is the updated patch. The changes are minor, so let me know if 
>>>> there is anything else to fix or if it can be committed.
>>>>
>>>> Thank you,
>>>> Delia
>>>>
>>>> On 1/30/20 2:55 PM, Kyrill Tkachov wrote:
>>>>> Hi Delia,
>>>>>
>>>>>
>>>>> On 1/28/20 4:44 PM, Delia Burduv wrote:
>>>>>> Ping.
>>>>>> ------------------------------------------------------------------------ 
>>>>>>
>>>>>> *From:* Delia Burduv <delia.burduv@arm.com>
>>>>>> *Sent:* 22 January 2020 17:26
>>>>>> *To:* gcc-patches@gcc.gnu.org <gcc-patches@gcc.gnu.org>
>>>>>> *Cc:* nickc@redhat.com <nickc@redhat.com>; Richard Earnshaw 
>>>>>> <Richard.Earnshaw@arm.com>; Ramana Radhakrishnan 
>>>>>> <Ramana.Radhakrishnan@arm.com>; Kyrylo Tkachov 
>>>>>> <Kyrylo.Tkachov@arm.com>
>>>>>> *Subject:* Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 
>>>>>> vmmla and vfma<b/t> for AArch32 AdvSIMD
>>>>>> Ping.
>>>>>>
>>>>>> I have read Richard Sandiford's comments on the AArch64 patches 
>>>>>> and I
>>>>>> will apply what is relevant to this patch as well. Particularly, 
>>>>>> I will
>>>>>> change the tests to use the exact input and output registers and 
>>>>>> I will
>>>>>> change the types of the rtl patterns.
>>>>>
>>>>>
>>>>> Please send the updated patches so that someone can commit them 
>>>>> for you once they're reviewed.
>>>>>
>>>>> Thanks,
>>>>>
>>>>> Kyrill
>>>>>
>>>>>
>>>>>>
>>>>>> On 12/20/19 6:44 PM, Delia Burduv wrote:
>>>>>> > This patch adds the ARMv8.6 ACLE intrinsics for vmmla, vfmab 
>>>>>> and vfmat
>>>>>> > as part of the BFloat16 extension.
>>>>>> > (https://developer.arm.com/docs/101028/latest.)
>>>>>> > The intrinsics are declared in arm_neon.h and the RTL patterns are
>>>>>> > defined in neon.md.
>>>>>> > Two new tests are added to check assembler output and lane 
>>>>>> indices.
>>>>>> >
>>>>>> > This patch depends on the Arm back-end patch.
>>>>>> > (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)
>>>>>> >
>>>>>> > Tested for regression on arm-none-eabi and armeb-none-eabi. I 
>>>>>> don't have
>>>>>> > commit rights, so if this is ok can someone please commit it 
>>>>>> for me?
>>>>>> >
>>>>>> > gcc/ChangeLog:
>>>>>> >
>>>>>> > 2019-11-12  Delia Burduv <delia.burduv@arm.com>
>>>>>> >
>>>>>> >      * config/arm/arm_neon.h (vbfmmlaq_f32): New.
>>>>>> >        (vbfmlalbq_f32): New.
>>>>>> >        (vbfmlaltq_f32): New.
>>>>>> >        (vbfmlalbq_lane_f32): New.
>>>>>> >        (vbfmlaltq_lane_f32): New.
>>>>>> >        (vbfmlalbq_laneq_f32): New.
>>>>>> >        (vbfmlaltq_laneq_f32): New.
>>>>>> >      * config/arm/arm_neon_builtins.def (vbfmmla): New.
>>>>>> >        (vbfmab): New.
>>>>>> >        (vbfmat): New.
>>>>>> >        (vbfmab_lane): New.
>>>>>> >        (vbfmat_lane): New.
>>>>>> >        (vbfmab_laneq): New.
>>>>>> >        (vbfmat_laneq): New.
>>>>>> >      * config/arm/iterators.md (BF_MA): New int
>>>>>> iterator.
>>>>>> >        (bt): New int attribute.
>>>>>> >        (VQXBF): Copy of VQX with V8BF.
>>>>>> >        (V_HALF): Added V8BF.
>>>>>> >      * config/arm/neon.md (neon_vbfmmlav8hi): New
>>>>>> insn.
>>>>>> >        (neon_vbfma<bt>v8hi): New insn.
>>>>>> >        (neon_vbfma<bt>_lanev8hi): New
>>>>>> insn.
>>>>>> >        (neon_vbfma<bt>_laneqv8hi): New
>>>>>> expand.
>>>>>> >        (neon_vget_high<mode>): Changed
>>>>>> iterator to VQXBF.
>>>>>> >      * config/arm/unspecs.md (UNSPEC_BFMMLA): New
>>>>>> UNSPEC.
>>>>>> >        (UNSPEC_BFMAB): New UNSPEC.
>>>>>> >        (UNSPEC_BFMAT): New UNSPEC.
>>>>>> >
>>>>>> > 2019-11-12  Delia Burduv <delia.burduv@arm.com>
>>>>>> >
>>>>>> >      * gcc.target/arm/simd/bf16_ma_1.c: New
>>>>>> test.
>>>>>> >      * gcc.target/arm/simd/bf16_ma_2.c: New
>>>>>> test.
>>>>>> >      * gcc.target/arm/simd/bf16_mmla_1.c:
>>>>>> New test.
>>
>> This looks good, a few minor things though...
>>
>>
>> diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
>> index 
>> 3c78f435009ab027f92693d00ab5b40960d5419d..81f8008ea6a5fb11eb09f6685ba24bb0c54fb248 
>> 100644
>> --- a/gcc/config/arm/arm_neon.h
>> +++ b/gcc/config/arm/arm_neon.h
>> @@ -18742,6 +18742,64 @@ vcmlaq_rot270_laneq_f32 (float32x4_t __r, 
>> float32x4_t __a, float32x4_t __b,
>> \xA0Ã¯Â¿Â½Ã¯Â¿Â½ return __builtin_neon_vcmla_lane270v4sf (__r, __a, __b, 
>> __index);
>> \xA0Ã¯Â¿Â½}
>>
>> +#pragma GCC push_options
>> +#pragma GCC target ("arch=armv8.2-a+bf16")
>> +
>> +__extension__ extern __inline float32x4_t
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> +vbfmmlaq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
>> +{
>> +Ã¯Â¿Â½ return __builtin_neon_vbfmmlav8bf (__r, __a, __b);
>> +}
>> +
>> +__extension__ extern __inline float32x4_t
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> +vbfmlalbq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
>> +{
>> +Ã¯Â¿Â½ return __builtin_neon_vbfmabv8bf (__r, __a, __b);
>> +}
>> +
>> +__extension__ extern __inline float32x4_t
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> +vbfmlaltq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
>> +{
>> +Ã¯Â¿Â½ return __builtin_neon_vbfmatv8bf (__r, __a, __b);
>> +}
>> +
>> +__extension__ extern __inline float32x4_t
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> +vbfmlalbq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t 
>> __b,
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ const int __index)
>> +{
>> +Ã¯Â¿Â½ return __builtin_neon_vbfmab_lanev8bf (__r, __a, __b, __index);
>> +}
>> +
>> +__extension__ extern __inline float32x4_t
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> +vbfmlaltq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t 
>> __b,
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ const int __index)
>> +{
>> +Ã¯Â¿Â½ return __builtin_neon_vbfmat_lanev8bf (__r, __a, __b, __index);
>> +}
>> +
>> +__extension__ extern __inline float32x4_t
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> +vbfmlalbq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t 
>> __b,
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ const int __index)
>> +{
>> +Ã¯Â¿Â½ return __builtin_neon_vbfmab_laneqv8bf (__r, __a, __b, __index);
>> +}
>> +
>> +__extension__ extern __inline float32x4_t
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> +vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t 
>> __b,
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ const int __index)
>> +{
>> +Ã¯Â¿Â½ return __builtin_neon_vbfmat_laneqv8bf (__r, __a, __b, __index);
>> +}
>> +
>> +#pragma GCC pop_options
>> +
>> \xA0Ã¯Â¿Â½#pragma GCC pop_options
>> \xA0Ã¯Â¿Â½#endif
>>
>> diff --git a/gcc/config/arm/arm_neon_builtins.def 
>> b/gcc/config/arm/arm_neon_builtins.def
>> index 
>> e9ff4e501cbb5d16b9211f5bc96db376ddf21afc..cc06783daf393f7166fd922f86b3db79c02ba188 
>> 100644
>> --- a/gcc/config/arm/arm_neon_builtins.def
>> +++ b/gcc/config/arm/arm_neon_builtins.def
>> @@ -373,3 +373,12 @@ VAR2 (MAC_LANE_PAIR, vcmlaq_lane0, v4sf, v8hf)
>> \xA0Ã¯Â¿Â½VAR2 (MAC_LANE_PAIR, vcmlaq_lane90, v4sf, v8hf)
>> \xA0Ã¯Â¿Â½VAR2 (MAC_LANE_PAIR, vcmlaq_lane180, v4sf, v8hf)
>> \xA0Ã¯Â¿Â½VAR2 (MAC_LANE_PAIR, vcmlaq_lane270, v4sf, v8hf)
>> +
>> +VAR1 (TERNOP, vbfmmla, v8bf)
>> +
>> +VAR1 (TERNOP, vbfmab, v8bf)
>> +VAR1 (TERNOP, vbfmat, v8bf)
>> +VAR1 (MAC_LANE, vbfmab_lane, v8bf)
>> +VAR1 (MAC_LANE, vbfmat_lane, v8bf)
>> +VAR1 (MAC_LANE, vbfmab_laneq, v8bf)
>> +VAR1 (MAC_LANE, vbfmat_laneq, v8bf)
>>
>> The instructions produced from these intrinsics have the form vmmla, 
>> vfmab, vfmat. Let's use those names here rather than the "vbf*" ones 
>> to avoid confusion in the future.
>>
>> diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
>> index 
>> 33e29509f00a89fa23d0546687c0e4643f0b32d2..72b8ce0bb26dcd520603b907b4f86a74d0164332 
>> 100644
>> --- a/gcc/config/arm/iterators.md
>> +++ b/gcc/config/arm/iterators.md
>> @@ -106,6 +106,9 @@
>> \xA0Ã¯Â¿Â½;; Quad-width vector modes plus 64-bit elements.
>> \xA0Ã¯Â¿Â½(define_mode_iterator VQX [V16QI V8HI V8HF V4SI V4SF V2DI])
>>
>> +;; Quad-width vector modes plus 64-bit elements and V8BF.
>> +(define_mode_iterator VQXBF [V16QI V8HI V8HF (V8BF 
>> "TARGET_BF16_SIMD") V4SI V4SF V2DI])
>> +
>> \xA0Ã¯Â¿Â½;; Quad-width vector modes without floating-point elements.
>> \xA0Ã¯Â¿Â½(define_mode_iterator VQI [V16QI V8HI V4SI])
>>
>> @@ -485,6 +488,8 @@
>> \xA0Ã¯Â¿Â½(define_int_iterator VCADD [UNSPEC_VCADD90 UNSPEC_VCADD270])
>> \xA0Ã¯Â¿Â½(define_int_iterator VCMLA [UNSPEC_VCMLA UNSPEC_VCMLA90 
>> UNSPEC_VCMLA180 UNSPEC_VCMLA270])
>>
>> +(define_int_iterator BF_MA [UNSPEC_BFMAB UNSPEC_BFMAT])
>> +
>> \xA0Ã¯Â¿Â½;;---------------------------------------------------------------------------- 
>>
>> \xA0Ã¯Â¿Â½;; Mode attributes
>> \xA0Ã¯Â¿Â½;;---------------------------------------------------------------------------- 
>>
>> @@ -609,7 +614,8 @@
>> \xA0Ã¯Â¿Â½(define_mode_attr V_HALF [(V16QI "V8QI") (V8HI "V4HI")
>> \xA0Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (V8HF "V4HF") (V4SIÃ¯Â¿Â½ 
>> "V2SI")
>> \xA0Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (V4SF "V2SF") (V2DF "DF")
>> -Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (V2DI "DI") (V4HF "HF")])
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (V2DI "DI") (V4HF "HF")
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (V8BF "V4BF")])
>>
>> \xA0Ã¯Â¿Â½;; Same, but lower-case.
>> \xA0Ã¯Â¿Â½(define_mode_attr V_half [(V16QI "v8qi") (V8HI "v4hi")
>> @@ -1171,4 +1177,7 @@
>> \xA0Ã¯Â¿Â½(define_int_attr opsuffix [(UNSPEC_DOT_S "s8")
>> \xA0Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (UNSPEC_DOT_U "u8")])
>>
>> +;; An iterator for VFMA<bt>
>> +(define_int_attr bt [(UNSPEC_BFMAB "b") (UNSPEC_BFMAT "t")])
>> +
>> \xA0Ã¯Â¿Â½(define_int_attr smlaw_op [(UNSPEC_SMLAWB "smlawb") 
>> (UNSPEC_SMLAWT "smlawt")])
>> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
>> index 
>> 6087ca6f2badde6a492bb515a2cb5846f3d4ad8e..4e0d0b5c317a81839de9dee581c5e351d3193dfa 
>> 100644
>> --- a/gcc/config/arm/neon.md
>> +++ b/gcc/config/arm/neon.md
>> @@ -3875,7 +3875,7 @@ if (BYTES_BIG_ENDIAN)
>>
>> \xA0Ã¯Â¿Â½(define_expand "neon_vget_high<mode>"
>> \xA0Ã¯Â¿Â½Ã¯Â¿Â½ [(match_operand:<V_HALF> 0 "s_register_operand")
>> -Ã¯Â¿Â½Ã¯Â¿Â½ (match_operand:VQX 1 "s_register_operand")]
>> +Ã¯Â¿Â½Ã¯Â¿Â½ (match_operand:VQXBF 1 "s_register_operand")]
>> \xA0Ã¯Â¿Â½Ã¯Â¿Â½ "TARGET_NEON"
>> \xA0Ã¯Â¿Â½{
>> \xA0Ã¯Â¿Â½Ã¯Â¿Â½ emit_move_insn (operands[0],
>> @@ -6552,3 +6552,64 @@ if (BYTES_BIG_ENDIAN)
>> \xA0Ã¯Â¿Â½ "vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
>> \xA0Ã¯Â¿Â½ [(set_attr "type" "neon_fp_abd_s<q>")]
>> \xA0Ã¯Â¿Â½)
>> +
>> +(define_insn "neon_vbfmmlav8bf"
>> +Ã¯Â¿Â½ [(set (match_operand:V4SF 0 "register_operand" "=w")
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (plus:V4SF (match_operand:V4SF 1 
>> "register_operand" "0")
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (unspec:V4SF 
>> [(match_operand:V8BF 2 "register_operand" "w")
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ 
>> (match_operand:V8BF 3 "register_operand" "w")]
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ 
>> UNSPEC_BFMMLA)))]
>> +Ã¯Â¿Â½ "TARGET_BF16_SIMD"
>> +Ã¯Â¿Â½ "vmmla.bf16\\t%q0, %q2, %q3"
>> +Ã¯Â¿Â½ [(set_attr "type" "neon_fp_mla_s_q")]
>> +)
>> +
>> +(define_insn "neon_vbfma<bt>v8bf"
>> +Ã¯Â¿Â½ [(set (match_operand:V4SF 0 "register_operand" "=w")
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (plus: V4SF (match_operand:V4SF 1 
>> "register_operand" "0")
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ 
>> (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ 
>> (match_operand:V8BF 3 "register_operand" "w")]
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ BF_MA)))]
>> +Ã¯Â¿Â½ "TARGET_BF16_SIMD"
>> +Ã¯Â¿Â½ "vfma<bt>.bf16\\t%q0, %q2, %q3"
>> +Ã¯Â¿Â½ [(set_attr "type" "neon_fp_mla_s_q")]
>> +)
>> +
>> +(define_insn "neon_vbfma<bt>_lanev8bf"
>> +Ã¯Â¿Â½ [(set (match_operand:V4SF 0 "register_operand" "=w")
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (plus: V4SF (match_operand:V4SF 1 
>> "register_operand" "0")
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ 
>> (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ 
>> (match_operand:V4BF 3 "register_operand" "x")
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ 
>> (match_operand:SI 4 "const_int_operand" "n")]
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ BF_MA)))]
>> +Ã¯Â¿Â½ "TARGET_BF16_SIMD"
>> +Ã¯Â¿Â½ "vfma<bt>.bf16\\t%q0, %q2, %P3[%c4]"
>> +Ã¯Â¿Â½ [(set_attr "type" "neon_fp_mla_s_scalar_q")]
>> +)
>> +
>> +(define_expand "neon_vbfma<bt>_laneqv8bf"
>> +Ã¯Â¿Â½ [(set (match_operand:V4SF 0 "register_operand" "=w")
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (plus: V4SF (match_operand:V4SF 1 
>> "register_operand" "0")
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ 
>> (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ 
>> (match_operand:V8BF 3 "register_operand" "x")
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ 
>> (match_operand:SI 4 "const_int_operand" "n")]
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ BF_MA)))]
>> +Ã¯Â¿Â½ "TARGET_BF16_SIMD"
>> +Ã¯Â¿Â½ {
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ int lane = INTVAL (operands[4]);
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ gcc_assert (lane >=0 && lane <=7);
>>
>> Let's use the IN_RANGE macro to assert this.
>>
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ if (lane < 4)
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ {
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ emit_insn (gen_neon_vbfma<bt>_lanev8bf (operands[0], 
>> operands[1], operands[2], operands[3], operands[4]));
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ }
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ else
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ {
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ rtx op_highpart = gen_reg_rtx (V4BFmode);
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ emit_insn (gen_neon_vget_highv8bf (op_highpart, 
>> operands[3]));
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ operands[4] = GEN_INT (lane - 4);
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ emit_insn (gen_neon_vbfma<bt>_lanev8bf (operands[0], 
>> operands[1], operands[2], op_highpart, operands[4]));
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ }
>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ DONE;
>> +Ã¯Â¿Â½ }
>> +Ã¯Â¿Â½ [(set_attr "type" "neon_fp_mla_s_scalar_q")]
>> +)
>> diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
>> index 
>> 8f4a705f43efdb6baf03b39cee589cf728620687..97f08abec0a089b5cd95840da12ae22f7c960b28 
>> 100644
>> --- a/gcc/config/arm/unspecs.md
>> +++ b/gcc/config/arm/unspecs.md
>> @@ -501,4 +501,7 @@
>>     UNSPEC_VCMLA90
>>     UNSPEC_VCMLA180
>>     UNSPEC_VCMLA270
>> +  UNSPEC_BFMMLA
>> +  UNSPEC_BFMAB
>> +  UNSPEC_BFMAT
>>   ])
>> diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c 
>> b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c
>> new file mode 100644
>> index 
>> 0000000000000000000000000000000000000000..7602db9597a955b2a303f2dc55b9ff80f81b3b6f 
>>
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c
>> @@ -0,0 +1,79 @@
>> +/* { dg-do assemble } */
>> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
>> +/* { dg-add-options arm_v8_2a_bf16_neon } */
>> +/* { dg-additional-options "-save-temps" }  */
>> +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
>> +
>> +#include "arm_neon.h"
>> +
>> +/*
>> +**test_vbfmlalbq_f32:
>> +**      ...
>> +**      vfmab.bf16    q0, q1, q2
>> +**      bx    lr
>> +*/
>> +float32x4_t
>> +test_vbfmlalbq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
>> +{
>> +  return vbfmlalbq_f32 (r, a, b);
>> +}
>> +
>> +/*
>> +**test_vbfmlaltq_f32:
>> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ ...
>> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ vfmat.bf16Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ q0, q1, q2
>> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ bxÃ¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ lr
>> +*/
>> +float32x4_t
>> +test_vbfmlaltq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
>> +{
>> +Ã¯Â¿Â½ return vbfmlaltq_f32 (r, a, b);
>> +}
>> +
>> +/*
>> +**test_vbfmlalbq_lane_f32:
>> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ ...
>> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ vfmab.bf16Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ q0, q1, d4[0]
>> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ bxÃ¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ lr
>> +*/
>> +float32x4_t
>> +test_vbfmlalbq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
>> +{
>> +Ã¯Â¿Â½ return vbfmlalbq_lane_f32 (r, a, b, 0);
>> +}
>> +
>> +/*
>> +**test_vbfmlaltq_lane_f32:
>> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ ...
>> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ vfmat.bf16Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ q0, q1, d4[2]
>> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ bxÃ¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ lr
>> +*/
>> +float32x4_t
>> +test_vbfmlaltq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
>> +{
>> +Ã¯Â¿Â½ return vbfmlaltq_lane_f32 (r, a, b, 2);
>> +}
>> +
>> +/*
>> +**test_vbfmlalbq_laneq_f32:
>> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ ...
>> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ vfmab.bf16Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ q0, q1, d5[1]
>> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ bxÃ¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ lr
>> +*/
>> +float32x4_t
>> +test_vbfmlalbq_laneq_f32 (float32x4_t r, bfloat16x8_t a, 
>> bfloat16x8_t b)
>> +{
>> +Ã¯Â¿Â½ return vbfmlalbq_laneq_f32 (r, a, b, 5);
>> +}
>> +
>> +/*
>> +**test_vbfmlaltq_laneq_f32:
>> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ ...
>> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ vfmat.bf16Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ q0, q1, d5[3]
>> +**Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ bxÃ¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ lr
>> +*/
>> +float32x4_t
>> +test_vbfmlaltq_laneq_f32 (float32x4_t r, bfloat16x8_t a, 
>> bfloat16x8_t b)
>> +{
>> +Ã¯Â¿Â½ return vbfmlaltq_laneq_f32 (r, a, b, 7);
>> +}
>> diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c 
>> b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c
>> new file mode 100644
>> index 
>> 0000000000000000000000000000000000000000..226ed7e1d8e4747d73b0518c809aaf0e3c5bc78d 
>>
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c
>> @@ -0,0 +1,31 @@
>> +/* { dg-do compile { target { arm*-*-* } } } */
>> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
>> +/* { dg-add-options arm_v8_2a_bf16_neon } */
>> +
>> +#include "arm_neon.h"
>> +
>> +/* Test lane index limits for vbfmlalbq_lane_f32Ã¯Â¿Â½ */
>> +float32x4_t
>> +test_vbfmlalbq_lane_f32_low (float32x4_t r, bfloat16x8_t a, 
>> bfloat16x4_t b)
>> +{
>> +Ã¯Â¿Â½ return __builtin_neon_vbfmab_lanev8bf (r, a, b, -1); /* { 
>> dg-error {lane -1 out of range 0 - 3} } */
>> +}
>> +
>> +float32x4_t
>> +test_vbfmlalbq_lane_f32_high (float32x4_t r, bfloat16x8_t a, 
>> bfloat16x4_t b)
>> +{
>> +Ã¯Â¿Â½ return __builtin_neon_vbfmab_lanev8bf (r, a, b, 4); /* { 
>> dg-error {lane 4 out of range 0 - 3} } */
>> +}
>> +
>> +/* Test lane index limits for vbfmlaltq_lane_f32Ã¯Â¿Â½ */
>> +float32x4_t
>> +test_vbfmlaltq_lane_f32_low (float32x4_t r, bfloat16x8_t a, 
>> bfloat16x4_t b)
>> +{
>> +Ã¯Â¿Â½ return __builtin_neon_vbfmat_lanev8bf (r, a, b, -1); /* { 
>> dg-error {lane -1 out of range 0 - 3} } */
>> +}
>> +
>> +float32x4_t
>> +test_vbfmlaltq_lane_f32_high (float32x4_t r, bfloat16x8_t a, 
>> bfloat16x4_t b)
>> +{
>> +Ã¯Â¿Â½ return __builtin_neon_vbfmat_lanev8bf (r, a, b, 4); /* { 
>> dg-error {lane 4 out of range 0 - 3} } */
>> +}
>>
>>
>> We want to be testing the ACLE intrinsics here rather than the 
>> __builtin_neon* builtins directly. The builtins are an implementation 
>> detail that the user should not rely on.
>>
>> Ok with these changes.
>> Thanks,
>> Kyrill
>>

diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..d7a944923cc889bc5f8eaeaa6a4de7672bacb8c3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c
@@ -0,0 +1,79 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps" }  */
+/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */


As with my comments on the load and store patches, these directives end up not doing the function body checks: no optimisation option is passed, so the {-O[^0]} selector never matches.
I've adjusted these, added optimisation options to the tests, and committed this to master as 43031fbdda7d4edbd607365a4f3bbec069fe3983.
I also adjusted the ChangeLog to reflect the latest changes:
     2020-03-05  Delia Burduv  <delia.burduv@arm.com>
     
             * config/arm/arm_neon.h (vbfmmlaq_f32): New.
             (vbfmlalbq_f32): New.
             (vbfmlaltq_f32): New.
             (vbfmlalbq_lane_f32): New.
             (vbfmlaltq_lane_f32): New.
             (vbfmlalbq_laneq_f32): New.
             (vbfmlaltq_laneq_f32): New.
             * config/arm/arm_neon_builtins.def (vmmla): New.
             (vfmab): New.
             (vfmat): New.
             (vfmab_lane): New.
             (vfmat_lane): New.
             (vfmab_laneq): New.
             (vfmat_laneq): New.
             * config/arm/iterators.md (BF_MA): New int iterator.
             (bt): New int attribute.
             (VQXBF): Copy of VQX with V8BF.
             * config/arm/neon.md (neon_vmmlav8bf): New insn.
             (neon_vfma<bt>v8bf): New insn.
             (neon_vfma<bt>_lanev8bf): New insn.
             (neon_vfma<bt>_laneqv8bf): New expand.
             (neon_vget_high<mode>): Changed iterator to VQXBF.
             * config/arm/unspecs.md (UNSPEC_BFMMLA): New UNSPEC.
             (UNSPEC_BFMAB): New UNSPEC.
             (UNSPEC_BFMAT): New UNSPEC.
     
     2020-03-05  Delia Burduv  <delia.burduv@arm.com>
     
             * gcc.target/arm/simd/bf16_ma_1.c: New test.
             * gcc.target/arm/simd/bf16_ma_2.c: New test.
             * gcc.target/arm/simd/bf16_mmla_1.c: New test.

Thanks!
Kyrill


+
+#include "arm_neon.h"
+
+/*
+**test_vfmabq_f32:
+**      ...
+**      vfmab.bf16	q0, q1, q2
+**      bx	lr
+*/
+float32x4_t
+test_vfmabq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlalbq_f32 (r, a, b);
+}
+
+/*
+**test_vfmatq_f32:
+**      ...
+**      vfmat.bf16	q0, q1, q2
+**      bx	lr
+*/
+float32x4_t
+test_vfmatq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlaltq_f32 (r, a, b);
+}
+
+/*
+**test_vfmabq_lane_f32:
+**      ...
+**      vfmab.bf16	q0, q1, d4[0]
+**      bx	lr
+*/
+float32x4_t
+test_vfmabq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return vbfmlalbq_lane_f32 (r, a, b, 0);
+}
+
+/*
+**test_vfmatq_lane_f32:
+**      ...
+**      vfmat.bf16	q0, q1, d4[2]
+**      bx	lr
+*/
+float32x4_t
+test_vfmatq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return vbfmlaltq_lane_f32 (r, a, b, 2);
+}
+
+/*
+**test_vfmabq_laneq_f32:
+**      ...
+**      vfmab.bf16	q0, q1, d5[1]
+**      bx	lr
+*/
+float32x4_t
+test_vfmabq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlalbq_laneq_f32 (r, a, b, 5);
+}
+
+/*
+**test_vfmatq_laneq_f32:
+**      ...
+**      vfmat.bf16	q0, q1, d5[3]
+**      bx	lr
+*/
+float32x4_t
+test_vfmatq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlaltq_laneq_f32 (r, a, b, 7);
+}
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..5a7a2a71791968045b413fc6c1d7daade5cf30f0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c
@@ -0,0 +1,35 @@
+/* { dg-do compile { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include "arm_neon.h"
+
+/* Test lane index limits for vfmabq_lane_f32  */
+float32x4_t
+test_vfmabq_lane_f32_low (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */
+  return vbfmlalbq_lane_f32 (r, a, b, -1);
+}
+
+float32x4_t
+test_vfmabq_lane_f32_high (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */
+  return vbfmlalbq_lane_f32 (r, a, b, 4);
+}
+
+/* Test lane index limits for vfmatq_lane_f32  */
+float32x4_t
+test_vfmatq_lane_f32_low (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  /* { dg-error "lane -2 out of range 0 - 3" "" { target *-*-* } 0 } */
+  return vbfmlaltq_lane_f32 (r, a, b, -2);
+}
+
+float32x4_t
+test_vfmatq_lane_f32_high (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  /* { dg-error "lane 5 out of range 0 - 3" "" { target *-*-* } 0 } */
+  return vbfmlaltq_lane_f32 (r, a, b, 5);
+}
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..0b74e19203bbdbf8668f6c214843870338d27655
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c
@@ -0,0 +1,18 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps" } */
+/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
+
+#include <arm_neon.h>
+
+/*test_vfmmlaq_f32:
+**        ...
+**        vmmla.bf16	q0, q1, q2
+**        bx	lr
+*/
+float32x4_t
+test_vmmlaq_f32 (float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+  return vbfmmlaq_f32 (r, x, y);
+}

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla and vfma<b/t> for AArch32 AdvSIMD
  2020-03-05 11:22               ` Kyrill Tkachov
@ 2020-03-05 17:49                 ` Kyrill Tkachov
  0 siblings, 0 replies; 10+ messages in thread
From: Kyrill Tkachov @ 2020-03-05 17:49 UTC (permalink / raw)
  To: Delia Burduv, gcc-patches


On 3/5/20 11:22 AM, Kyrill Tkachov wrote:
> Hi Delia,
>
> On 3/4/20 5:20 PM, Delia Burduv wrote:
>> Hi,
>>
>> This is the latest version of the patch.
>>
>> Thanks,
>> Delia
>>
>> On 2/21/20 11:41 AM, Kyrill Tkachov wrote:
>>> Hi Delia,
>>>
>>> On 2/19/20 5:23 PM, Delia Burduv wrote:
>>>> Hi,
>>>>
>>>> Here is the latest version of the patch. It just has some minor 
>>>> formatting changes that were brought up by Richard Sandiford in the 
>>>> AArch64 patches
>>>>
>>>> Thanks,
>>>> Delia
>>>>
>>>> On 1/31/20 3:23 PM, Delia Burduv wrote:
>>>>> Here is the updated patch. The changes are minor, so let me know 
>>>>> if there is anything else to fix or if it can be committed.
>>>>>
>>>>> Thank you,
>>>>> Delia
>>>>>
>>>>> On 1/30/20 2:55 PM, Kyrill Tkachov wrote:
>>>>>> Hi Delia,
>>>>>>
>>>>>>
>>>>>> On 1/28/20 4:44 PM, Delia Burduv wrote:
>>>>>>> Ping.
>>>>>>> ------------------------------------------------------------------------ 
>>>>>>>
>>>>>>> *From:* Delia Burduv <delia.burduv@arm.com>
>>>>>>> *Sent:* 22 January 2020 17:26
>>>>>>> *To:* gcc-patches@gcc.gnu.org <gcc-patches@gcc.gnu.org>
>>>>>>> *Cc:* nickc@redhat.com <nickc@redhat.com>; Richard Earnshaw 
>>>>>>> <Richard.Earnshaw@arm.com>; Ramana Radhakrishnan 
>>>>>>> <Ramana.Radhakrishnan@arm.com>; Kyrylo Tkachov 
>>>>>>> <Kyrylo.Tkachov@arm.com>
>>>>>>> *Subject:* Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 
>>>>>>> vmmla and vfma<b/t> for AArch32 AdvSIMD
>>>>>>> Ping.
>>>>>>>
>>>>>>> I have read Richard Sandiford's comments on the AArch64 patches 
>>>>>>> and I
>>>>>>> will apply what is relevant to this patch as well. Particularly, 
>>>>>>> I will
>>>>>>> change the tests to use the exact input and output registers and 
>>>>>>> I will
>>>>>>> change the types of the rtl patterns.
>>>>>>
>>>>>>
>>>>>> Please send the updated patches so that someone can commit them 
>>>>>> for you once they're reviewed.
>>>>>>
>>>>>> Thanks,
>>>>>>
>>>>>> Kyrill
>>>>>>
>>>>>>
>>>>>>>
>>>>>>> On 12/20/19 6:44 PM, Delia Burduv wrote:
>>>>>>> > This patch adds the ARMv8.6 ACLE intrinsics for vmmla, vfmab 
>>>>>>> and vfmat
>>>>>>> > as part of the BFloat16 extension.
>>>>>>> > (https://developer.arm.com/docs/101028/latest.)
>>>>>>> > The intrinsics are declared in arm_neon.h and the RTL patterns 
>>>>>>> are
>>>>>>> > defined in neon.md.
>>>>>>> > Two new tests are added to check assembler output and lane 
>>>>>>> indices.
>>>>>>> >
>>>>>>> > This patch depends on the Arm back-end patch.
>>>>>>> > (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)
>>>>>>> >
>>>>>>> > Tested for regression on arm-none-eabi and armeb-none-eabi. I 
>>>>>>> don't have
>>>>>>> > commit rights, so if this is ok can someone please commit it 
>>>>>>> for me?
>>>>>>> >
>>>>>>> > gcc/ChangeLog:
>>>>>>> >
>>>>>>> > 2019-11-12  Delia Burduv <delia.burduv@arm.com>
>>>>>>> >
>>>>>>> >      * config/arm/arm_neon.h (vbfmmlaq_f32): New.
>>>>>>> >        (vbfmlalbq_f32): New.
>>>>>>> >        (vbfmlaltq_f32): New.
>>>>>>> >        (vbfmlalbq_lane_f32): New.
>>>>>>> >        (vbfmlaltq_lane_f32): New.
>>>>>>> >        (vbfmlalbq_laneq_f32): New.
>>>>>>> >        (vbfmlaltq_laneq_f32): New.
>>>>>>> >      * config/arm/arm_neon_builtins.def (vbfmmla): New.
>>>>>>> >        (vbfmab): New.
>>>>>>> >        (vbfmat): New.
>>>>>>> >        (vbfmab_lane): New.
>>>>>>> >        (vbfmat_lane): New.
>>>>>>> >        (vbfmab_laneq): New.
>>>>>>> >        (vbfmat_laneq): New.
>>>>>>> >      * config/arm/iterators.md (BF_MA): New int 
>>>>>>> iterator.
>>>>>>> >        (bt): New int attribute.
>>>>>>> >        (VQXBF): Copy of VQX with V8BF.
>>>>>>> >        (V_HALF): Added V8BF.
>>>>>>> >      * config/arm/neon.md (neon_vbfmmlav8hi): 
>>>>>>> New insn.
>>>>>>> >        (neon_vbfma<bt>v8hi): New insn.
>>>>>>> >        (neon_vbfma<bt>_lanev8hi): New 
>>>>>>> insn.
>>>>>>> >        (neon_vbfma<bt>_laneqv8hi): New 
>>>>>>> expand.
>>>>>>> >        (neon_vget_high<mode>): Changed 
>>>>>>> iterator to VQXBF.
>>>>>>> >      * config/arm/unspecs.md (UNSPEC_BFMMLA): New 
>>>>>>> UNSPEC.
>>>>>>> >        (UNSPEC_BFMAB): New UNSPEC.
>>>>>>> >        (UNSPEC_BFMAT): New UNSPEC.
>>>>>>> >
>>>>>>> > 2019-11-12  Delia Burduv <delia.burduv@arm.com>
>>>>>>> >
>>>>>>> >      * gcc.target/arm/simd/bf16_ma_1.c: 
>>>>>>> New test.
>>>>>>> >      * gcc.target/arm/simd/bf16_ma_2.c: 
>>>>>>> New test.
>>>>>>> >      * gcc.target/arm/simd/bf16_mmla_1.c: 
>>>>>>> New test.
>>>
>>> This looks good, a few minor things though...
>>>
>>>
>>> diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
>>> index 
>>> 3c78f435009ab027f92693d00ab5b40960d5419d..81f8008ea6a5fb11eb09f6685ba24bb0c54fb248 
>>> 100644
>>> --- a/gcc/config/arm/arm_neon.h
>>> +++ b/gcc/config/arm/arm_neon.h
>>> @@ -18742,6 +18742,64 @@ vcmlaq_rot270_laneq_f32 (float32x4_t __r, 
>>> float32x4_t __a, float32x4_t __b,
>>> \xA0Ã¯Â¿Â½Ã¯Â¿Â½ return __builtin_neon_vcmla_lane270v4sf (__r, __a, __b, 
>>> __index);
>>> \xA0Ã¯Â¿Â½}
>>>
>>> +#pragma GCC push_options
>>> +#pragma GCC target ("arch=armv8.2-a+bf16")
>>> +
>>> +__extension__ extern __inline float32x4_t
>>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>>> +vbfmmlaq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
>>> +{
>>> +Ã¯Â¿Â½ return __builtin_neon_vbfmmlav8bf (__r, __a, __b);
>>> +}
>>> +
>>> +__extension__ extern __inline float32x4_t
>>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>>> +vbfmlalbq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
>>> +{
>>> +Ã¯Â¿Â½ return __builtin_neon_vbfmabv8bf (__r, __a, __b);
>>> +}
>>> +
>>> +__extension__ extern __inline float32x4_t
>>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>>> +vbfmlaltq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
>>> +{
>>> +Ã¯Â¿Â½ return __builtin_neon_vbfmatv8bf (__r, __a, __b);
>>> +}
>>> +
>>> +__extension__ extern __inline float32x4_t
>>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>>> +vbfmlalbq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t 
>>> __b,
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ const int __index)
>>> +{
>>> +Ã¯Â¿Â½ return __builtin_neon_vbfmab_lanev8bf (__r, __a, __b, __index);
>>> +}
>>> +
>>> +__extension__ extern __inline float32x4_t
>>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>>> +vbfmlaltq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t 
>>> __b,
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ const int __index)
>>> +{
>>> +Ã¯Â¿Â½ return __builtin_neon_vbfmat_lanev8bf (__r, __a, __b, __index);
>>> +}
>>> +
>>> +__extension__ extern __inline float32x4_t
>>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>>> +vbfmlalbq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, 
>>> bfloat16x8_t __b,
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ const int __index)
>>> +{
>>> +Ã¯Â¿Â½ return __builtin_neon_vbfmab_laneqv8bf (__r, __a, __b, __index);
>>> +}
>>> +
>>> +__extension__ extern __inline float32x4_t
>>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>>> +vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, 
>>> bfloat16x8_t __b,
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ const int __index)
>>> +{
>>> +Ã¯Â¿Â½ return __builtin_neon_vbfmat_laneqv8bf (__r, __a, __b, __index);
>>> +}
>>> +
>>> +#pragma GCC pop_options
>>> +
>>> \xA0Ã¯Â¿Â½#pragma GCC pop_options
>>> \xA0Ã¯Â¿Â½#endif
>>>
>>> diff --git a/gcc/config/arm/arm_neon_builtins.def 
>>> b/gcc/config/arm/arm_neon_builtins.def
>>> index 
>>> e9ff4e501cbb5d16b9211f5bc96db376ddf21afc..cc06783daf393f7166fd922f86b3db79c02ba188 
>>> 100644
>>> --- a/gcc/config/arm/arm_neon_builtins.def
>>> +++ b/gcc/config/arm/arm_neon_builtins.def
>>> @@ -373,3 +373,12 @@ VAR2 (MAC_LANE_PAIR, vcmlaq_lane0, v4sf, v8hf)
>>> \xA0Ã¯Â¿Â½VAR2 (MAC_LANE_PAIR, vcmlaq_lane90, v4sf, v8hf)
>>> \xA0Ã¯Â¿Â½VAR2 (MAC_LANE_PAIR, vcmlaq_lane180, v4sf, v8hf)
>>> \xA0Ã¯Â¿Â½VAR2 (MAC_LANE_PAIR, vcmlaq_lane270, v4sf, v8hf)
>>> +
>>> +VAR1 (TERNOP, vbfmmla, v8bf)
>>> +
>>> +VAR1 (TERNOP, vbfmab, v8bf)
>>> +VAR1 (TERNOP, vbfmat, v8bf)
>>> +VAR1 (MAC_LANE, vbfmab_lane, v8bf)
>>> +VAR1 (MAC_LANE, vbfmat_lane, v8bf)
>>> +VAR1 (MAC_LANE, vbfmab_laneq, v8bf)
>>> +VAR1 (MAC_LANE, vbfmat_laneq, v8bf)
>>>
>>> The instructions produced from these intrinsics have the form vmmla, 
>>> vfmab, vfmat. Let's use those names here rather than the "vbf*" ones 
>>> to avoid confusion in the future.
>>>
>>> diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
>>> index 
>>> 33e29509f00a89fa23d0546687c0e4643f0b32d2..72b8ce0bb26dcd520603b907b4f86a74d0164332 
>>> 100644
>>> --- a/gcc/config/arm/iterators.md
>>> +++ b/gcc/config/arm/iterators.md
>>> @@ -106,6 +106,9 @@
>>> \xA0Ã¯Â¿Â½;; Quad-width vector modes plus 64-bit elements.
>>> \xA0Ã¯Â¿Â½(define_mode_iterator VQX [V16QI V8HI V8HF V4SI V4SF V2DI])
>>>
>>> +;; Quad-width vector modes plus 64-bit elements and V8BF.
>>> +(define_mode_iterator VQXBF [V16QI V8HI V8HF (V8BF 
>>> "TARGET_BF16_SIMD") V4SI V4SF V2DI])
>>> +
>>> \xA0Ã¯Â¿Â½;; Quad-width vector modes without floating-point elements.
>>> \xA0Ã¯Â¿Â½(define_mode_iterator VQI [V16QI V8HI V4SI])
>>>
>>> @@ -485,6 +488,8 @@
>>> \xA0Ã¯Â¿Â½(define_int_iterator VCADD [UNSPEC_VCADD90 UNSPEC_VCADD270])
>>> \xA0Ã¯Â¿Â½(define_int_iterator VCMLA [UNSPEC_VCMLA UNSPEC_VCMLA90 
>>> UNSPEC_VCMLA180 UNSPEC_VCMLA270])
>>>
>>> +(define_int_iterator BF_MA [UNSPEC_BFMAB UNSPEC_BFMAT])
>>> +
>>> \xA0Ã¯Â¿Â½;;---------------------------------------------------------------------------- 
>>>
>>> \xA0Ã¯Â¿Â½;; Mode attributes
>>> \xA0Ã¯Â¿Â½;;---------------------------------------------------------------------------- 
>>>
>>> @@ -609,7 +614,8 @@
>>> \xA0Ã¯Â¿Â½(define_mode_attr V_HALF [(V16QI "V8QI") (V8HI "V4HI")
>>> \xA0Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (V8HF "V4HF") (V4SIÃ¯Â¿Â½ 
>>> "V2SI")
>>> \xA0Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (V4SF "V2SF") (V2DF "DF")
>>> -Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (V2DI "DI") (V4HF "HF")])
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (V2DI "DI") (V4HF "HF")
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (V8BF "V4BF")])
>>>
>>> \xA0Ã¯Â¿Â½;; Same, but lower-case.
>>> \xA0Ã¯Â¿Â½(define_mode_attr V_half [(V16QI "v8qi") (V8HI "v4hi")
>>> @@ -1171,4 +1177,7 @@
>>> \xA0Ã¯Â¿Â½(define_int_attr opsuffix [(UNSPEC_DOT_S "s8")
>>> \xA0Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (UNSPEC_DOT_U "u8")])
>>>
>>> +;; An iterator for VFMA<bt>
>>> +(define_int_attr bt [(UNSPEC_BFMAB "b") (UNSPEC_BFMAT "t")])
>>> +
>>> \xA0Ã¯Â¿Â½(define_int_attr smlaw_op [(UNSPEC_SMLAWB "smlawb") 
>>> (UNSPEC_SMLAWT "smlawt")])
>>> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
>>> index 
>>> 6087ca6f2badde6a492bb515a2cb5846f3d4ad8e..4e0d0b5c317a81839de9dee581c5e351d3193dfa 
>>> 100644
>>> --- a/gcc/config/arm/neon.md
>>> +++ b/gcc/config/arm/neon.md
>>> @@ -3875,7 +3875,7 @@ if (BYTES_BIG_ENDIAN)
>>>
>>> \xA0Ã¯Â¿Â½(define_expand "neon_vget_high<mode>"
>>> \xA0Ã¯Â¿Â½Ã¯Â¿Â½ [(match_operand:<V_HALF> 0 "s_register_operand")
>>> -Ã¯Â¿Â½Ã¯Â¿Â½ (match_operand:VQX 1 "s_register_operand")]
>>> +Ã¯Â¿Â½Ã¯Â¿Â½ (match_operand:VQXBF 1 "s_register_operand")]
>>> \xA0Ã¯Â¿Â½Ã¯Â¿Â½ "TARGET_NEON"
>>> \xA0Ã¯Â¿Â½{
>>> \xA0Ã¯Â¿Â½Ã¯Â¿Â½ emit_move_insn (operands[0],
>>> @@ -6552,3 +6552,64 @@ if (BYTES_BIG_ENDIAN)
>>> \xA0Ã¯Â¿Â½ "vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
>>> \xA0Ã¯Â¿Â½ [(set_attr "type" "neon_fp_abd_s<q>")]
>>> \xA0Ã¯Â¿Â½)
>>> +
>>> +(define_insn "neon_vbfmmlav8bf"
>>> +Ã¯Â¿Â½ [(set (match_operand:V4SF 0 "register_operand" "=w")
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (plus:V4SF (match_operand:V4SF 1 
>>> "register_operand" "0")
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (unspec:V4SF 
>>> [(match_operand:V8BF 2 "register_operand" "w")
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ 
>>> (match_operand:V8BF 3 "register_operand" "w")]
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ 
>>> UNSPEC_BFMMLA)))]
>>> +Ã¯Â¿Â½ "TARGET_BF16_SIMD"
>>> +Ã¯Â¿Â½ "vmmla.bf16\\t%q0, %q2, %q3"
>>> +Ã¯Â¿Â½ [(set_attr "type" "neon_fp_mla_s_q")]
>>> +)
>>> +
>>> +(define_insn "neon_vbfma<bt>v8bf"
>>> +Ã¯Â¿Â½ [(set (match_operand:V4SF 0 "register_operand" "=w")
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (plus: V4SF (match_operand:V4SF 1 
>>> "register_operand" "0")
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ 
>>> (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ 
>>> (match_operand:V8BF 3 "register_operand" "w")]
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ BF_MA)))]
>>> +Ã¯Â¿Â½ "TARGET_BF16_SIMD"
>>> +Ã¯Â¿Â½ "vfma<bt>.bf16\\t%q0, %q2, %q3"
>>> +Ã¯Â¿Â½ [(set_attr "type" "neon_fp_mla_s_q")]
>>> +)
>>> +
>>> +(define_insn "neon_vbfma<bt>_lanev8bf"
>>> +Ã¯Â¿Â½ [(set (match_operand:V4SF 0 "register_operand" "=w")
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (plus: V4SF (match_operand:V4SF 1 
>>> "register_operand" "0")
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ 
>>> (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ 
>>> (match_operand:V4BF 3 "register_operand" "x")
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ 
>>> (match_operand:SI 4 "const_int_operand" "n")]
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ BF_MA)))]
>>> +Ã¯Â¿Â½ "TARGET_BF16_SIMD"
>>> +Ã¯Â¿Â½ "vfma<bt>.bf16\\t%q0, %q2, %P3[%c4]"
>>> +Ã¯Â¿Â½ [(set_attr "type" "neon_fp_mla_s_scalar_q")]
>>> +)
>>> +
>>> +(define_expand "neon_vbfma<bt>_laneqv8bf"
>>> +Ã¯Â¿Â½ [(set (match_operand:V4SF 0 "register_operand" "=w")
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ (plus: V4SF (match_operand:V4SF 1 
>>> "register_operand" "0")
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ 
>>> (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ 
>>> (match_operand:V8BF 3 "register_operand" "x")
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ 
>>> (match_operand:SI 4 "const_int_operand" "n")]
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ BF_MA)))]
>>> +Ã¯Â¿Â½ "TARGET_BF16_SIMD"
>>> +Ã¯Â¿Â½ {
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ int lane = INTVAL (operands[4]);
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ gcc_assert (lane >=0 && lane <=7);
>>>
>>> Let's use the IN_RANGE macro to assert this.
>>>
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ if (lane < 4)
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ {
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ emit_insn (gen_neon_vbfma<bt>_lanev8bf (operands[0], 
>>> operands[1], operands[2], operands[3], operands[4]));
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ }
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ else
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ {
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ rtx op_highpart = gen_reg_rtx (V4BFmode);
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ emit_insn (gen_neon_vget_highv8bf (op_highpart, 
>>> operands[3]));
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ operands[4] = GEN_INT (lane - 4);
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ emit_insn (gen_neon_vbfma<bt>_lanev8bf (operands[0], 
>>> operands[1], operands[2], op_highpart, operands[4]));
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ }
>>> +Ã¯Â¿Â½Ã¯Â¿Â½Ã¯Â¿Â½ DONE;
>>> +Ã¯Â¿Â½ }
>>> +Ã¯Â¿Â½ [(set_attr "type" "neon_fp_mla_s_scalar_q")]
>>> +)
>>> diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
>>> index 
>>> 8f4a705f43efdb6baf03b39cee589cf728620687..97f08abec0a089b5cd95840da12ae22f7c960b28 
>>> 100644
>>> --- a/gcc/config/arm/unspecs.md
>>> +++ b/gcc/config/arm/unspecs.md
>>> @@ -501,4 +501,7 @@
>>> \xA0Ã¯Â¿Â½Ã¯Â¿Â½ UNSPEC_VCMLA90
>>> \xA0Ã¯Â¿Â½Ã¯Â¿Â½ UNSPEC_VCMLA180
>>> \xA0Ã¯Â¿Â½Ã¯Â¿Â½ UNSPEC_VCMLA270
>>> +Ã¯Â¿Â½ UNSPEC_BFMMLA
>>> +Ã¯Â¿Â½ UNSPEC_BFMAB
>>> +Ã¯Â¿Â½ UNSPEC_BFMAT
>>> \xA0Ã¯Â¿Â½])
>>> diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c 
>>> b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c
>>> new file mode 100644
>>> index 
>>> 0000000000000000000000000000000000000000..7602db9597a955b2a303f2dc55b9ff80f81b3b6f 
>>>
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c
>>> @@ -0,0 +1,79 @@
>>> +/* { dg-do assemble } */
>>> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
>>> +/* { dg-add-options arm_v8_2a_bf16_neon } */
>>> +/* { dg-additional-options "-save-temps" }  */
>>> +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
>>> +
>>> +#include "arm_neon.h"
>>> +
>>> +/*
>>> +**test_vbfmlalbq_f32:
>>> +**      ...
>>> +**      vfmab.bf16    q0, q1, q2
>>> +**      bx    lr
>>> +*/
>>> +float32x4_t
>>> +test_vbfmlalbq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
>>> +{
>>> +  return vbfmlalbq_f32 (r, a, b);
>>> +}
>>> +
>>> +/*
>>> +**test_vbfmlaltq_f32:
>>> +**      ...
>>> +**      vfmat.bf16    q0, q1, q2
>>> +**      bx    lr
>>> +*/
>>> +float32x4_t
>>> +test_vbfmlaltq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
>>> +{
>>> +  return vbfmlaltq_f32 (r, a, b);
>>> +}
>>> +
>>> +/*
>>> +**test_vbfmlalbq_lane_f32:
>>> +**      ...
>>> +**      vfmab.bf16    q0, q1, d4[0]
>>> +**      bx    lr
>>> +*/
>>> +float32x4_t
>>> +test_vbfmlalbq_lane_f32 (float32x4_t r, bfloat16x8_t a, 
>>> bfloat16x4_t b)
>>> +{
>>> +  return vbfmlalbq_lane_f32 (r, a, b, 0);
>>> +}
>>> +
>>> +/*
>>> +**test_vbfmlaltq_lane_f32:
>>> +**      ...
>>> +**      vfmat.bf16    q0, q1, d4[2]
>>> +**      bx    lr
>>> +*/
>>> +float32x4_t
>>> +test_vbfmlaltq_lane_f32 (float32x4_t r, bfloat16x8_t a, 
>>> bfloat16x4_t b)
>>> +{
>>> +  return vbfmlaltq_lane_f32 (r, a, b, 2);
>>> +}
>>> +
>>> +/*
>>> +**test_vbfmlalbq_laneq_f32:
>>> +**      ...
>>> +**      vfmab.bf16    q0, q1, d5[1]
>>> +**      bx    lr
>>> +*/
>>> +float32x4_t
>>> +test_vbfmlalbq_laneq_f32 (float32x4_t r, bfloat16x8_t a, 
>>> bfloat16x8_t b)
>>> +{
>>> +  return vbfmlalbq_laneq_f32 (r, a, b, 5);
>>> +}
>>> +
>>> +/*
>>> +**test_vbfmlaltq_laneq_f32:
>>> +**      ...
>>> +**      vfmat.bf16    q0, q1, d5[3]
>>> +**      bx    lr
>>> +*/
>>> +float32x4_t
>>> +test_vbfmlaltq_laneq_f32 (float32x4_t r, bfloat16x8_t a, 
>>> bfloat16x8_t b)
>>> +{
>>> +  return vbfmlaltq_laneq_f32 (r, a, b, 7);
>>> +}
>>> diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c 
>>> b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c
>>> new file mode 100644
>>> index 
>>> 0000000000000000000000000000000000000000..226ed7e1d8e4747d73b0518c809aaf0e3c5bc78d 
>>>
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c
>>> @@ -0,0 +1,31 @@
>>> +/* { dg-do compile { target { arm*-*-* } } } */
>>> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
>>> +/* { dg-add-options arm_v8_2a_bf16_neon } */
>>> +
>>> +#include "arm_neon.h"
>>> +
>>> +/* Test lane index limits for vbfmlalbq_lane_f32  */
>>> +float32x4_t
>>> +test_vbfmlalbq_lane_f32_low (float32x4_t r, bfloat16x8_t a, 
>>> bfloat16x4_t b)
>>> +{
>>> +  return __builtin_neon_vbfmab_lanev8bf (r, a, b, -1); /* { 
>>> dg-error {lane -1 out of range 0 - 3} } */
>>> +}
>>> +
>>> +float32x4_t
>>> +test_vbfmlalbq_lane_f32_high (float32x4_t r, bfloat16x8_t a, 
>>> bfloat16x4_t b)
>>> +{
>>> +  return __builtin_neon_vbfmab_lanev8bf (r, a, b, 4); /* { 
>>> dg-error {lane 4 out of range 0 - 3} } */
>>> +}
>>> +
>>> +/* Test lane index limits for vbfmlaltq_lane_f32  */
>>> +float32x4_t
>>> +test_vbfmlaltq_lane_f32_low (float32x4_t r, bfloat16x8_t a, 
>>> bfloat16x4_t b)
>>> +{
>>> +  return __builtin_neon_vbfmat_lanev8bf (r, a, b, -1); /* { 
>>> dg-error {lane -1 out of range 0 - 3} } */
>>> +}
>>> +
>>> +float32x4_t
>>> +test_vbfmlaltq_lane_f32_high (float32x4_t r, bfloat16x8_t a, 
>>> bfloat16x4_t b)
>>> +{
>>> +  return __builtin_neon_vbfmat_lanev8bf (r, a, b, 4); /* { 
>>> dg-error {lane 4 out of range 0 - 3} } */
>>> +}
>>>
>>>
>>> We want to be testing the ACLE intrinsics here rather than the 
>>> __builtin_neon* builtins directly. The builtins are an 
>>> implementation detail that the user should not rely on.
>>>
>>> Ok with these changes.
>>> Thanks,
>>> Kyrill
>>>
>
> diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c 
> b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..d7a944923cc889bc5f8eaeaa6a4de7672bacb8c3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c
> @@ -0,0 +1,79 @@
> +/* { dg-do assemble } */
> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
> +/* { dg-add-options arm_v8_2a_bf16_neon } */
> +/* { dg-additional-options "-save-temps" }  */
> +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
>
>
> Same with the comments on the load and store patches, this ends up not 
> doing the function body checks...
> I've adjusted these and added optimisation options to the tests and 
> committed this to master with 43031fbdda7d4edbd607365a4f3bbec069fe3983
> and adjusted the ChangeLog to reflect the latest changes:
>     2020-03-05  Delia Burduv  <delia.burduv@arm.com>
>             * config/arm/arm_neon.h (vbfmmlaq_f32): New.
>             (vbfmlalbq_f32): New.
>             (vbfmlaltq_f32): New.
>             (vbfmlalbq_lane_f32): New.
>             (vbfmlaltq_lane_f32): New.
>             (vbfmlalbq_laneq_f32): New.
>             (vbfmlaltq_laneq_f32): New.
>             * config/arm/arm_neon_builtins.def (vmmla): New.
>             (vfmab): New.
>             (vfmat): New.
>             (vfmab_lane): New.
>             (vfmat_lane): New.
>             (vfmab_laneq): New.
>             (vfmat_laneq): New.
>             * config/arm/iterators.md (BF_MA): New int iterator.
>             (bt): New int attribute.
>             (VQXBF): Copy of VQX with V8BF.
>             * config/arm/neon.md (neon_vmmlav8bf): New insn.
>             (neon_vfma<bt>v8bf): New insn.
>             (neon_vfma<bt>_lanev8bf): New insn.
>             (neon_vfma<bt>_laneqv8bf): New expand.
>             (neon_vget_high<mode>): Changed iterator to VQXBF.
>             * config/arm/unspecs.md (UNSPEC_BFMMLA): New UNSPEC.
>             (UNSPEC_BFMAB): New UNSPEC.
>             (UNSPEC_BFMAT): New UNSPEC.
>     2020-03-05  Delia Burduv  <delia.burduv@arm.com>
>             * gcc.target/arm/simd/bf16_ma_1.c: New test.
>             * gcc.target/arm/simd/bf16_ma_2.c: New test.
>             * gcc.target/arm/simd/bf16_mmla_1.c: New test.
>

And I realised that I had accidentally only pushed the testsuite/ changes :/

I've pushed the rest of the patch with 
2d22ab64c4774d7d30c7e014652b28a13d744aec

Sorry for that.

Kyrill



> Thanks!
> Kyrill
>
>
> +
> +#include "arm_neon.h"
> +
> +/*
> +**test_vfmabq_f32:
> +**      ...
> +**      vfmab.bf16    q0, q1, q2
> +**      bx    lr
> +*/
> +float32x4_t
> +test_vfmabq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
> +{
> +  return vbfmlalbq_f32 (r, a, b);
> +}
> +
> +/*
> +**test_vfmatq_f32:
> +**      ...
> +**      vfmat.bf16    q0, q1, q2
> +**      bx    lr
> +*/
> +float32x4_t
> +test_vfmatq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
> +{
> +  return vbfmlaltq_f32 (r, a, b);
> +}
> +
> +/*
> +**test_vfmabq_lane_f32:
> +**      ...
> +**      vfmab.bf16    q0, q1, d4[0]
> +**      bx    lr
> +*/
> +float32x4_t
> +test_vfmabq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
> +{
> +  return vbfmlalbq_lane_f32 (r, a, b, 0);
> +}
> +
> +/*
> +**test_vfmatq_lane_f32:
> +**      ...
> +**      vfmat.bf16    q0, q1, d4[2]
> +**      bx    lr
> +*/
> +float32x4_t
> +test_vfmatq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
> +{
> +  return vbfmlaltq_lane_f32 (r, a, b, 2);
> +}
> +
> +/*
> +**test_vfmabq_laneq_f32:
> +**      ...
> +**      vfmab.bf16    q0, q1, d5[1]
> +**      bx    lr
> +*/
> +float32x4_t
> +test_vfmabq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
> +{
> +  return vbfmlalbq_laneq_f32 (r, a, b, 5);
> +}
> +
> +/*
> +**test_vfmatq_laneq_f32:
> +**      ...
> +**      vfmat.bf16    q0, q1, d5[3]
> +**      bx    lr
> +*/
> +float32x4_t
> +test_vfmatq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
> +{
> +  return vbfmlaltq_laneq_f32 (r, a, b, 7);
> +}
> diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c 
> b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..5a7a2a71791968045b413fc6c1d7daade5cf30f0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c
> @@ -0,0 +1,35 @@
> +/* { dg-do compile { target { arm*-*-* } } } */
> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
> +/* { dg-add-options arm_v8_2a_bf16_neon } */
> +
> +#include "arm_neon.h"
> +
> +/* Test lane index limits for vfmabq_lane_f32  */
> +float32x4_t
> +test_vfmabq_lane_f32_low (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
> +{
> +  /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */
> +  return vbfmlalbq_lane_f32 (r, a, b, -1);
> +}
> +
> +float32x4_t
> +test_vfmabq_lane_f32_high (float32x4_t r, bfloat16x8_t a, 
> bfloat16x4_t b)
> +{
> +  /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */
> +  return vbfmlalbq_lane_f32 (r, a, b, 4);
> +}
> +
> +/* Test lane index limits for vfmatq_lane_f32  */
> +float32x4_t
> +test_vfmatq_lane_f32_low (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
> +{
> +  /* { dg-error "lane -2 out of range 0 - 3" "" { target *-*-* } 0 } */
> +  return vbfmlaltq_lane_f32 (r, a, b, -2);
> +}
> +
> +float32x4_t
> +test_vfmatq_lane_f32_high (float32x4_t r, bfloat16x8_t a, 
> bfloat16x4_t b)
> +{
> +  /* { dg-error "lane 5 out of range 0 - 3" "" { target *-*-* } 0 } */
> +  return vbfmlaltq_lane_f32 (r, a, b, 5);
> +}
> diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c 
> b/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..0b74e19203bbdbf8668f6c214843870338d27655
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c
> @@ -0,0 +1,18 @@
> +/* { dg-do assemble } */
> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
> +/* { dg-add-options arm_v8_2a_bf16_neon } */
> +/* { dg-additional-options "-save-temps" } */
> +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
> +
> +#include <arm_neon.h>
> +
> +/*test_vfmmlaq_f32:
> +**        ...
> +**        vmmla.bf16    q0, q1, q2
> +**        bx    lr
> +*/
> +float32x4_t
> +test_vmmlaq_f32 (float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
> +{
> +  return vbfmmlaq_f32 (r, x, y);
> +}
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2020-03-05 17:49 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-12-20 18:46 [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla and vfma<b/t> for AArch32 AdvSIMD Delia Burduv
2020-01-22 17:45 ` Delia Burduv
2020-01-28 16:52   ` Delia Burduv
2020-01-30 15:55     ` Kyrill Tkachov
2020-01-31 16:21       ` Delia Burduv
2020-02-19 17:23         ` Delia Burduv
2020-02-21 11:41           ` Kyrill Tkachov
2020-03-04 17:21             ` Delia Burduv
2020-03-05 11:22               ` Kyrill Tkachov
2020-03-05 17:49                 ` Kyrill Tkachov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).