public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla and vfma<b/t> for AArch32 AdvSIMD
@ 2019-12-20 18:46 Delia Burduv
  2020-01-22 17:45 ` Delia Burduv
  0 siblings, 1 reply; 10+ messages in thread
From: Delia Burduv @ 2019-12-20 18:46 UTC (permalink / raw)
  To: gcc-patches; +Cc: nickc, Richard Earnshaw, Ramana Radhakrishnan, Kyrylo Tkachov

[-- Attachment #1: Type: text/plain, Size: 1904 bytes --]

This patch adds the ARMv8.6 ACLE intrinsics for vmmla, vfmab and vfmat 
as part of the BFloat16 extension.
(https://developer.arm.com/docs/101028/latest.)
The intrinsics are declared in arm_neon.h and the RTL patterns are 
defined in neon.md.
Two new tests are added to check assembler output and lane indices.

This patch depends on the Arm back-end patche. 
(https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)

Tested for regression on arm-none-eabi and armeb-none-eabi. I don't have 
commit rights, so if this is ok can someone please commit it for me?

gcc/ChangeLog:

2019-11-12  Delia Burduv  <delia.burduv@arm.com>

	* config/arm/arm_neon.h (vbfmmlaq_f32): New.
	  (vbfmlalbq_f32): New.
	  (vbfmlaltq_f32): New.
	  (vbfmlalbq_lane_f32): New.
	  (vbfmlaltq_lane_f32): New.
   	  (vbfmlalbq_laneq_f32): New.
	  (vbfmlaltq_laneq_f32): New.
	* config/arm/arm_neon_builtins.def (vbfmmla): New.
           (vbfmab): New.
           (vbfmat): New.
           (vbfmab_lane): New.
           (vbfmat_lane): New.
           (vbfmab_laneq): New.
           (vbfmat_laneq): New.
  	* config/arm/iterators.md (BF_MA): New int iterator.
           (bt): New int attribute.
           (VQXBF): Copy of VQX with V8BF.
           (V_HALF): Added V8BF.
   	* config/arm/neon.md (neon_vbfmmlav8hi): New insn.
           (neon_vbfma<bt>v8hi): New insn.
           (neon_vbfma<bt>_lanev8hi): New insn.
           (neon_vbfma<bt>_laneqv8hi): New expand.
           (neon_vget_high<mode>): Changed iterator to VQXBF.
	* config/arm/unspecs.md (UNSPEC_BFMMLA): New UNSPEC.
           (UNSPEC_BFMAB): New UNSPEC.
           (UNSPEC_BFMAT): New UNSPEC.

2019-11-12  Delia Burduv  <delia.burduv@arm.com>

         * gcc.target/arm/simd/bf16_ma_1.c: New test.
         * gcc.target/arm/simd/bf16_ma_2.c: New test.
         * gcc.target/arm/simd/bf16_mmla_1.c: New test.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: rb12263.patch --]
[-- Type: text/x-patch; name="rb12263.patch", Size: 12794 bytes --]

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 71e7568e4315a9354062dee5442ca4af9d9660a9..097d7bb30ad0109ca2f41885206b1cfb2ce962dc 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -91,6 +91,60 @@ typedef float float32_t;
 #ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
 typedef __simd128_bfloat16_t bfloat16x8_t;
 typedef __simd64_bfloat16_t bfloat16x4_t;
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmmlaq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfmmlav8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfmabv8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfmatv8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+		    const int __index)
+{
+  return __builtin_neon_vbfmab_lanev8bf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+		    const int __index)
+{
+  return __builtin_neon_vbfmat_lanev8bf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+		     const int __index)
+{
+  return __builtin_neon_vbfmab_laneqv8bf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+		     const int __index)
+{
+  return __builtin_neon_vbfmat_laneqv8bf (__r, __a, __b, __index);
+}
+
 #endif
 #pragma GCC pop_options
 #pragma GCC pop_options
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def
index bcccf93f7fa2750e9006e5856efecbec0fb331b9..169781fa9a07930eb755165019427be055dc36ef 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -373,3 +373,12 @@ VAR2 (MAC_LANE_PAIR, vcmlaq_lane0, v4sf, v8hf)
 VAR2 (MAC_LANE_PAIR, vcmlaq_lane90, v4sf, v8hf)
 VAR2 (MAC_LANE_PAIR, vcmlaq_lane180, v4sf, v8hf)
 VAR2 (MAC_LANE_PAIR, vcmlaq_lane270, v4sf, v8hf)
+
+VAR1 (TERNOP, vbfmmla, v8bf)
+
+VAR1 (TERNOP, vbfmab, v8bf)
+VAR1 (TERNOP, vbfmat, v8bf)
+VAR1 (MAC_LANE, vbfmab_lane, v8bf)
+VAR1 (MAC_LANE, vbfmat_lane, v8bf)
+VAR1 (MAC_LANE, vbfmab_laneq, v8bf)
+VAR1 (MAC_LANE, vbfmat_laneq, v8bf)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 439021fa0733ac31706287c4f98d62b080afc3a1..b31f54ffe8957d3dad0a7e3d3fedc48911e7b2c4 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -108,6 +108,9 @@
 ;; Quad-width vector modes plus 64-bit elements.
 (define_mode_iterator VQX [V16QI V8HI V8HF V4SI V4SF V2DI])
 
+;; Quad-width vector modes plus 64-bit elements and V8BF.
+(define_mode_iterator VQXBF [V16QI V8HI V8HF (V8BF "TARGET_BF16_SIMD") V4SI V4SF V2DI])
+
 ;; Quad-width vector modes without floating-point elements.
 (define_mode_iterator VQI [V16QI V8HI V4SI])
 
@@ -488,6 +491,8 @@
 (define_int_iterator VCADD [UNSPEC_VCADD90 UNSPEC_VCADD270])
 (define_int_iterator VCMLA [UNSPEC_VCMLA UNSPEC_VCMLA90 UNSPEC_VCMLA180 UNSPEC_VCMLA270])
 
+(define_int_iterator BF_MA [UNSPEC_BFMAB UNSPEC_BFMAT])
+
 ;;----------------------------------------------------------------------------
 ;; Mode attributes
 ;;----------------------------------------------------------------------------
@@ -612,7 +617,8 @@
 (define_mode_attr V_HALF [(V16QI "V8QI") (V8HI "V4HI")
 			  (V8HF "V4HF") (V4SI  "V2SI")
 			  (V4SF "V2SF") (V2DF "DF")
-			  (V2DI "DI") (V4HF "HF")])
+			  (V2DI "DI") (V4HF "HF")
+			  (V8BF "V4BF")])
 
 ;; Same, but lower-case.
 (define_mode_attr V_half [(V16QI "v8qi") (V8HI "v4hi")
@@ -1174,4 +1180,7 @@
 (define_int_attr opsuffix [(UNSPEC_DOT_S "s8")
 			   (UNSPEC_DOT_U "u8")])
 
+;; An iterator for VFMA<bt>
+(define_int_attr bt [(UNSPEC_BFMAB "b") (UNSPEC_BFMAT "t")])
+
 (define_int_attr smlaw_op [(UNSPEC_SMLAWB "smlawb") (UNSPEC_SMLAWT "smlawt")])
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index b724aab65f720bf0e48bb828f0874426effd235c..42763de178a96422f9df7f4500e4328adfa81d27 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -3879,7 +3879,7 @@ if (BYTES_BIG_ENDIAN)
 
 (define_expand "neon_vget_high<mode>"
   [(match_operand:<V_HALF> 0 "s_register_operand")
-   (match_operand:VQX 1 "s_register_operand")]
+   (match_operand:VQXBF 1 "s_register_operand")]
   "TARGET_NEON"
 {
   emit_move_insn (operands[0],
@@ -6556,3 +6556,62 @@ if (BYTES_BIG_ENDIAN)
  "vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
  [(set_attr "type" "neon_fp_abd_s<q>")]
 )
+
+(define_insn "neon_vbfmmlav8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+        (plus:V4SF (match_operand:V4SF 1 "register_operand" "0")
+                   (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+                                 (match_operand:V8BF 3 "register_operand" "w")]
+                    UNSPEC_BFMMLA)))]
+  "TARGET_BF16_SIMD"
+  "vmmla.bf16\\t%q0, %q2, %q3"
+  [(set_attr "type" "neon_mla_s_q")]
+)
+
+(define_insn "neon_vbfma<bt>v8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+        (plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
+                    (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+                                  (match_operand:V8BF 3 "register_operand" "w")]
+                     BF_MA)))]
+  "TARGET_BF16_SIMD"
+  "vfma<bt>.bf16\\t%q0, %q2, %q3"
+  [(set_attr "type" "neon_fp_mla_s")]
+)
+
+(define_insn "neon_vbfma<bt>_lanev8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+        (plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
+                    (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+                                  (match_operand:V4BF 3 "register_operand" "x")
+                                  (match_operand:SI 4 "const_int_operand" "n")]
+                     BF_MA)))]
+  "TARGET_BF16_SIMD"
+  "vfma<bt>.bf16\\t%q0, %q2, %P3[%c4]"
+  [(set_attr "type" "neon_fp_mla_s")]
+)
+
+(define_expand "neon_vbfma<bt>_laneqv8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+        (plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
+                    (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+                                  (match_operand:V8BF 3 "register_operand" "x")
+                                  (match_operand:SI 4 "const_int_operand" "n")]
+                     BF_MA)))]
+  "TARGET_BF16_SIMD"
+  {
+    int lane = INTVAL (operands[4]);
+    gcc_assert (lane >=0 && lane <=7);
+    if (lane < 4)
+	emit_insn (gen_neon_vbfma<bt>_lanev8bf (operands[0], operands[1], operands[2], operands[3], operands[4]));
+    else
+      {
+	rtx op_highpart = gen_reg_rtx (V4BFmode);
+	emit_insn (gen_neon_vget_highv8bf (op_highpart, operands[3]));
+	operands[4] = GEN_INT (lane - 4);
+	emit_insn (gen_neon_vbfma<bt>_lanev8bf (operands[0], operands[1], operands[2], op_highpart, operands[4]));
+      }
+    DONE;
+  }
+  [(set_attr "type" "neon_fp_mla_s")]
+)
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index b4196b0e5cd939c3ee5e3f9bd19622fcc963adae..f452082b4bdb3a22a8e3b62113bb7f9470279e93 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -493,4 +493,7 @@
   UNSPEC_VCMLA90
   UNSPEC_VCMLA180
   UNSPEC_VCMLA270
+  UNSPEC_BFMMLA
+  UNSPEC_BFMAB
+  UNSPEC_BFMAT
 ])
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..ead3e9d569f45f5507985e5d7cb12e0541349dd1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c
@@ -0,0 +1,84 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps" }  */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "arm_neon.h"
+
+/*
+**test_vbfmlalbq_f32:
+**      ...
+**      vfmab.bf16\tq[0-9]+, q[0-9]+, q[0-9]+
+**      ...
+*/
+float32x4_t
+test_vbfmlalbq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlalbq_f32 (r, a, b);
+}
+
+/*
+**test_vbfmlaltq_f32:
+**      ...
+**      vfmat.bf16\tq[0-9]+, q[0-9]+, q[0-9]+
+**      ...
+*/
+float32x4_t
+test_vbfmlaltq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlaltq_f32 (r, a, b);
+}
+
+/*
+**test_vbfmlalbq_lane_f32:
+**      ...
+**      vfmab.bf16\tq[0-9]+, q[0-9]+, d[0-9]+\[0\]
+**      ...
+*/
+float32x4_t
+test_vbfmlalbq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return vbfmlalbq_lane_f32 (r, a, b, 0);
+}
+
+/*
+**test_vbfmlaltq_lane_f32:
+**      ...
+**      vfmat.bf16\tq[0-9]+, q[0-9]+, d[0-9]+\[2\]
+**      ...
+*/
+float32x4_t
+test_vbfmlaltq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return vbfmlaltq_lane_f32 (r, a, b, 2);
+}
+
+/*
+**test_vbfmlalbq_laneq_f32:
+**      ...
+**      vfmab.bf16\tq[0-9]+, q[0-9]+, d[0-9]+\[1\]
+**      ...
+*/
+float32x4_t
+test_vbfmlalbq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlalbq_laneq_f32 (r, a, b, 5);
+}
+
+/*
+**test_vbfmlaltq_laneq_f32:
+**      ...
+**      vfmat.bf16\tq[0-9]+, q[0-9]+, d[0-9]+\[3\]
+**      ...
+*/
+float32x4_t
+test_vbfmlaltq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlaltq_laneq_f32 (r, a, b, 7);
+}
+
+int main()
+{
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..226ed7e1d8e4747d73b0518c809aaf0e3c5bc78d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c
@@ -0,0 +1,31 @@
+/* { dg-do compile { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include "arm_neon.h"
+
+/* Test lane index limits for vbfmlalbq_lane_f32  */
+float32x4_t
+test_vbfmlalbq_lane_f32_low (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return __builtin_neon_vbfmab_lanev8bf (r, a, b, -1); /* { dg-error {lane -1 out of range 0 - 3} } */
+}
+
+float32x4_t
+test_vbfmlalbq_lane_f32_high (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return __builtin_neon_vbfmab_lanev8bf (r, a, b, 4); /* { dg-error {lane 4 out of range 0 - 3} } */
+}
+
+/* Test lane index limits for vbfmlaltq_lane_f32  */
+float32x4_t
+test_vbfmlaltq_lane_f32_low (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return __builtin_neon_vbfmat_lanev8bf (r, a, b, -1); /* { dg-error {lane -1 out of range 0 - 3} } */
+}
+
+float32x4_t
+test_vbfmlaltq_lane_f32_high (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return __builtin_neon_vbfmat_lanev8bf (r, a, b, 4); /* { dg-error {lane 4 out of range 0 - 3} } */
+}
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..0c7422b78c385850eaa53492af0da8826e8b3b4a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c
@@ -0,0 +1,24 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "arm_neon.h"
+
+/*
+**test_vbfmmlaq_f32:
+**	...
+**	vmmla.bf16\tq[0-9]+, q[0-9]+, q[0-9]+
+**	...
+*/
+float32x4_t
+test_vbfmmlaq_f32 (float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+  return vbfmmlaq_f32 (r, x, y);
+}
+
+int main()
+{
+  return 0;
+}

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2020-03-05 17:49 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-12-20 18:46 [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla and vfma<b/t> for AArch32 AdvSIMD Delia Burduv
2020-01-22 17:45 ` Delia Burduv
2020-01-28 16:52   ` Delia Burduv
2020-01-30 15:55     ` Kyrill Tkachov
2020-01-31 16:21       ` Delia Burduv
2020-02-19 17:23         ` Delia Burduv
2020-02-21 11:41           ` Kyrill Tkachov
2020-03-04 17:21             ` Delia Burduv
2020-03-05 11:22               ` Kyrill Tkachov
2020-03-05 17:49                 ` Kyrill Tkachov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).