diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index a66961d0c513323844dd069b05cdfccc3e432cfc..1974967b171c28b95b21dc27837d7fe69f2d9f64 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -19426,6 +19426,59 @@ vcvtq_high_bf16_f32 (bfloat16x8_t inactive, float32x4_t __a) return __builtin_neon_vbfcvtv4sf_highv8bf (inactive, __a); } +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmmlaq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) +{ + return __builtin_neon_vmmlav8bf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmlalbq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) +{ + return __builtin_neon_vfmabv8bf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmlaltq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) +{ + return __builtin_neon_vfmatv8bf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmlalbq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b, + const int __index) +{ + return __builtin_neon_vfmab_lanev8bf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmlaltq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b, + const int __index) +{ + return __builtin_neon_vfmat_lanev8bf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmlalbq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b, + const int __index) +{ + return __builtin_neon_vfmab_laneqv8bf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) +vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b, + const int __index) +{ + return __builtin_neon_vfmat_laneqv8bf (__r, __a, __b, __index); +} + #pragma GCC pop_options #ifdef __cplusplus diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index 48c06c43a1744da7e143f6070ac945e8dd7225b6..38c8bb0b0ebe2c3cc59da629c7630c389ddd8317 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -391,3 +391,12 @@ VAR2 (UNOP, vbfcvt, v4bf, v8bf) VAR1 (UNOP, vbfcvt_high, v8bf) VAR2 (UNOP, vbfcvtv4sf, v4bf, v8bf) VAR1 (BINOP, vbfcvtv4sf_high, v8bf) + +VAR1 (TERNOP, vmmla, v8bf) + +VAR1 (TERNOP, vfmab, v8bf) +VAR1 (TERNOP, vfmat, v8bf) +VAR1 (MAC_LANE, vfmab_lane, v8bf) +VAR1 (MAC_LANE, vfmat_lane, v8bf) +VAR1 (MAC_LANE, vfmab_laneq, v8bf) +VAR1 (MAC_LANE, vfmat_laneq, v8bf) diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 5f4e3d1235813ab81c176505f9a98d702359f7ec..831400192280d892835055174d9daab22ab08c92 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -106,6 +106,9 @@ ;; Quad-width vector modes plus 64-bit elements. (define_mode_iterator VQX [V16QI V8HI V8HF V8BF V4SI V4SF V2DI]) +;; Quad-width vector modes plus 64-bit elements and V8BF. +(define_mode_iterator VQXBF [V16QI V8HI V8HF (V8BF "TARGET_BF16_SIMD") V4SI V4SF V2DI]) + ;; Quad-width vector modes without floating-point elements. 
(define_mode_iterator VQI [V16QI V8HI V4SI]) @@ -493,6 +496,8 @@ (define_int_iterator MATMUL [UNSPEC_MATMUL_S UNSPEC_MATMUL_U UNSPEC_MATMUL_US]) +(define_int_iterator BF_MA [UNSPEC_BFMAB UNSPEC_BFMAT]) + ;;---------------------------------------------------------------------------- ;; Mode attributes ;;---------------------------------------------------------------------------- @@ -1209,3 +1214,6 @@ ]) (define_int_attr smlaw_op [(UNSPEC_SMLAWB "smlawb") (UNSPEC_SMLAWT "smlawt")]) + +;; Mnemonic suffix ("b" or "t") for each BF_MA unspec. +(define_int_attr bt [(UNSPEC_BFMAB "b") (UNSPEC_BFMAT "t")]) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index f5286d9c4b1a309f6ebe864c86596aaceb05c05b..75cc31a0d144724e8e51cb7f05a27e71a77eed25 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -3924,7 +3924,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vget_high<mode>" [(match_operand:<V_HALF> 0 "s_register_operand") - (match_operand:VQX 1 "s_register_operand")] + (match_operand:VQXBF 1 "s_register_operand")] "TARGET_NEON" { emit_move_insn (operands[0], @@ -6737,3 +6737,73 @@ if (BYTES_BIG_ENDIAN) "TARGET_BF16_FP" "" ) + +;; BFloat16 matrix multiply-accumulate. Operand 1 is the V4SF accumulator, +;; tied to the destination register by its "0" constraint. +(define_insn "neon_vmmlav8bf" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (plus:V4SF (match_operand:V4SF 1 "register_operand" "0") + (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w") + (match_operand:V8BF 3 "register_operand" "w")] + UNSPEC_BFMMLA)))] + "TARGET_BF16_SIMD" + "vmmla.bf16\\t%q0, %q2, %q3" + [(set_attr "type" "neon_fp_mla_s_q")] +) + +;; BFloat16 multiply-accumulate into V4SF, one pattern per BF_MA unspec; +;; <bt> supplies the "b" or "t" mnemonic suffix. +(define_insn "neon_vfma<bt>v8bf" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (plus:V4SF (match_operand:V4SF 1 "register_operand" "0") + (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w") + (match_operand:V8BF 3 "register_operand" "w")] + BF_MA)))] + "TARGET_BF16_SIMD" + "vfma<bt>.bf16\\t%q0, %q2, %q3" + [(set_attr "type" "neon_fp_mla_s_q")] +) + +;; Lane form: operand 4 is a constant index into the V4BF vector in operand 3, +;; printed as %P3[%c4]. +(define_insn "neon_vfma<bt>_lanev8bf" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (plus:V4SF (match_operand:V4SF 1 "register_operand" 
"0") + (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w") + (match_operand:V4BF 3 "register_operand" "x") + (match_operand:SI 4 "const_int_operand" "n")] + BF_MA)))] + "TARGET_BF16_SIMD" + "vfma<bt>.bf16\\t%q0, %q2, %P3[%c4]" + [(set_attr "type" "neon_fp_mla_s_scalar_q")] +) + +;; Lane form with a V8BF operand 3. Lanes 4-7 are handled by extracting the +;; high half and using lane - 4. NOTE(review): for lanes 0-3 the V8BF operand +;; is passed unchanged to the V4BF lane pattern; confirm that is recognized. +(define_expand "neon_vfma<bt>_laneqv8bf" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (plus:V4SF (match_operand:V4SF 1 "register_operand" "0") + (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w") + (match_operand:V8BF 3 "register_operand" "x") + (match_operand:SI 4 "const_int_operand" "n")] + BF_MA)))] + "TARGET_BF16_SIMD" + { + int lane = INTVAL (operands[4]); + gcc_assert (IN_RANGE (lane, 0, 7)); + if (lane < 4) + { + emit_insn (gen_neon_vfma<bt>_lanev8bf (operands[0], operands[1], operands[2], operands[3], operands[4])); + } + else + { + rtx op_highpart = gen_reg_rtx (V4BFmode); + emit_insn (gen_neon_vget_highv8bf (op_highpart, operands[3])); + operands[4] = GEN_INT (lane - 4); + emit_insn (gen_neon_vfma<bt>_lanev8bf (operands[0], operands[1], operands[2], op_highpart, operands[4])); + } + DONE; + } + [(set_attr "type" "neon_fp_mla_s_scalar_q")] +) diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md index b36ae512a6ebcf231b46a24e127c62e22e71a34f..f0b1f465de4b63d624510783576700519044717d 100644 --- a/gcc/config/arm/unspecs.md +++ b/gcc/config/arm/unspecs.md @@ -508,4 +508,7 @@ UNSPEC_MATMUL_US UNSPEC_BFCVT UNSPEC_BFCVT_HIGH + UNSPEC_BFMMLA + UNSPEC_BFMAB + UNSPEC_BFMAT ]) diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c new file mode 100644 index 0000000000000000000000000000000000000000..d7a944923cc889bc5f8eaeaa6a4de7672bacb8c3 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_1.c @@ -0,0 +1,79 @@ +/* { dg-do assemble } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-save-temps" } */ +/* { dg-final { 
check-function-bodies "**" "" {-O[^0]} } } */ + +#include "arm_neon.h" + +/* +**test_vfmabq_f32: +** ... +** vfmab.bf16 q0, q1, q2 +** bx lr +*/ +float32x4_t +test_vfmabq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) +{ + return vbfmlalbq_f32 (r, a, b); +} + +/* +**test_vfmatq_f32: +** ... +** vfmat.bf16 q0, q1, q2 +** bx lr +*/ +float32x4_t +test_vfmatq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) +{ + return vbfmlaltq_f32 (r, a, b); +} + +/* +**test_vfmabq_lane_f32: +** ... +** vfmab.bf16 q0, q1, d4[0] +** bx lr +*/ +float32x4_t +test_vfmabq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) +{ + return vbfmlalbq_lane_f32 (r, a, b, 0); +} + +/* +**test_vfmatq_lane_f32: +** ... +** vfmat.bf16 q0, q1, d4[2] +** bx lr +*/ +float32x4_t +test_vfmatq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) +{ + return vbfmlaltq_lane_f32 (r, a, b, 2); +} + +/* +**test_vfmabq_laneq_f32: +** ... +** vfmab.bf16 q0, q1, d5[1] +** bx lr +*/ +float32x4_t +test_vfmabq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) +{ + return vbfmlalbq_laneq_f32 (r, a, b, 5); +} + +/* +**test_vfmatq_laneq_f32: +** ... 
+** vfmat.bf16 q0, q1, d5[3] +** bx lr +*/ +float32x4_t +test_vfmatq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) +{ + return vbfmlaltq_laneq_f32 (r, a, b, 7); +} diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c new file mode 100644 index 0000000000000000000000000000000000000000..5a7a2a71791968045b413fc6c1d7daade5cf30f0 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_ma_2.c @@ -0,0 +1,35 @@ +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +#include "arm_neon.h" + +/* Test lane index limits for vfmabq_lane_f32 */ +float32x4_t +test_vfmabq_lane_f32_low (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) +{ + /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */ + return vbfmlalbq_lane_f32 (r, a, b, -1); +} + +float32x4_t +test_vfmabq_lane_f32_high (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) +{ + /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */ + return vbfmlalbq_lane_f32 (r, a, b, 4); +} + +/* Test lane index limits for vfmatq_lane_f32 */ +float32x4_t +test_vfmatq_lane_f32_low (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) +{ + /* { dg-error "lane -2 out of range 0 - 3" "" { target *-*-* } 0 } */ + return vbfmlaltq_lane_f32 (r, a, b, -2); +} + +float32x4_t +test_vfmatq_lane_f32_high (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) +{ + /* { dg-error "lane 5 out of range 0 - 3" "" { target *-*-* } 0 } */ + return vbfmlaltq_lane_f32 (r, a, b, 5); +} diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c new file mode 100644 index 0000000000000000000000000000000000000000..0b74e19203bbdbf8668f6c214843870338d27655 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_mmla_1.c @@ -0,0 +1,20 @@ +/* { dg-do assemble } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { 
dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-save-temps" } */ +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ + +#include <arm_neon.h> + +/* Expected function body for the vbfmmlaq_f32 intrinsic. */ +/* +**test_vmmlaq_f32: +** ... +** vmmla.bf16 q0, q1, q2 +** bx lr +*/ +float32x4_t +test_vmmlaq_f32 (float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) +{ + return vbfmmlaq_f32 (r, x, y); +}