* ACLE intrinsics: BFloat16 load intrinsics for AArch32 @ 2019-12-20 19:04 Delia Burduv 2020-01-22 18:20 ` Delia Burduv 0 siblings, 1 reply; 9+ messages in thread From: Delia Burduv @ 2019-12-20 19:04 UTC (permalink / raw) To: gcc-patches; +Cc: nickc, Richard Earnshaw, Kyrylo Tkachov, Ramana Radhakrishnan [-- Attachment #1: Type: text/plain, Size: 2781 bytes --] This patch adds the ARMv8.6 ACLE BFloat16 load intrinsics vld<n>{q}_bf16 as part of the BFloat16 extension. (https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics) The intrinsics are declared in arm_neon.h. A new test is added to check assembler output. This patch depends on the Arm back-end patch. (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html) Tested for regression on arm-none-eabi and armeb-none-eabi. I don't have commit rights, so if this is ok can someone please commit it for me? gcc/ChangeLog: 2019-11-14 Delia Burduv <delia.burduv@arm.com> * config/arm/arm_neon.h (bfloat16_t): New typedef. (bfloat16x4x2_t): New typedef. (bfloat16x8x2_t): New typedef. (bfloat16x4x3_t): New typedef. (bfloat16x8x3_t): New typedef. (bfloat16x4x4_t): New typedef. (bfloat16x8x4_t): New typedef. (vld2_bf16): New. (vld2q_bf16): New. (vld3_bf16): New. (vld3q_bf16): New. (vld4_bf16): New. (vld4q_bf16): New. (vld2_dup_bf16): New. (vld2q_dup_bf16): New. (vld3_dup_bf16): New. (vld3q_dup_bf16): New. (vld4_dup_bf16): New. (vld4q_dup_bf16): New. * config/arm/arm-builtins.c (E_V2BFmode): New mode. (VAR13): New. (arm_simd_types[Bfloat16x2_t]): New type. * config/arm/arm-modes.def (V2BF): New mode. * config/arm/arm-simd-builtin-types.def (Bfloat16x2_t): New entry. 
* config/arm/arm_neon_builtins.def (vld2): Changed to VAR13 and added v4bf, v8bf. (vld2_dup): Changed to VAR8 and added v4bf, v8bf. (vld3): Changed to VAR13 and added v4bf, v8bf. (vld3_dup): Changed to VAR8 and added v4bf, v8bf. (vld4): Changed to VAR13 and added v4bf, v8bf. (vld4_dup): Changed to VAR8 and added v4bf, v8bf. * config/arm/iterators.md (VDXBF): New iterator. (VQ2BF): New iterator. (V_elem): Added V4BF, V8BF. (V_sz_elem): Added V4BF, V8BF. (V_mode_nunits): Added V4BF, V8BF. (q): Added V4BF, V8BF. * config/arm/neon.md (vld2): Used new iterators. (vld2_dup<mode>): Used new iterators. (vld2_dupv8bf): New. (vld3): Used new iterators. (vld3qa): Used new iterators. (vld3qb): Used new iterators. (vld3_dup<mode>): Used new iterators. (vld3_dupv8bf): New. (vld4): Used new iterators. (vld4qa): Used new iterators. (vld4qb): Used new iterators. (vld4_dup<mode>): Used new iterators. (vld4_dupv8bf): New. gcc/testsuite/ChangeLog: 2019-11-14 Delia Burduv <delia.burduv@arm.com> * gcc.target/arm/simd/bf16_vldn_1.c: New test. 
[-- Warning: decoded text below may be mangled, UTF-8 assumed --] [-- Attachment #2: rb12473.patch --] [-- Type: text/x-patch; name="rb12473.patch", Size: 24503 bytes --] diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c index df09a6bb1fce5f9216337d71cba51a890fd57baf..551d76a44fadc58a35a6155486ec1fb16c959da0 100644 --- a/gcc/config/arm/arm-builtins.c +++ b/gcc/config/arm/arm-builtins.c @@ -318,6 +318,7 @@ arm_set_sat_qualifiers[SIMD_MAX_BUILTIN_ARGS] #define v4bf_UP E_V4BFmode #define v2si_UP E_V2SImode #define v2sf_UP E_V2SFmode +#define v2bf_UP E_V2BFmode #define di_UP E_DImode #define v16qi_UP E_V16QImode #define v8hi_UP E_V8HImode @@ -381,6 +382,9 @@ typedef struct { #define VAR12(T, N, A, B, C, D, E, F, G, H, I, J, K, L) \ VAR11 (T, N, A, B, C, D, E, F, G, H, I, J, K) \ VAR1 (T, N, L) +#define VAR13(T, N, A, B, C, D, E, F, G, H, I, J, K, L, M) \ + VAR12 (T, N, A, B, C, D, E, F, G, H, I, J, K, L) \ + VAR1 (T, N, M) /* The builtin data can be found in arm_neon_builtins.def, arm_vfp_builtins.def and arm_acle_builtins.def. The entries in arm_neon_builtins.def require @@ -1013,6 +1017,7 @@ arm_init_simd_builtin_types (void) arm_simd_types[Float32x4_t].eltype = float_type_node; /* Init Bfloat vector types with underlying __bf16 scalar type. */ + arm_simd_types[Bfloat16x2_t].eltype = arm_bf16_type_node; arm_simd_types[Bfloat16x4_t].eltype = arm_bf16_type_node; arm_simd_types[Bfloat16x8_t].eltype = arm_bf16_type_node; diff --git a/gcc/config/arm/arm-modes.def b/gcc/config/arm/arm-modes.def index 80c3c1a6eb258d116b07ad71fafafc9befb76e8b..9533d177059d98fa2a9e9d1d6321f3d92dad7592 100644 --- a/gcc/config/arm/arm-modes.def +++ b/gcc/config/arm/arm-modes.def @@ -80,6 +80,7 @@ VECTOR_MODE (FLOAT, HF, 2); /* V2HF */ FLOAT_MODE (BF, 2, 0); ADJUST_FLOAT_FORMAT (BF, &arm_bfloat_half_format); +VECTOR_MODE (FLOAT, BF, 2); /* V2BF. */ VECTOR_MODE (FLOAT, BF, 4); /* V4BF. */ VECTOR_MODE (FLOAT, BF, 8); /* V8BF. 
*/ diff --git a/gcc/config/arm/arm-simd-builtin-types.def b/gcc/config/arm/arm-simd-builtin-types.def index ee240f85c5618417fff039ec43b81641b187c126..f52f679156d5041ab109909393dc37fda33a390d 100644 --- a/gcc/config/arm/arm-simd-builtin-types.def +++ b/gcc/config/arm/arm-simd-builtin-types.def @@ -48,5 +48,6 @@ ENTRY (Float16x8_t, V8HF, none, 128, float16, 19) ENTRY (Float32x4_t, V4SF, none, 128, float32, 19) + ENTRY (Bfloat16x2_t, V2BF, none, 32, bfloat16, 20) ENTRY (Bfloat16x4_t, V4BF, none, 64, bfloat16, 20) ENTRY (Bfloat16x8_t, V8BF, none, 128, bfloat16, 20) diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index 71e7568e4315a9354062dee5442ca4af9d9660a9..c47f3cdd2d51066067d2ef341cc12a6db4b6f785 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -91,6 +91,145 @@ typedef float float32_t; #ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC typedef __simd128_bfloat16_t bfloat16x8_t; typedef __simd64_bfloat16_t bfloat16x4_t; + +typedef struct bfloat16x4x2_t +{ + bfloat16x4_t val[2]; +} bfloat16x4x2_t; + +typedef struct bfloat16x8x2_t +{ + bfloat16x8_t val[2]; +} bfloat16x8x2_t; + +typedef struct bfloat16x4x3_t +{ + bfloat16x4_t val[3]; +} bfloat16x4x3_t; + +typedef struct bfloat16x8x3_t +{ + bfloat16x8_t val[3]; +} bfloat16x8x3_t; + +typedef struct bfloat16x4x4_t +{ + bfloat16x4_t val[4]; +} bfloat16x4x4_t; + +typedef struct bfloat16x8x4_t +{ + bfloat16x8_t val[4]; +} bfloat16x8x4_t; + +__extension__ extern __inline bfloat16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_bf16 (bfloat16_t const * __ptr) +{ + union { bfloat16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2v4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = 
__builtin_neon_vld2v8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3v4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3v8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4v4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4v8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2_dupv4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2_dupv8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + 
+__extension__ extern __inline bfloat16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_dupv4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3_dupv8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4_dupv4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4_dupv8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + #endif #pragma GCC pop_options #pragma GCC pop_options diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index bcccf93f7fa2750e9006e5856efecbec0fb331b9..b9b56fc3d8b767eac0734d75e3fc5b61188ddca7 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -320,29 +320,29 @@ VAR12 (STORE1, vst1, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v2di) VAR12 (STORE1LANE, vst1_lane, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v2di) -VAR11 (LOAD1, vld2, - v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) +VAR13 (LOAD1, vld2, + v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, 
v4si, v4sf, v4bf, v8bf) VAR9 (LOAD1LANE, vld2_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR6 (LOAD1, vld2_dup, v8qi, v4hi, v4hf, v2si, v2sf, di) +VAR8 (LOAD1, vld2_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf) VAR11 (STORE1, vst2, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) VAR9 (STORE1LANE, vst2_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR11 (LOAD1, vld3, - v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) +VAR13 (LOAD1, vld3, + v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v4bf, v8bf) VAR9 (LOAD1LANE, vld3_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR6 (LOAD1, vld3_dup, v8qi, v4hi, v4hf, v2si, v2sf, di) +VAR8 (LOAD1, vld3_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf) VAR11 (STORE1, vst3, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) VAR9 (STORE1LANE, vst3_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR11 (LOAD1, vld4, - v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) +VAR13 (LOAD1, vld4, + v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v4bf, v8bf) VAR9 (LOAD1LANE, vld4_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR6 (LOAD1, vld4_dup, v8qi, v4hi, v4hf, v2si, v2sf, di) +VAR8 (LOAD1, vld4_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf) VAR11 (STORE1, vst4, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) VAR9 (STORE1LANE, vst4_lane, diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 439021fa0733ac31706287c4f98d62b080afc3a1..f8b98bd57af223cacba05907d25e3d4b9d58eb8a 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -86,6 +86,12 @@ ;; Double-width vector modes plus 64-bit elements. (define_mode_iterator VDX [V8QI V4HI V4HF V2SI V2SF DI]) +;; Double-width vector modes plus 64-bit elements and V4BF. 
+(define_mode_iterator VDXBF [V8QI V4HI V4HF V2SI V2SF DI (V4BF "TARGET_BF16_SIMD")]) + +;; Double-width vector modes plus 64-bit elements, V4BF and V8BF. +(define_mode_iterator VDXBF2 [V8QI V4HI V4HF V2SI V2SF DI (V4BF "TARGET_BF16_SIMD") (V8BF ("TARGET_BF16_SIMD"))]) + ;; Double-width vector modes plus 64-bit elements, ;; with V4BFmode added, suitable for moves. (define_mode_iterator VDXMOV [V8QI V4HI V4HF V4BF V2SI V2SF DI]) @@ -102,6 +108,9 @@ ;; Quad-width vector modes, including V8HF. (define_mode_iterator VQ2 [V16QI V8HI V8HF V4SI V4SF]) +;; Quad-width vector modes, including V8HF and V8BF. +(define_mode_iterator VQ2BF [V16QI V8HI V8HF V4SI V4SF (V8BF "TARGET_BF16_SIMD")]) + ;; Quad-width vector modes with 16- or 32-bit elements (define_mode_iterator VQ_HS [V8HI V8HF V4SI V4SF]) @@ -546,6 +555,7 @@ (define_mode_attr V_two_elem [(V8QI "HI") (V16QI "HI") (V4HI "SI") (V8HI "SI") (V4HF "SF") (V8HF "SF") + (V4BF "V2BF") (V8BF "V2BF") (V2SI "V2SI") (V4SI "V2SI") (V2SF "V2SF") (V4SF "V2SF") (DI "V2DI") (V2DI "V2DI")]) @@ -566,6 +576,7 @@ (define_mode_attr V_three_elem [(V8QI "BLK") (V16QI "BLK") (V4HI "BLK") (V8HI "BLK") (V4HF "BLK") (V8HF "BLK") + (V4BF "BLK") (V8BF "BLK") (V2SI "BLK") (V4SI "BLK") (V2SF "BLK") (V4SF "BLK") (DI "EI") (V2DI "EI")]) @@ -574,6 +585,7 @@ (define_mode_attr V_four_elem [(V8QI "SI") (V16QI "SI") (V4HI "V4HI") (V8HI "V4HI") (V4HF "V4HF") (V8HF "V4HF") + (V4BF "V4BF") (V8BF "V4BF") (V2SI "V4SI") (V4SI "V4SI") (V2SF "V4SF") (V4SF "V4SF") (DI "OI") (V2DI "OI")]) @@ -697,6 +709,7 @@ (V2SI "32") (V4SI "32") (DI "64") (V2DI "64") (V4HF "16") (V8HF "16") + (V4BF "16") (V8BF "16") (V2SF "32") (V4SF "32")]) (define_mode_attr V_elem_ch [(V8QI "b") (V16QI "b") @@ -772,6 +785,7 @@ (define_mode_attr V_mode_nunits [(V8QI "8") (V16QI "16") (V4HF "4") (V8HF "8") (V4HI "4") (V8HI "8") + (V4BF "4") (V8BF "8") (V2SI "2") (V4SI "4") (V2SF "2") (V4SF "4") (DI "1") (V2DI "2") diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 
b724aab65f720bf0e48bb828f0874426effd235c..4109e7f84838e48eebd95290eeeefc9d3e48ec7d 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -5383,7 +5383,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld2<mode>" [(set (match_operand:TI 0 "s_register_operand" "=w") (unspec:TI [(match_operand:TI 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD2))] "TARGET_NEON" { @@ -5408,7 +5408,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld2<mode>" [(set (match_operand:OI 0 "s_register_operand" "=w") (unspec:OI [(match_operand:OI 1 "neon_struct_operand" "Um") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD2))] "TARGET_NEON" "vld2.<V_sz_elem>\t%h0, %A1" @@ -5471,7 +5471,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld2_dup<mode>" [(set (match_operand:TI 0 "s_register_operand" "=w") (unspec:TI [(match_operand:<V_two_elem> 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD2_DUP))] "TARGET_NEON" { @@ -5486,6 +5486,27 @@ if (BYTES_BIG_ENDIAN) (const_string "neon_load1_1reg<q>")))] ) +(define_insn "neon_vld2_dupv8bf" + [(set (match_operand:OI 0 "s_register_operand" "=w") + (unspec:OI [(match_operand:V2BF 1 "neon_struct_operand" "Um") + (unspec:V8BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD2_DUP))] + "TARGET_BF16_SIMD" + { + rtx ops[5]; + int tabbase = REGNO (operands[0]); + + ops[4] = operands[1]; + ops[0] = gen_rtx_REG (V4BFmode, tabbase); + ops[1] = gen_rtx_REG (V4BFmode, tabbase + 2); + ops[2] = gen_rtx_REG (V4BFmode, tabbase + 4); + ops[3] = gen_rtx_REG (V4BFmode, tabbase + 6); + output_asm_insn ("vld2.16\t{%P0, %P1, %P2, %P3}, %A4", ops); + return ""; + } + [(set_attr "type" "neon_load2_all_lanes_q")] +) + (define_expand "vec_store_lanesti<mode>" [(set (match_operand:TI 0 "neon_struct_operand") 
(unspec:TI [(match_operand:TI 1 "s_register_operand") @@ -5592,7 +5613,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld3<mode>" [(set (match_operand:EI 0 "s_register_operand" "=w") (unspec:EI [(match_operand:EI 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3))] "TARGET_NEON" { @@ -5620,7 +5641,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vld3<mode>" [(match_operand:CI 0 "s_register_operand") (match_operand:CI 1 "neon_struct_operand") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_NEON" { rtx mem; @@ -5635,7 +5656,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld3qa<mode>" [(set (match_operand:CI 0 "s_register_operand" "=w") (unspec:CI [(match_operand:EI 1 "neon_struct_operand" "Um") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3A))] "TARGET_NEON" { @@ -5655,7 +5676,7 @@ if (BYTES_BIG_ENDIAN) [(set (match_operand:CI 0 "s_register_operand" "=w") (unspec:CI [(match_operand:EI 1 "neon_struct_operand" "Um") (match_operand:CI 2 "s_register_operand" "0") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3B))] "TARGET_NEON" { @@ -5732,7 +5753,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld3_dup<mode>" [(set (match_operand:EI 0 "s_register_operand" "=w") (unspec:EI [(match_operand:<V_three_elem> 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3_DUP))] "TARGET_NEON" { @@ -5755,6 +5776,26 @@ if (BYTES_BIG_ENDIAN) (const_string "neon_load3_all_lanes<q>") (const_string "neon_load1_1reg<q>")))]) +(define_insn "neon_vld3_dupv8bf" + [(set (match_operand:CI 0 "s_register_operand" "=w") + (unspec:CI [(match_operand:V2BF 1 "neon_struct_operand" "Um") + (unspec:V8BF [(const_int 0)] 
UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD2_DUP))] + "TARGET_BF16_SIMD" + { + rtx ops[4]; + int tabbase = REGNO (operands[0]); + + ops[3] = operands[1]; + ops[0] = gen_rtx_REG (V4BFmode, tabbase); + ops[1] = gen_rtx_REG (V4BFmode, tabbase + 2); + ops[2] = gen_rtx_REG (V4BFmode, tabbase + 4); + output_asm_insn ("vld3.16\t{%P0[], %P1[], %P2[]}, %A3", ops); + return ""; + } + [(set_attr "type" "neon_load3_all_lanes_q")] +) + (define_expand "vec_store_lanesei<mode>" [(set (match_operand:EI 0 "neon_struct_operand") (unspec:EI [(match_operand:EI 1 "s_register_operand") @@ -5910,7 +5951,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld4<mode>" [(set (match_operand:OI 0 "s_register_operand" "=w") (unspec:OI [(match_operand:OI 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4))] "TARGET_NEON" { @@ -5938,7 +5979,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vld4<mode>" [(match_operand:XI 0 "s_register_operand") (match_operand:XI 1 "neon_struct_operand") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_NEON" { rtx mem; @@ -5953,7 +5994,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld4qa<mode>" [(set (match_operand:XI 0 "s_register_operand" "=w") (unspec:XI [(match_operand:OI 1 "neon_struct_operand" "Um") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4A))] "TARGET_NEON" { @@ -5974,7 +6015,7 @@ if (BYTES_BIG_ENDIAN) [(set (match_operand:XI 0 "s_register_operand" "=w") (unspec:XI [(match_operand:OI 1 "neon_struct_operand" "Um") (match_operand:XI 2 "s_register_operand" "0") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4B))] "TARGET_NEON" { @@ -6054,7 +6095,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld4_dup<mode>" [(set (match_operand:OI 0 "s_register_operand" "=w") (unspec:OI 
[(match_operand:<V_four_elem> 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4_DUP))] "TARGET_NEON" { @@ -6080,6 +6121,27 @@ if (BYTES_BIG_ENDIAN) (const_string "neon_load1_1reg<q>")))] ) +(define_insn "neon_vld4_dupv8bf" + [(set (match_operand:XI 0 "s_register_operand" "=w") + (unspec:XI [(match_operand:V2BF 1 "neon_struct_operand" "Um") + (unspec:V8BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD2_DUP))] + "TARGET_BF16_SIMD" + { + rtx ops[5]; + int tabbase = REGNO (operands[0]); + + ops[4] = operands[1]; + ops[0] = gen_rtx_REG (V4BFmode, tabbase); + ops[1] = gen_rtx_REG (V4BFmode, tabbase + 2); + ops[2] = gen_rtx_REG (V4BFmode, tabbase + 4); + ops[3] = gen_rtx_REG (V4BFmode, tabbase + 6); + output_asm_insn ("vld4.16\t{%P0[], %P1[], %P2[], %P3[]}, %A4", ops); + return ""; + } + [(set_attr "type" "neon_load4_all_lanes_q")] +) + (define_expand "vec_store_lanesoi<mode>" [(set (match_operand:OI 0 "neon_struct_operand") (unspec:OI [(match_operand:OI 1 "s_register_operand") diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c new file mode 100644 index 0000000000000000000000000000000000000000..8db8dfbe28d2136bd2d943e2aae80e32cea34133 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c @@ -0,0 +1,152 @@ +/* { dg-do assemble } */ +/* { dg-options "-save-temps" } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_neon.h" + + +/* +**test_vld2_bf16: +** ... +** vld2.16\t{d[0-9]+-d[0-9]+}, \[r[0-9]+\] +** ... +*/ +bfloat16x4x2_t +test_vld2_bf16 (bfloat16_t * ptr) +{ + vld2_bf16 (ptr); +} + +/* +**test_vld2q_bf16: +** ... +** vld2.16\t{d[0-9]+-d[0-9]+}, \[r[0-9]+\] +** ... 
+*/ +bfloat16x8x2_t +test_vld2q_bf16 (bfloat16_t * ptr) +{ + vld2q_bf16 (ptr); +} + +/* +**test_vld2_dup_bf16: +** ... +** vld2.16\t{d[0-9]+\[\], d[0-9]+\[\]}, \[r[0-9]+\] +** ... +*/ +bfloat16x4x2_t +test_vld2_dup_bf16 (bfloat16_t * ptr) +{ + vld2_dup_bf16 (ptr); +} + +/* +**test_vld2q_dup_bf16: +** ... +** vld2.16\t{d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}, \[r3\] +** ... +*/ +bfloat16x8x2_t +test_vld2q_dup_bf16 (bfloat16_t * ptr) +{ + vld2q_dup_bf16 (ptr); +} + +/* +**test_vld3_bf16: +** ... +** vld3.16\t{d[0-9]+-d[0-9]+}, \[r[0-9]+\] +** ... +*/ +bfloat16x4x3_t +test_vld3_bf16 (bfloat16_t * ptr) +{ + vld3_bf16 (ptr); +} + +/* +**test_vld3q_bf16: +** ... +** vld3.16\t{d[0-9]+, d[0-9]+, d[0-9]+}, \[r[0-9]+\] +** ... +*/ +bfloat16x8x3_t +test_vld3q_bf16 (bfloat16_t * ptr) +{ + vld3q_bf16 (ptr); +} + +/* +**test_vld3_dup_bf16: +** ... +** vld3.16\t{d[0-9]+\[\], d[0-9]+\[\], d[0-9]+\[\]}, \[r[0-9]+\] +** ... +*/ +bfloat16x4x3_t +test_vld3_dup_bf16 (bfloat16_t * ptr) +{ + vld3_dup_bf16 (ptr); +} + +/* +**test_vld3q_dup_bf16: +** ... +** vld3.16\t{d[0-9]+\[\], d[0-9]+\[\], d[0-9]+\[\]}, \[r[0-9]+\] +** ... +*/ +bfloat16x8x3_t +test_vld3q_dup_bf16 (bfloat16_t * ptr) +{ + vld3q_dup_bf16 (ptr); +} + +/* +**test_vld4_bf16: +** ... +** vld4.16\t{d[0-9]+-d[0-9]+}, \[r[0-9]+\] +** ... +*/ +bfloat16x4x4_t +test_vld4_bf16 (bfloat16_t * ptr) +{ + vld4_bf16 (ptr); +} + +/* +**test_vld4q_bf16: +** ... +** vld4.16\t{d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}, \[r[0-9]+\] +** ... +*/ +bfloat16x8x4_t +test_vld4q_bf16 (bfloat16_t * ptr) +{ + vld4q_bf16 (ptr); +} + +/* +**test_vld4_dup_bf16: +** ... +** vld4.16\t{d[0-9]+\[\], d[0-9]+\[\], d[0-9]+\[\], d[0-9]+\[\]}, \[r[0-9]+\] +** ... +*/ +bfloat16x4x4_t +test_vld4_dup_bf16 (bfloat16_t * ptr) +{ + vld4_dup_bf16 (ptr); +} + +/* +**test_vld4q_dup_bf16: +** ... +** vld4.16\t{d[0-9]+\[\], d[0-9]+\[\], d[0-9]+\[\], d[0-9]+\[\]}, \[r[0-9]+\] +** ... 
+*/ +bfloat16x4x4_t +test_vld4q_dup_bf16 (bfloat16_t * ptr) +{ + vld4q_dup_bf16 (ptr); +} ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: ACLE intrinsics: BFloat16 load intrinsics for AArch32 2019-12-20 19:04 ACLE intrinsics: BFloat16 load intrinsics for AArch32 Delia Burduv @ 2020-01-22 18:20 ` Delia Burduv 2020-01-28 17:18 ` Delia Burduv 2020-02-19 17:26 ` Delia Burduv 0 siblings, 2 replies; 9+ messages in thread From: Delia Burduv @ 2020-01-22 18:20 UTC (permalink / raw) To: gcc-patches; +Cc: nickc, Richard Earnshaw, Kyrylo Tkachov, Ramana Radhakrishnan Ping. I will change the tests to use the exact input and output registers as Richard Sandiford suggested for the AArch64 patches. On 12/20/19 6:48 PM, Delia Burduv wrote: > This patch adds the ARMv8.6 ACLE BFloat16 load intrinsics vld<n>{q}_bf16 > as part of the BFloat16 extension. > (https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics) > > The intrinsics are declared in arm_neon.h. > A new test is added to check assembler output. > > This patch depends on the Arm back-end patch. > (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html) > > Tested for regression on arm-none-eabi and armeb-none-eabi. I don't have > commit rights, so if this is ok can someone please commit it for me? > > gcc/ChangeLog: > > 2019-11-14 Delia Burduv <delia.burduv@arm.com> > > * config/arm/arm_neon.h (bfloat16_t): New typedef. > (bfloat16x4x2_t): New typedef. > (bfloat16x8x2_t): New typedef. > (bfloat16x4x3_t): New typedef. > (bfloat16x8x3_t): New typedef. > (bfloat16x4x4_t): New typedef. > (bfloat16x8x4_t): New typedef. > (vld2_bf16): New. > (vld2q_bf16): New. > (vld3_bf16): New. > (vld3q_bf16): New. > (vld4_bf16): New. > (vld4q_bf16): New. > (vld2_dup_bf16): New. > (vld2q_dup_bf16): New. > (vld3_dup_bf16): New. > (vld3q_dup_bf16): New. > (vld4_dup_bf16): New. > (vld4q_dup_bf16): New. > * config/arm/arm-builtins.c (E_V2BFmode): New mode. > (VAR13): New. > (arm_simd_types[Bfloat16x2_t]): New type. > * config/arm/arm-modes.def (V2BF): New mode. > * config/arm/arm-simd-builtin-types.def > (Bfloat16x2_t): New entry. 
> * config/arm/arm_neon_builtins.def > (vld2): Changed to VAR13 and added v4bf, v8bf. > (vld2_dup): Changed to VAR8 and added v4bf, v8bf. > (vld3): Changed to VAR13 and added v4bf, v8bf. > (vld3_dup): Changed to VAR8 and added v4bf, v8bf. > (vld4): Changed to VAR13 and added v4bf, v8bf. > (vld4_dup): Changed to VAR8 and added v4bf, v8bf. > * config/arm/iterators.md (VDXBF): New iterator. > (VQ2BF): New iterator. > (V_elem): Added V4BF, V8BF. > (V_sz_elem): Added V4BF, V8BF. > (V_mode_nunits): Added V4BF, V8BF. > (q): Added V4BF, V8BF. > * config/arm/neon.md (vld2): Used new iterators. > (vld2_dup<mode>): Used new iterators. > (vld2_dupv8bf): New. > (vld3): Used new iterators. > (vld3qa): Used new iterators. > (vld3qb): Used new iterators. > (vld3_dup<mode>): Used new iterators. > (vld3_dupv8bf): New. > (vld4): Used new iterators. > (vld4qa): Used new iterators. > (vld4qb): Used new iterators. > (vld4_dup<mode>): Used new iterators. > (vld4_dupv8bf): New. > > > gcc/testsuite/ChangeLog: > > 2019-11-14 Delia Burduv <delia.burduv@arm.com> > > * gcc.target/arm/simd/bf16_vldn_1.c: New test. ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: ACLE intrinsics: BFloat16 load intrinsics for AArch32 2020-01-22 18:20 ` Delia Burduv @ 2020-01-28 17:18 ` Delia Burduv 2020-02-19 17:26 ` Delia Burduv 1 sibling, 0 replies; 9+ messages in thread From: Delia Burduv @ 2020-01-28 17:18 UTC (permalink / raw) To: gcc-patches; +Cc: nickc, Richard Earnshaw, Kyrylo Tkachov, Ramana Radhakrishnan Ping. ________________________________ From: Delia Burduv <delia.burduv@arm.com> Sent: 22 January 2020 17:31 To: gcc-patches@gcc.gnu.org <gcc-patches@gcc.gnu.org> Cc: nickc@redhat.com <nickc@redhat.com>; Richard Earnshaw <Richard.Earnshaw@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>; Ramana Radhakrishnan <Ramana.Radhakrishnan@arm.com> Subject: Re: ACLE intrinsics: BFloat16 load intrinsics for AArch32 Ping. I will change the tests to use the exact input and output registers as Richard Sandiford suggested for the AArch64 patches. On 12/20/19 6:48 PM, Delia Burduv wrote: > This patch adds the ARMv8.6 ACLE BFloat16 load intrinsics vld<n>{q}_bf16 > as part of the BFloat16 extension. > (https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics) > > The intrinsics are declared in arm_neon.h. > A new test is added to check assembler output. > > This patch depends on the Arm back-end patch. > (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html) > > Tested for regression on arm-none-eabi and armeb-none-eabi. I don't have > commit rights, so if this is ok can someone please commit it for me? > > gcc/ChangeLog: > > 2019-11-14 Delia Burduv <delia.burduv@arm.com> > > * config/arm/arm_neon.h (bfloat16_t): New typedef. > (bfloat16x4x2_t): New typedef. > (bfloat16x8x2_t): New typedef. > (bfloat16x4x3_t): New typedef. > (bfloat16x8x3_t): New typedef. > (bfloat16x4x4_t): New typedef. > (bfloat16x8x4_t): New typedef. > (vld2_bf16): New. > (vld2q_bf16): New. > (vld3_bf16): New. > (vld3q_bf16): New. > (vld4_bf16): New. > (vld4q_bf16): New. > (vld2_dup_bf16): New. > (vld2q_dup_bf16): New. 
> (vld3_dup_bf16): New. > (vld3q_dup_bf16): New. > (vld4_dup_bf16): New. > (vld4q_dup_bf16): New. > * config/arm/arm-builtins.c (E_V2BFmode): New mode. > (VAR13): New. > (arm_simd_types[Bfloat16x2_t]): New type. > * config/arm/arm-modes.def (V2BF): New mode. > * config/arm/arm-simd-builtin-types.def > (Bfloat16x2_t): New entry. > * config/arm/arm_neon_builtins.def > (vld2): Changed to VAR13 and added v4bf, v8bf. > (vld2_dup): Changed to VAR8 and added v4bf, v8bf. > (vld3): Changed to VAR13 and added v4bf, v8bf. > (vld3_dup): Changed to VAR8 and added v4bf, v8bf. > (vld4): Changed to VAR13 and added v4bf, v8bf. > (vld4_dup): Changed to VAR8 and added v4bf, v8bf. > * config/arm/iterators.md (VDXBF): New iterator. > (VQ2BF): New iterator. > (V_elem): Added V4BF, V8BF. > (V_sz_elem): Added V4BF, V8BF. > (V_mode_nunits): Added V4BF, V8BF. > (q): Added V4BF, V8BF. > * config/arm/neon.md (vld2): Used new iterators. > (vld2_dup<mode>): Used new iterators. > (vld2_dupv8bf): New. > (vld3): Used new iterators. > (vld3qa): Used new iterators. > (vld3qb): Used new iterators. > (vld3_dup<mode>): Used new iterators. > (vld3_dupv8bf): New. > (vld4): Used new iterators. > (vld4qa): Used new iterators. > (vld4qb): Used new iterators. > (vld4_dup<mode>): Used new iterators. > (vld4_dupv8bf): New. > > > gcc/testsuite/ChangeLog: > > 2019-11-14 Delia Burduv <delia.burduv@arm.com> > > * gcc.target/arm/simd/bf16_vldn_1.c: New test. ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: ACLE intrinsics: BFloat16 load intrinsics for AArch32 2020-01-22 18:20 ` Delia Burduv 2020-01-28 17:18 ` Delia Burduv @ 2020-02-19 17:26 ` Delia Burduv 2020-03-04 14:05 ` Delia Burduv 1 sibling, 1 reply; 9+ messages in thread From: Delia Burduv @ 2020-02-19 17:26 UTC (permalink / raw) To: gcc-patches; +Cc: nickc, richard.earnshaw, kyrylo.tkachov, ramana.radhakrishnan [-- Attachment #1: Type: text/plain, Size: 4516 bytes --] Hi, Here is the latest version of the patch. It just has some minor formatting changes that were brought up by Richard Sandiford in the AArch64 patches. Thanks, Delia On 1/22/20 5:31 PM, Delia Burduv wrote: > Ping. > > I will change the tests to use the exact input and output registers as > Richard Sandiford suggested for the AArch64 patches. > > On 12/20/19 6:48 PM, Delia Burduv wrote: >> This patch adds the ARMv8.6 ACLE BFloat16 load intrinsics >> vld<n>{q}_bf16 as part of the BFloat16 extension. >> (https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics) >> >> The intrinsics are declared in arm_neon.h. >> A new test is added to check assembler output. >> >> This patch depends on the Arm back-end patch. >> (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html) >> >> Tested for regression on arm-none-eabi and armeb-none-eabi. I don't >> have commit rights, so if this is ok can someone please commit it for me? >> >> gcc/ChangeLog: >> >> 2019-11-14 Delia Burduv <delia.burduv@arm.com> >> >>      * config/arm/arm_neon.h (bfloat16_t): New typedef. >>         (bfloat16x4x2_t): New typedef. >>         (bfloat16x8x2_t): New typedef. >>         (bfloat16x4x3_t): New typedef. >>         (bfloat16x8x3_t): New typedef. >>         (bfloat16x4x4_t): New typedef. >>         (bfloat16x8x4_t): New typedef. >>         (vld2_bf16): New. >>      (vld2q_bf16): New. >>      (vld3_bf16): New. >>      (vld3q_bf16): New. >>      (vld4_bf16): New. >>      (vld4q_bf16): New. >>      (vld2_dup_bf16): New. 
>>      (vld2q_dup_bf16): New. >>      (vld3_dup_bf16): New. >>      (vld3q_dup_bf16): New. >>      (vld4_dup_bf16): New. >>      (vld4q_dup_bf16): New. >>         * config/arm/arm-builtins.c (E_V2BFmode): New mode. >>         (VAR13): New. >>         (arm_simd_types[Bfloat16x2_t]):New type. >>         * config/arm/arm-modes.def (V2BF): New mode. >>         * config/arm/arm-simd-builtin-types.def >>         (Bfloat16x2_t): New entry. >>         * config/arm/arm_neon_builtins.def >>         (vld2): Changed to VAR13 and added v4bf, v8bf >>         (vld2_dup): Changed to VAR8 and added v4bf, v8bf >>         (vld3): Changed to VAR13 and added v4bf, v8bf >>         (vld3_dup): Changed to VAR8 and added v4bf, v8bf >>         (vld4): Changed to VAR13 and added v4bf, v8bf >>         (vld4_dup): Changed to VAR8 and added v4bf, v8bf >>         * config/arm/iterators.md (VDXBF): New iterator. >>         (VQ2BF): New iterator. >>         (V_elem): Added V4BF, V8BF. >>         (V_sz_elem): Added V4BF, V8BF. >>         (V_mode_nunits): Added V4BF, V8BF. >>         (q): Added V4BF, V8BF. >>         *config/arm/neon.md (vld2): Used new iterators. >>         (vld2_dup<mode>): Used new iterators. >>         (vld2_dupv8bf): New. >>         (vst3): Used new iterators. >>         (vst3qa): Used new iterators. >>         (vst3qb): Used new iterators. >>         (vld3_dup<mode>): Used new iterators. >>         (vld3_dupv8bf): New. >>         (vst4): Used new iterators. >>         (vst4qa): Used new iterators. >>         (vst4qb): Used new iterators. >>         (vld4_dup<mode>): Used new iterators. >>         (vld4_dupv8bf): New. >> >> >> gcc/testsuite/ChangeLog: >> >> 2019-11-14 Delia Burduv <delia.burduv@arm.com> >> >>      * gcc.target/arm/simd/bf16_vldn_1.c: New test. 
[-- Attachment #2: rb12473.patch --] [-- Type: text/x-patch, Size: 23696 bytes --] diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c index 7f279cca6688c6f11948159666ee647ae533c61d..44c6f46fd63d5eaa1c3c84340d9acd017bb663e4 100644 --- a/gcc/config/arm/arm-builtins.c +++ b/gcc/config/arm/arm-builtins.c @@ -318,6 +318,7 @@ arm_set_sat_qualifiers[SIMD_MAX_BUILTIN_ARGS] #define v4bf_UP E_V4BFmode #define v2si_UP E_V2SImode #define v2sf_UP E_V2SFmode +#define v2bf_UP E_V2BFmode #define di_UP E_DImode #define v16qi_UP E_V16QImode #define v8hi_UP E_V8HImode @@ -381,6 +382,9 @@ typedef struct { #define VAR12(T, N, A, B, C, D, E, F, G, H, I, J, K, L) \ VAR11 (T, N, A, B, C, D, E, F, G, H, I, J, K) \ VAR1 (T, N, L) +#define VAR13(T, N, A, B, C, D, E, F, G, H, I, J, K, L, M) \ + VAR12 (T, N, A, B, C, D, E, F, G, H, I, J, K, L) \ + VAR1 (T, N, M) /* The builtin data can be found in arm_neon_builtins.def, arm_vfp_builtins.def and arm_acle_builtins.def. The entries in arm_neon_builtins.def require @@ -1013,6 +1017,7 @@ arm_init_simd_builtin_types (void) arm_simd_types[Float32x4_t].eltype = float_type_node; /* Init Bfloat vector types with underlying __bf16 scalar type. */ + arm_simd_types[Bfloat16x2_t].eltype = arm_bf16_type_node; arm_simd_types[Bfloat16x4_t].eltype = arm_bf16_type_node; arm_simd_types[Bfloat16x8_t].eltype = arm_bf16_type_node; diff --git a/gcc/config/arm/arm-modes.def b/gcc/config/arm/arm-modes.def index ea92ef35723f979c8bb1f6bfb4fbeb6cd1e4b6e9..6e48223b63d98fcbe38960700dd0949d74629f7f 100644 --- a/gcc/config/arm/arm-modes.def +++ b/gcc/config/arm/arm-modes.def @@ -80,6 +80,7 @@ VECTOR_MODE (FLOAT, HF, 2); /* V2HF */ FLOAT_MODE (BF, 2, 0); ADJUST_FLOAT_FORMAT (BF, &arm_bfloat_half_format); +VECTOR_MODE (FLOAT, BF, 2); /* V2BF. */ VECTOR_MODE (FLOAT, BF, 4); /* V4BF. */ VECTOR_MODE (FLOAT, BF, 8); /* V8BF. 
*/ diff --git a/gcc/config/arm/arm-simd-builtin-types.def b/gcc/config/arm/arm-simd-builtin-types.def index ea3c9f97b71f03ac28d83266bcdaddcd0d42678b..e35bb765cdf60b127f844877ca938dfb674ec16a 100644 --- a/gcc/config/arm/arm-simd-builtin-types.def +++ b/gcc/config/arm/arm-simd-builtin-types.def @@ -48,5 +48,6 @@ ENTRY (Float16x8_t, V8HF, none, 128, float16, 19) ENTRY (Float32x4_t, V4SF, none, 128, float32, 19) + ENTRY (Bfloat16x2_t, V2BF, none, 32, bfloat16, 20) ENTRY (Bfloat16x4_t, V4BF, none, 64, bfloat16, 20) ENTRY (Bfloat16x8_t, V8BF, none, 128, bfloat16, 20) diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index 3c78f435009ab027f92693d00ab5b40960d5419d..74beb236092f57736e7c0b91f8ed22555473a850 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -18742,6 +18742,149 @@ vcmlaq_rot270_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b, return __builtin_neon_vcmla_lane270v4sf (__r, __a, __b, __index); } +#pragma GCC push_options +#pragma GCC target ("arch=armv8.2-a+bf16") + +typedef struct bfloat16x4x2_t +{ + bfloat16x4_t val[2]; +} bfloat16x4x2_t; + +typedef struct bfloat16x8x2_t +{ + bfloat16x8_t val[2]; +} bfloat16x8x2_t; + +typedef struct bfloat16x4x3_t +{ + bfloat16x4_t val[3]; +} bfloat16x4x3_t; + +typedef struct bfloat16x8x3_t +{ + bfloat16x8_t val[3]; +} bfloat16x8x3_t; + +typedef struct bfloat16x4x4_t +{ + bfloat16x4_t val[4]; +} bfloat16x4x4_t; + +typedef struct bfloat16x8x4_t +{ + bfloat16x8_t val[4]; +} bfloat16x8x4_t; + +__extension__ extern __inline bfloat16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_bf16 (bfloat16_t const * __ptr) +{ + union { bfloat16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2v4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_bf16 (const bfloat16_t * __ptr) +{ + union { 
bfloat16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2v8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3v4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3v8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4v4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4v8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2_dupv4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2_dupv8bf 
((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_dupv4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3_dupv8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4_dupv4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4_dupv8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +#pragma GCC pop_options + #pragma GCC pop_options #endif diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index e9ff4e501cbb5d16b9211f5bc96db376ddf21afc..36b300c09b11cea8bb578d5063409e7f0f5f26e5 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -320,29 +320,29 @@ VAR12 (STORE1, vst1, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v2di) VAR12 (STORE1LANE, vst1_lane, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v2di) -VAR11 (LOAD1, vld2, - v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) +VAR13 
(LOAD1, vld2, + v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v4bf, v8bf) VAR9 (LOAD1LANE, vld2_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR6 (LOAD1, vld2_dup, v8qi, v4hi, v4hf, v2si, v2sf, di) +VAR8 (LOAD1, vld2_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf) VAR11 (STORE1, vst2, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) VAR9 (STORE1LANE, vst2_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR11 (LOAD1, vld3, - v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) +VAR13 (LOAD1, vld3, + v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v4bf, v8bf) VAR9 (LOAD1LANE, vld3_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR6 (LOAD1, vld3_dup, v8qi, v4hi, v4hf, v2si, v2sf, di) +VAR8 (LOAD1, vld3_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf) VAR11 (STORE1, vst3, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) VAR9 (STORE1LANE, vst3_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR11 (LOAD1, vld4, - v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) +VAR13 (LOAD1, vld4, + v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v4bf, v8bf) VAR9 (LOAD1LANE, vld4_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR6 (LOAD1, vld4_dup, v8qi, v4hi, v4hf, v2si, v2sf, di) +VAR8 (LOAD1, vld4_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf) VAR11 (STORE1, vst4, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) VAR9 (STORE1LANE, vst4_lane, diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 33e29509f00a89fa23d0546687c0e4643f0b32d2..b821152a33844ad8ce7a50d7923e43c49d07fbdf 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -84,6 +84,12 @@ ;; Double-width vector modes plus 64-bit elements. (define_mode_iterator VDX [V8QI V4HI V4HF V2SI V2SF DI]) +;; Double-width vector modes plus 64-bit elements and V4BF. 
+(define_mode_iterator VDXBF [V8QI V4HI V4HF V2SI V2SF DI (V4BF "TARGET_BF16_SIMD")]) + +;; Double-width vector modes plus 64-bit elements, V4BF and V8BF. +(define_mode_iterator VDXBF2 [V8QI V4HI V4HF V2SI V2SF DI (V4BF "TARGET_BF16_SIMD") (V8BF ("TARGET_BF16_SIMD"))]) + ;; Double-width vector modes plus 64-bit elements, ;; with V4BFmode added, suitable for moves. (define_mode_iterator VDXMOV [V8QI V4HI V4HF V4BF V2SI V2SF DI]) @@ -100,6 +106,9 @@ ;; Quad-width vector modes, including V8HF. (define_mode_iterator VQ2 [V16QI V8HI V8HF V4SI V4SF]) +;; Quad-width vector modes, including V8HF and V8BF. +(define_mode_iterator VQ2BF [V16QI V8HI V8HF V4SI V4SF (V8BF "TARGET_BF16_SIMD")]) + ;; Quad-width vector modes with 16- or 32-bit elements (define_mode_iterator VQ_HS [V8HI V8HF V4SI V4SF]) @@ -543,6 +552,7 @@ (define_mode_attr V_two_elem [(V8QI "HI") (V16QI "HI") (V4HI "SI") (V8HI "SI") (V4HF "SF") (V8HF "SF") + (V4BF "V2BF") (V8BF "V2BF") (V2SI "V2SI") (V4SI "V2SI") (V2SF "V2SF") (V4SF "V2SF") (DI "V2DI") (V2DI "V2DI")]) @@ -563,6 +573,7 @@ (define_mode_attr V_three_elem [(V8QI "BLK") (V16QI "BLK") (V4HI "BLK") (V8HI "BLK") (V4HF "BLK") (V8HF "BLK") + (V4BF "BLK") (V8BF "BLK") (V2SI "BLK") (V4SI "BLK") (V2SF "BLK") (V4SF "BLK") (DI "EI") (V2DI "EI")]) @@ -571,6 +582,7 @@ (define_mode_attr V_four_elem [(V8QI "SI") (V16QI "SI") (V4HI "V4HI") (V8HI "V4HI") (V4HF "V4HF") (V8HF "V4HF") + (V4BF "V4BF") (V8BF "V4BF") (V2SI "V4SI") (V4SI "V4SI") (V2SF "V4SF") (V4SF "V4SF") (DI "OI") (V2DI "OI")]) @@ -694,6 +706,7 @@ (V2SI "32") (V4SI "32") (DI "64") (V2DI "64") (V4HF "16") (V8HF "16") + (V4BF "16") (V8BF "16") (V2SF "32") (V4SF "32")]) (define_mode_attr V_elem_ch [(V8QI "b") (V16QI "b") @@ -769,6 +782,7 @@ (define_mode_attr V_mode_nunits [(V8QI "8") (V16QI "16") (V4HF "4") (V8HF "8") (V4HI "4") (V8HI "8") + (V4BF "4") (V8BF "8") (V2SI "2") (V4SI "4") (V2SF "2") (V4SF "4") (DI "1") (V2DI "2") diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 
6087ca6f2badde6a492bb515a2cb5846f3d4ad8e..c5d044e354afbec7bf31e8ba4719f19c21e1dce5 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -5379,7 +5379,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld2<mode>" [(set (match_operand:TI 0 "s_register_operand" "=w") (unspec:TI [(match_operand:TI 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD2))] "TARGET_NEON" { @@ -5404,7 +5404,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld2<mode>" [(set (match_operand:OI 0 "s_register_operand" "=w") (unspec:OI [(match_operand:OI 1 "neon_struct_operand" "Um") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD2))] "TARGET_NEON" "vld2.<V_sz_elem>\t%h0, %A1" @@ -5467,7 +5467,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld2_dup<mode>" [(set (match_operand:TI 0 "s_register_operand" "=w") (unspec:TI [(match_operand:<V_two_elem> 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD2_DUP))] "TARGET_NEON" { @@ -5482,6 +5482,27 @@ if (BYTES_BIG_ENDIAN) (const_string "neon_load1_1reg<q>")))] ) +(define_insn "neon_vld2_dupv8bf" + [(set (match_operand:OI 0 "s_register_operand" "=w") + (unspec:OI [(match_operand:V2BF 1 "neon_struct_operand" "Um") + (unspec:V8BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD2_DUP))] + "TARGET_BF16_SIMD" + { + rtx ops[5]; + int tabbase = REGNO (operands[0]); + + ops[4] = operands[1]; + ops[0] = gen_rtx_REG (V4BFmode, tabbase); + ops[1] = gen_rtx_REG (V4BFmode, tabbase + 2); + ops[2] = gen_rtx_REG (V4BFmode, tabbase + 4); + ops[3] = gen_rtx_REG (V4BFmode, tabbase + 6); + output_asm_insn ("vld2.16\t{%P0, %P1, %P2, %P3}, %A4", ops); + return ""; + } + [(set_attr "type" "neon_load2_all_lanes_q")] +) + (define_expand "vec_store_lanesti<mode>" [(set (match_operand:TI 0 "neon_struct_operand") 
(unspec:TI [(match_operand:TI 1 "s_register_operand") @@ -5588,7 +5609,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld3<mode>" [(set (match_operand:EI 0 "s_register_operand" "=w") (unspec:EI [(match_operand:EI 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3))] "TARGET_NEON" { @@ -5616,7 +5637,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vld3<mode>" [(match_operand:CI 0 "s_register_operand") (match_operand:CI 1 "neon_struct_operand") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_NEON" { rtx mem; @@ -5631,7 +5652,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld3qa<mode>" [(set (match_operand:CI 0 "s_register_operand" "=w") (unspec:CI [(match_operand:EI 1 "neon_struct_operand" "Um") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3A))] "TARGET_NEON" { @@ -5651,7 +5672,7 @@ if (BYTES_BIG_ENDIAN) [(set (match_operand:CI 0 "s_register_operand" "=w") (unspec:CI [(match_operand:EI 1 "neon_struct_operand" "Um") (match_operand:CI 2 "s_register_operand" "0") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3B))] "TARGET_NEON" { @@ -5728,7 +5749,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld3_dup<mode>" [(set (match_operand:EI 0 "s_register_operand" "=w") (unspec:EI [(match_operand:<V_three_elem> 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3_DUP))] "TARGET_NEON" { @@ -5751,6 +5772,26 @@ if (BYTES_BIG_ENDIAN) (const_string "neon_load3_all_lanes<q>") (const_string "neon_load1_1reg<q>")))]) +(define_insn "neon_vld3_dupv8bf" + [(set (match_operand:CI 0 "s_register_operand" "=w") + (unspec:CI [(match_operand:V2BF 1 "neon_struct_operand" "Um") + (unspec:V8BF [(const_int 0)] 
UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD2_DUP))] + "TARGET_BF16_SIMD" + { + rtx ops[4]; + int tabbase = REGNO (operands[0]); + + ops[3] = operands[1]; + ops[0] = gen_rtx_REG (V4BFmode, tabbase); + ops[1] = gen_rtx_REG (V4BFmode, tabbase + 2); + ops[2] = gen_rtx_REG (V4BFmode, tabbase + 4); + output_asm_insn ("vld3.16\t{%P0[], %P1[], %P2[]}, %A3", ops); + return ""; + } + [(set_attr "type" "neon_load3_all_lanes_q")] +) + (define_expand "vec_store_lanesei<mode>" [(set (match_operand:EI 0 "neon_struct_operand") (unspec:EI [(match_operand:EI 1 "s_register_operand") @@ -5906,7 +5947,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld4<mode>" [(set (match_operand:OI 0 "s_register_operand" "=w") (unspec:OI [(match_operand:OI 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4))] "TARGET_NEON" { @@ -5934,7 +5975,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vld4<mode>" [(match_operand:XI 0 "s_register_operand") (match_operand:XI 1 "neon_struct_operand") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_NEON" { rtx mem; @@ -5949,7 +5990,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld4qa<mode>" [(set (match_operand:XI 0 "s_register_operand" "=w") (unspec:XI [(match_operand:OI 1 "neon_struct_operand" "Um") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4A))] "TARGET_NEON" { @@ -5970,7 +6011,7 @@ if (BYTES_BIG_ENDIAN) [(set (match_operand:XI 0 "s_register_operand" "=w") (unspec:XI [(match_operand:OI 1 "neon_struct_operand" "Um") (match_operand:XI 2 "s_register_operand" "0") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4B))] "TARGET_NEON" { @@ -6050,7 +6091,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld4_dup<mode>" [(set (match_operand:OI 0 "s_register_operand" "=w") (unspec:OI 
[(match_operand:<V_four_elem> 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4_DUP))] "TARGET_NEON" { @@ -6076,6 +6117,27 @@ if (BYTES_BIG_ENDIAN) (const_string "neon_load1_1reg<q>")))] ) +(define_insn "neon_vld4_dupv8bf" + [(set (match_operand:XI 0 "s_register_operand" "=w") + (unspec:XI [(match_operand:V2BF 1 "neon_struct_operand" "Um") + (unspec:V8BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD2_DUP))] + "TARGET_BF16_SIMD" + { + rtx ops[5]; + int tabbase = REGNO (operands[0]); + + ops[4] = operands[1]; + ops[0] = gen_rtx_REG (V4BFmode, tabbase); + ops[1] = gen_rtx_REG (V4BFmode, tabbase + 2); + ops[2] = gen_rtx_REG (V4BFmode, tabbase + 4); + ops[3] = gen_rtx_REG (V4BFmode, tabbase + 6); + output_asm_insn ("vld4.16\t{%P0[], %P1[], %P2[], %P3[]}, %A4", ops); + return ""; + } + [(set_attr "type" "neon_load4_all_lanes_q")] +) + (define_expand "vec_store_lanesoi<mode>" [(set (match_operand:OI 0 "neon_struct_operand") (unspec:OI [(match_operand:OI 1 "s_register_operand") diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c new file mode 100644 index 0000000000000000000000000000000000000000..7ff8b600827e5c2e313ce40d14382aa641b4bb31 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c @@ -0,0 +1,152 @@ +/* { dg-do assemble } */ +/* { dg-options "-save-temps" } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_neon.h" + + +/* +**test_vld2_bf16: +** ... +** vld2.16 {d16-d17}, \[r3\] +** ... +*/ +bfloat16x4x2_t +test_vld2_bf16 (bfloat16_t * ptr) +{ + vld2_bf16 (ptr); +} + +/* +**test_vld2q_bf16: +** ... +** vld2.16 {d16+-d19+}, \[r3\] +** ... +*/ +bfloat16x8x2_t +test_vld2q_bf16 (bfloat16_t * ptr) +{ + vld2q_bf16 (ptr); +} + +/* +**test_vld2_dup_bf16: +** ... 
+** vld2.16 {d16\[\], d17\[\]}, \[r3\] +** ... +*/ +bfloat16x4x2_t +test_vld2_dup_bf16 (bfloat16_t * ptr) +{ + vld2_dup_bf16 (ptr); +} + +/* +**test_vld2q_dup_bf16: +** ... +** vld2.16 {d16, d17, d18, d19}, \[r3\] +** ... +*/ +bfloat16x8x2_t +test_vld2q_dup_bf16 (bfloat16_t * ptr) +{ + vld2q_dup_bf16 (ptr); +} + +/* +**test_vld3_bf16: +** ... +** vld3.16 {d16-d18}, \[r3\] +** ... +*/ +bfloat16x4x3_t +test_vld3_bf16 (bfloat16_t * ptr) +{ + vld3_bf16 (ptr); +} + +/* +**test_vld3q_bf16: +** ... +** vld3.16 {d16, d18, d20}, \[r3\] +** ... +*/ +bfloat16x8x3_t +test_vld3q_bf16 (bfloat16_t * ptr) +{ + vld3q_bf16 (ptr); +} + +/* +**test_vld3_dup_bf16: +** ... +** vld3.16 {d16\[\], d17\[\], d18\[\]}, \[r3\] +** ... +*/ +bfloat16x4x3_t +test_vld3_dup_bf16 (bfloat16_t * ptr) +{ + vld3_dup_bf16 (ptr); +} + +/* +**test_vld3q_dup_bf16: +** ... +** vld3.16 {d16\[\], d17\[\], d18\[\]}, \[r3\] +** ... +*/ +bfloat16x8x3_t +test_vld3q_dup_bf16 (bfloat16_t * ptr) +{ + vld3q_dup_bf16 (ptr); +} + +/* +**test_vld4_bf16: +** ... +** vld4.16 {d16-d19}, \[r3\] +** ... +*/ +bfloat16x4x4_t +test_vld4_bf16 (bfloat16_t * ptr) +{ + vld4_bf16 (ptr); +} + +/* +**test_vld4q_bf16: +** ... +** vld4.16 {d16, d18, d20, d22}, \[r3\] +** ... +*/ +bfloat16x8x4_t +test_vld4q_bf16 (bfloat16_t * ptr) +{ + vld4q_bf16 (ptr); +} + +/* +**test_vld4_dup_bf16: +** ... +** vld4.16 {d16\[\], d17\[\], d18\[\], d19\[\]}, \[r3\] +** ... +*/ +bfloat16x4x4_t +test_vld4_dup_bf16 (bfloat16_t * ptr) +{ + vld4_dup_bf16 (ptr); +} + +/* +**test_vld4q_dup_bf16: +** ... +** vld4.16 {d16\[\], d17\[\], d18\[\], d19\[\]}, \[r3\] +** ... +*/ +bfloat16x4x4_t +test_vld4q_dup_bf16 (bfloat16_t * ptr) +{ + vld4q_dup_bf16 (ptr); +} ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: ACLE intrinsics: BFloat16 load intrinsics for AArch32 2020-02-19 17:26 ` Delia Burduv @ 2020-03-04 14:05 ` Delia Burduv 2020-03-04 17:21 ` Kyrill Tkachov 0 siblings, 1 reply; 9+ messages in thread From: Delia Burduv @ 2020-03-04 14:05 UTC (permalink / raw) To: gcc-patches; +Cc: nickc, richard.earnshaw, kyrylo.tkachov, ramana.radhakrishnan [-- Attachment #1: Type: text/plain, Size: 5743 bytes --] Hi, The previous version of this patch shared part of its code with the store intrinsics patch (https://gcc.gnu.org/ml/gcc-patches/2020-03/msg00145.html) so I removed any duplicated code. This patch now depends on the previously mentioned store intrinsics patch. Here is the latest version and the updated ChangeLog. gcc/ChangeLog: 2020-03-04 Delia Burduv <delia.burduv@arm.com> * config/arm/arm_neon.h (bfloat16_t): New typedef. (vld2_bf16): New. (vld2q_bf16): New. (vld3_bf16): New. (vld3q_bf16): New. (vld4_bf16): New. (vld4q_bf16): New. (vld2_dup_bf16): New. (vld2q_dup_bf16): New. (vld3_dup_bf16): New. (vld3q_dup_bf16): New. (vld4_dup_bf16): New. (vld4q_dup_bf16): New. * config/arm/arm_neon_builtins.def (vld2): Changed to VAR13 and added v4bf, v8bf (vld2_dup): Changed to VAR8 and added v4bf, v8bf (vld3): Changed to VAR13 and added v4bf, v8bf (vld3_dup): Changed to VAR8 and added v4bf, v8bf (vld4): Changed to VAR13 and added v4bf, v8bf (vld4_dup): Changed to VAR8 and added v4bf, v8bf * config/arm/iterators.md (VDXBF): New iterator. (VQ2BF): New iterator. * config/arm/neon.md (vld2): Used new iterators. (vld2_dup<mode>): Used new iterators. (vld2_dupv8bf): New. (vld3): Used new iterators. (vld3qa): Used new iterators. (vld3qb): Used new iterators. (vld3_dup<mode>): Used new iterators. (vld3_dupv8bf): New. (vld4): Used new iterators. (vld4qa): Used new iterators. (vld4qb): Used new iterators. (vld4_dup<mode>): Used new iterators. (vld4_dupv8bf): New. gcc/testsuite/ChangeLog: 2020-03-04 Delia Burduv <delia.burduv@arm.com> * gcc.target/arm/simd/bf16_vldn_1.c: New test. 
Thanks, Delia On 2/19/20 5:25 PM, Delia Burduv wrote: > > Hi, > > Here is the latest version of the patch. It just has some minor > formatting changes that were brought up by Richard Sandiford in the > AArch64 patches > > Thanks, > Delia > > On 1/22/20 5:31 PM, Delia Burduv wrote: >> Ping. >> >> I will change the tests to use the exact input and output registers as >> Richard Sandiford suggested for the AArch64 patches. >> >> On 12/20/19 6:48 PM, Delia Burduv wrote: >>> This patch adds the ARMv8.6 ACLE BFloat16 load intrinsics >>> vld<n>{q}_bf16 as part of the BFloat16 extension. >>> (https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics) >>> >>> The intrinsics are declared in arm_neon.h . >>> A new test is added to check assembler output. >>> >>> This patch depends on the Arm back-end patche. >>> (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html) >>> >>> Tested for regression on arm-none-eabi and armeb-none-eabi. I don't >>> have commit rights, so if this is ok can someone please commit it for >>> me? >>> >>> gcc/ChangeLog: >>> >>> 2019-11-14 Delia Burduv <delia.burduv@arm.com> >>> >>> * config/arm/arm_neon.h (bfloat16_t): New typedef. >>> (bfloat16x4x2_t): New typedef. >>> (bfloat16x8x2_t): New typedef. >>> (bfloat16x4x3_t): New typedef. >>> (bfloat16x8x3_t): New typedef. >>> (bfloat16x4x4_t): New typedef. >>> (bfloat16x8x4_t): New typedef. >>> (vld2_bf16): New. >>> (vld2q_bf16): New. >>> (vld3_bf16): New. >>> (vld3q_bf16): New. >>> (vld4_bf16): New. >>> (vld4q_bf16): New. >>> (vld2_dup_bf16): New. >>> (vld2q_dup_bf16): New. >>> (vld3_dup_bf16): New. >>> (vld3q_dup_bf16): New. >>> (vld4_dup_bf16): New. >>> (vld4q_dup_bf16): New. >>> * config/arm/arm-builtins.c (E_V2BFmode): New mode. >>> (VAR13): New. >>> (arm_simd_types[Bfloat16x2_t]):New type. >>> * config/arm/arm-modes.def (V2BF): New mode. >>> * config/arm/arm-simd-builtin-types.def >>> (Bfloat16x2_t): New entry. 
>>> * config/arm/arm_neon_builtins.def >>> (vld2): Changed to VAR13 and added v4bf, v8bf >>> (vld2_dup): Changed to VAR8 and added v4bf, v8bf >>> (vld3): Changed to VAR13 and added v4bf, v8bf >>> (vld3_dup): Changed to VAR8 and added v4bf, v8bf >>> (vld4): Changed to VAR13 and added v4bf, v8bf >>> (vld4_dup): Changed to VAR8 and added v4bf, v8bf >>> * config/arm/iterators.md (VDXBF): New iterator. >>> (VQ2BF): New iterator. >>> (V_elem): Added V4BF, V8BF. >>> (V_sz_elem): Added V4BF, V8BF. >>> (V_mode_nunits): Added V4BF, V8BF. >>> (q): Added V4BF, V8BF. >>> *config/arm/neon.md (vld2): Used new iterators. >>> (vld2_dup<mode>): Used new iterators. >>> (vld2_dupv8bf): New. >>> (vst3): Used new iterators. >>> (vst3qa): Used new iterators. >>> (vst3qb): Used new iterators. >>> (vld3_dup<mode>): Used new iterators. >>> (vld3_dupv8bf): New. >>> (vst4): Used new iterators. >>> (vst4qa): Used new iterators. >>> (vst4qb): Used new iterators. >>> (vld4_dup<mode>): Used new iterators. >>> (vld4_dupv8bf): New. >>> >>> >>> gcc/testsuite/ChangeLog: >>> >>> 2019-11-14 Delia Burduv <delia.burduv@arm.com> >>> >>> * gcc.target/arm/simd/bf16_vldn_1.c: New test. 
[-- Attachment #2: rb12473.patch --] [-- Type: text/x-patch, Size: 18229 bytes --] diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index 425a2a49b69d7e3070059dd0a79ae3d306400f4b..2573cca6bb64f5104a1efd1379ef956f56d0fe04 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -19504,6 +19504,114 @@ vst4q_bf16 (bfloat16_t * __ptr, bfloat16x8x4_t __val) return __builtin_neon_vst4v8bf (__ptr, __bu.__o); } +__extension__ extern __inline bfloat16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_bf16 (bfloat16_t const * __ptr) +{ + union { bfloat16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2v4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2v8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3v4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3v8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4v4bf ((const __builtin_neon_hi *) __ptr); + 
return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4v8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2_dupv4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2_dupv8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_dupv4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3_dupv8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4_dupv4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline 
bfloat16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4_dupv8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + #pragma GCC pop_options #ifdef __cplusplus diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index d85a2d4b1fcf9e851f215dfdd4b305e59ded651c..e3c1652b9e92ff5024225279f26c1ccb197dcd69 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -320,29 +320,29 @@ VAR12 (STORE1, vst1, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v2di) VAR12 (STORE1LANE, vst1_lane, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v2di) -VAR11 (LOAD1, vld2, - v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) +VAR13 (LOAD1, vld2, + v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v4bf, v8bf) VAR9 (LOAD1LANE, vld2_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR6 (LOAD1, vld2_dup, v8qi, v4hi, v4hf, v2si, v2sf, di) +VAR8 (LOAD1, vld2_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf) VAR13 (STORE1, vst2, v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf) VAR9 (STORE1LANE, vst2_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR11 (LOAD1, vld3, - v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) +VAR13 (LOAD1, vld3, + v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v4bf, v8bf) VAR9 (LOAD1LANE, vld3_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR6 (LOAD1, vld3_dup, v8qi, v4hi, v4hf, v2si, v2sf, di) +VAR8 (LOAD1, vld3_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf) VAR13 (STORE1, vst3, v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf) VAR9 (STORE1LANE, vst3_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR11 (LOAD1, 
vld4, - v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) +VAR13 (LOAD1, vld4, + v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v4bf, v8bf) VAR9 (LOAD1LANE, vld4_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR6 (LOAD1, vld4_dup, v8qi, v4hi, v4hf, v2si, v2sf, di) +VAR8 (LOAD1, vld4_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf) VAR13 (STORE1, vst4, v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf) VAR9 (STORE1LANE, vst4_lane, diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 0c03e747c3643e018f4f62dda5e832dfb1af758f..7401f16ef59b9854bbc85f98cfdcdd7a8a600337 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -87,6 +87,9 @@ ;; Double-width vector modes plus 64-bit elements, including V4BF. (define_mode_iterator VDXBF [V8QI V4HI V4HF (V4BF "TARGET_BF16_SIMD") V2SI V2SF DI]) +;; Double-width vector modes plus 64-bit elements, V4BF and V8BF. +(define_mode_iterator VDXBF2 [V8QI V4HI V4HF V2SI V2SF DI (V4BF "TARGET_BF16_SIMD") (V8BF ("TARGET_BF16_SIMD"))]) + ;; Double-width vector modes plus 64-bit elements, ;; with V4BFmode added, suitable for moves. 
(define_mode_iterator VDXMOV [V8QI V4HI V4HF V4BF V2SI V2SF DI]) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index fcf59aee32a955b6bb3e7b98a4d880a0e631b4be..5117f78dd2dce442bc738de6082686421fcdca52 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -5428,7 +5428,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld2<mode>" [(set (match_operand:TI 0 "s_register_operand" "=w") (unspec:TI [(match_operand:TI 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD2))] "TARGET_NEON" { @@ -5453,7 +5453,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld2<mode>" [(set (match_operand:OI 0 "s_register_operand" "=w") (unspec:OI [(match_operand:OI 1 "neon_struct_operand" "Um") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD2))] "TARGET_NEON" "vld2.<V_sz_elem>\t%h0, %A1" @@ -5516,7 +5516,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld2_dup<mode>" [(set (match_operand:TI 0 "s_register_operand" "=w") (unspec:TI [(match_operand:<V_two_elem> 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD2_DUP))] "TARGET_NEON" { @@ -5531,6 +5531,27 @@ if (BYTES_BIG_ENDIAN) (const_string "neon_load1_1reg<q>")))] ) +(define_insn "neon_vld2_dupv8bf" + [(set (match_operand:OI 0 "s_register_operand" "=w") + (unspec:OI [(match_operand:V2BF 1 "neon_struct_operand" "Um") + (unspec:V8BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD2_DUP))] + "TARGET_BF16_SIMD" + { + rtx ops[5]; + int tabbase = REGNO (operands[0]); + + ops[4] = operands[1]; + ops[0] = gen_rtx_REG (V4BFmode, tabbase); + ops[1] = gen_rtx_REG (V4BFmode, tabbase + 2); + ops[2] = gen_rtx_REG (V4BFmode, tabbase + 4); + ops[3] = gen_rtx_REG (V4BFmode, tabbase + 6); + output_asm_insn ("vld2.16\t{%P0, %P1, %P2, %P3}, %A4", ops); + return ""; + } + [(set_attr 
"type" "neon_load2_all_lanes_q")] +) + (define_expand "vec_store_lanesti<mode>" [(set (match_operand:TI 0 "neon_struct_operand") (unspec:TI [(match_operand:TI 1 "s_register_operand") @@ -5637,7 +5658,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld3<mode>" [(set (match_operand:EI 0 "s_register_operand" "=w") (unspec:EI [(match_operand:EI 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3))] "TARGET_NEON" { @@ -5665,7 +5686,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vld3<mode>" [(match_operand:CI 0 "s_register_operand") (match_operand:CI 1 "neon_struct_operand") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_NEON" { rtx mem; @@ -5680,7 +5701,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld3qa<mode>" [(set (match_operand:CI 0 "s_register_operand" "=w") (unspec:CI [(match_operand:EI 1 "neon_struct_operand" "Um") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3A))] "TARGET_NEON" { @@ -5700,7 +5721,7 @@ if (BYTES_BIG_ENDIAN) [(set (match_operand:CI 0 "s_register_operand" "=w") (unspec:CI [(match_operand:EI 1 "neon_struct_operand" "Um") (match_operand:CI 2 "s_register_operand" "0") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3B))] "TARGET_NEON" { @@ -5777,7 +5798,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld3_dup<mode>" [(set (match_operand:EI 0 "s_register_operand" "=w") (unspec:EI [(match_operand:<V_three_elem> 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3_DUP))] "TARGET_NEON" { @@ -5800,6 +5821,26 @@ if (BYTES_BIG_ENDIAN) (const_string "neon_load3_all_lanes<q>") (const_string "neon_load1_1reg<q>")))]) +(define_insn "neon_vld3_dupv8bf" + [(set 
(match_operand:CI 0 "s_register_operand" "=w") + (unspec:CI [(match_operand:V2BF 1 "neon_struct_operand" "Um") + (unspec:V8BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD2_DUP))] + "TARGET_BF16_SIMD" + { + rtx ops[4]; + int tabbase = REGNO (operands[0]); + + ops[3] = operands[1]; + ops[0] = gen_rtx_REG (V4BFmode, tabbase); + ops[1] = gen_rtx_REG (V4BFmode, tabbase + 2); + ops[2] = gen_rtx_REG (V4BFmode, tabbase + 4); + output_asm_insn ("vld3.16\t{%P0[], %P1[], %P2[]}, %A3", ops); + return ""; + } + [(set_attr "type" "neon_load3_all_lanes_q")] +) + (define_expand "vec_store_lanesei<mode>" [(set (match_operand:EI 0 "neon_struct_operand") (unspec:EI [(match_operand:EI 1 "s_register_operand") @@ -5955,7 +5996,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld4<mode>" [(set (match_operand:OI 0 "s_register_operand" "=w") (unspec:OI [(match_operand:OI 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4))] "TARGET_NEON" { @@ -5983,7 +6024,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vld4<mode>" [(match_operand:XI 0 "s_register_operand") (match_operand:XI 1 "neon_struct_operand") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_NEON" { rtx mem; @@ -5998,7 +6039,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld4qa<mode>" [(set (match_operand:XI 0 "s_register_operand" "=w") (unspec:XI [(match_operand:OI 1 "neon_struct_operand" "Um") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4A))] "TARGET_NEON" { @@ -6019,7 +6060,7 @@ if (BYTES_BIG_ENDIAN) [(set (match_operand:XI 0 "s_register_operand" "=w") (unspec:XI [(match_operand:OI 1 "neon_struct_operand" "Um") (match_operand:XI 2 "s_register_operand" "0") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4B))] "TARGET_NEON" { @@ 
-6099,7 +6140,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld4_dup<mode>" [(set (match_operand:OI 0 "s_register_operand" "=w") (unspec:OI [(match_operand:<V_four_elem> 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4_DUP))] "TARGET_NEON" { @@ -6125,6 +6166,27 @@ if (BYTES_BIG_ENDIAN) (const_string "neon_load1_1reg<q>")))] ) +(define_insn "neon_vld4_dupv8bf" + [(set (match_operand:XI 0 "s_register_operand" "=w") + (unspec:XI [(match_operand:V2BF 1 "neon_struct_operand" "Um") + (unspec:V8BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD2_DUP))] + "TARGET_BF16_SIMD" + { + rtx ops[5]; + int tabbase = REGNO (operands[0]); + + ops[4] = operands[1]; + ops[0] = gen_rtx_REG (V4BFmode, tabbase); + ops[1] = gen_rtx_REG (V4BFmode, tabbase + 2); + ops[2] = gen_rtx_REG (V4BFmode, tabbase + 4); + ops[3] = gen_rtx_REG (V4BFmode, tabbase + 6); + output_asm_insn ("vld4.16\t{%P0[], %P1[], %P2[], %P3[]}, %A4", ops); + return ""; + } + [(set_attr "type" "neon_load4_all_lanes_q")] +) + (define_expand "vec_store_lanesoi<mode>" [(set (match_operand:OI 0 "neon_struct_operand") (unspec:OI [(match_operand:OI 1 "s_register_operand") diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c new file mode 100644 index 0000000000000000000000000000000000000000..7ff8b600827e5c2e313ce40d14382aa641b4bb31 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c @@ -0,0 +1,152 @@ +/* { dg-do assemble } */ +/* { dg-options "-save-temps" } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_neon.h" + + +/* +**test_vld2_bf16: +** ... +** vld2.16 {d16-d17}, \[r3\] +** ... +*/ +bfloat16x4x2_t +test_vld2_bf16 (bfloat16_t * ptr) +{ + vld2_bf16 (ptr); +} + +/* +**test_vld2q_bf16: +** ... 
+** vld2.16 {d16+-d19+}, \[r3\] +** ... +*/ +bfloat16x8x2_t +test_vld2q_bf16 (bfloat16_t * ptr) +{ + vld2q_bf16 (ptr); +} + +/* +**test_vld2_dup_bf16: +** ... +** vld2.16 {d16\[\], d17\[\]}, \[r3\] +** ... +*/ +bfloat16x4x2_t +test_vld2_dup_bf16 (bfloat16_t * ptr) +{ + vld2_dup_bf16 (ptr); +} + +/* +**test_vld2q_dup_bf16: +** ... +** vld2.16 {d16, d17, d18, d19}, \[r3\] +** ... +*/ +bfloat16x8x2_t +test_vld2q_dup_bf16 (bfloat16_t * ptr) +{ + vld2q_dup_bf16 (ptr); +} + +/* +**test_vld3_bf16: +** ... +** vld3.16 {d16-d18}, \[r3\] +** ... +*/ +bfloat16x4x3_t +test_vld3_bf16 (bfloat16_t * ptr) +{ + vld3_bf16 (ptr); +} + +/* +**test_vld3q_bf16: +** ... +** vld3.16 {d16, d18, d20}, \[r3\] +** ... +*/ +bfloat16x8x3_t +test_vld3q_bf16 (bfloat16_t * ptr) +{ + vld3q_bf16 (ptr); +} + +/* +**test_vld3_dup_bf16: +** ... +** vld3.16 {d16\[\], d17\[\], d18\[\]}, \[r3\] +** ... +*/ +bfloat16x4x3_t +test_vld3_dup_bf16 (bfloat16_t * ptr) +{ + vld3_dup_bf16 (ptr); +} + +/* +**test_vld3q_dup_bf16: +** ... +** vld3.16 {d16\[\], d17\[\], d18\[\]}, \[r3\] +** ... +*/ +bfloat16x8x3_t +test_vld3q_dup_bf16 (bfloat16_t * ptr) +{ + vld3q_dup_bf16 (ptr); +} + +/* +**test_vld4_bf16: +** ... +** vld4.16 {d16-d19}, \[r3\] +** ... +*/ +bfloat16x4x4_t +test_vld4_bf16 (bfloat16_t * ptr) +{ + vld4_bf16 (ptr); +} + +/* +**test_vld4q_bf16: +** ... +** vld4.16 {d16, d18, d20, d22}, \[r3\] +** ... +*/ +bfloat16x8x4_t +test_vld4q_bf16 (bfloat16_t * ptr) +{ + vld4q_bf16 (ptr); +} + +/* +**test_vld4_dup_bf16: +** ... +** vld4.16 {d16\[\], d17\[\], d18\[\], d19\[\]}, \[r3\] +** ... +*/ +bfloat16x4x4_t +test_vld4_dup_bf16 (bfloat16_t * ptr) +{ + vld4_dup_bf16 (ptr); +} + +/* +**test_vld4q_dup_bf16: +** ... +** vld4.16 {d16\[\], d17\[\], d18\[\], d19\[\]}, \[r3\] +** ... +*/ +bfloat16x4x4_t +test_vld4q_dup_bf16 (bfloat16_t * ptr) +{ + vld4q_dup_bf16 (ptr); +} ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: ACLE intrinsics: BFloat16 load intrinsics for AArch32 2020-03-04 14:05 ` Delia Burduv @ 2020-03-04 17:21 ` Kyrill Tkachov 2020-03-05 16:39 ` Delia Burduv 0 siblings, 1 reply; 9+ messages in thread From: Kyrill Tkachov @ 2020-03-04 17:21 UTC (permalink / raw) To: Delia Burduv, gcc-patches; +Cc: nickc, Richard Earnshaw, Ramana Radhakrishnan Hi Delia, On 3/4/20 2:05 PM, Delia Burduv wrote: > Hi, > > The previous version of this patch shared part of its code with the > store intrinsics patch > (https://gcc.gnu.org/ml/gcc-patches/2020-03/msg00145.html) so I removed > any duplicated code. This patch now depends on the previously mentioned > store intrinsics patch. > > Here is the latest version and the updated ChangeLog. > > gcc/ChangeLog: > > 2019-03-04 Delia Burduv <delia.burduv@arm.com> > > * config/arm/arm_neon.h (bfloat16_t): New typedef. > (vld2_bf16): New. > (vld2q_bf16): New. > (vld3_bf16): New. > (vld3q_bf16): New. > (vld4_bf16): New. > (vld4q_bf16): New. > (vld2_dup_bf16): New. > (vld2q_dup_bf16): New. > (vld3_dup_bf16): New. > (vld3q_dup_bf16): New. > (vld4_dup_bf16): New. > (vld4q_dup_bf16): New. > * config/arm/arm_neon_builtins.def > (vld2): Changed to VAR13 and added v4bf, v8bf > (vld2_dup): Changed to VAR8 and added v4bf, v8bf > (vld3): Changed to VAR13 and added v4bf, v8bf > (vld3_dup): Changed to VAR8 and added v4bf, v8bf > (vld4): Changed to VAR13 and added v4bf, v8bf > (vld4_dup): Changed to VAR8 and added v4bf, v8bf > * config/arm/iterators.md (VDXBF): New iterator. > (VQ2BF): New iterator. > *config/arm/neon.md (vld2): Used new iterators. > (vld2_dup<mode>): Used new iterators. > (vld2_dupv8bf): New. > (vst3): Used new iterators. > (vst3qa): Used new iterators. > (vst3qb): Used new iterators. > (vld3_dup<mode>): Used new iterators. > (vld3_dupv8bf): New. > (vst4): Used new iterators. > (vst4qa): Used new iterators. > (vst4qb): Used new iterators. > (vld4_dup<mode>): Used new iterators. > (vld4_dupv8bf): New. 
> > > gcc/testsuite/ChangeLog: > > 2019-03-04 Delia Burduv <delia.burduv@arm.com> > > * gcc.target/arm/simd/bf16_vldn_1.c: New test. > > Thanks, > Delia > > On 2/19/20 5:25 PM, Delia Burduv wrote: > > > > Hi, > > > > Here is the latest version of the patch. It just has some minor > > formatting changes that were brought up by Richard Sandiford in the > > AArch64 patches > > > > Thanks, > > Delia > > > > On 1/22/20 5:31 PM, Delia Burduv wrote: > >> Ping. > >> > >> I will change the tests to use the exact input and output registers as > >> Richard Sandiford suggested for the AArch64 patches. > >> > >> On 12/20/19 6:48 PM, Delia Burduv wrote: > >>> This patch adds the ARMv8.6 ACLE BFloat16 load intrinsics > >>> vld<n>{q}_bf16 as part of the BFloat16 extension. > >>> > (https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics) > > >>> > >>> The intrinsics are declared in arm_neon.h . > >>> A new test is added to check assembler output. > >>> > >>> This patch depends on the Arm back-end patche. > >>> (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html) > >>> > >>> Tested for regression on arm-none-eabi and armeb-none-eabi. I don't > >>> have commit rights, so if this is ok can someone please commit it for > >>> me? > >>> > >>> gcc/ChangeLog: > >>> > >>> 2019-11-14 Delia Burduv <delia.burduv@arm.com> > >>> > >>> * config/arm/arm_neon.h (bfloat16_t): New typedef. > >>> (bfloat16x4x2_t): New typedef. > >>> (bfloat16x8x2_t): New typedef. > >>> (bfloat16x4x3_t): New typedef. > >>> (bfloat16x8x3_t): New typedef. > >>> (bfloat16x4x4_t): New typedef. > >>> (bfloat16x8x4_t): New typedef. > >>> (vld2_bf16): New. > >>> (vld2q_bf16): New. > >>> (vld3_bf16): New. > >>> (vld3q_bf16): New. > >>> (vld4_bf16): New. > >>> (vld4q_bf16): New. > >>> (vld2_dup_bf16): New. > >>> (vld2q_dup_bf16): New. > >>> (vld3_dup_bf16): New. > >>> (vld3q_dup_bf16): New. > >>> (vld4_dup_bf16): New. > >>> (vld4q_dup_bf16): New. 
> >>> * config/arm/arm-builtins.c (E_V2BFmode): New mode. > >>> (VAR13): New. > >>> (arm_simd_types[Bfloat16x2_t]):New type. > >>> * config/arm/arm-modes.def (V2BF): New mode. > >>> * config/arm/arm-simd-builtin-types.def > >>> (Bfloat16x2_t): New entry. > >>> * config/arm/arm_neon_builtins.def > >>> (vld2): Changed to VAR13 and added v4bf, v8bf > >>> (vld2_dup): Changed to VAR8 and added v4bf, v8bf > >>> (vld3): Changed to VAR13 and added v4bf, v8bf > >>> (vld3_dup): Changed to VAR8 and added v4bf, v8bf > >>> (vld4): Changed to VAR13 and added v4bf, v8bf > >>> (vld4_dup): Changed to VAR8 and added v4bf, v8bf > >>> * config/arm/iterators.md (VDXBF): New iterator. > >>> (VQ2BF): New iterator. > >>> (V_elem): Added V4BF, V8BF. > >>> (V_sz_elem): Added V4BF, V8BF. > >>> (V_mode_nunits): Added V4BF, V8BF. > >>> (q): Added V4BF, V8BF. > >>> *config/arm/neon.md (vld2): Used new iterators. > >>> (vld2_dup<mode>): Used new iterators. > >>> (vld2_dupv8bf): New. > >>> (vst3): Used new iterators. > >>> (vst3qa): Used new iterators. > >>> (vst3qb): Used new iterators. > >>> (vld3_dup<mode>): Used new iterators. > >>> (vld3_dupv8bf): New. > >>> (vst4): Used new iterators. > >>> (vst4qa): Used new iterators. > >>> (vst4qb): Used new iterators. > >>> (vld4_dup<mode>): Used new iterators. > >>> (vld4_dupv8bf): New. > >>> > >>> > >>> gcc/testsuite/ChangeLog: > >>> > >>> 2019-11-14 Delia Burduv <delia.burduv@arm.com> > >>> > >>> * gcc.target/arm/simd/bf16_vldn_1.c: New test. 
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c new file mode 100644 index 0000000000000000000000000000000000000000..7ff8b600827e5c2e313ce40d14382aa641b4bb31 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c @@ -0,0 +1,152 @@ +/* { dg-do assemble } */ +/* { dg-options "-save-temps" } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-final { check-function-bodies "**" "" } } */ I think this should include an optimisation option like -O2 because... + +#include "arm_neon.h" + + +/* +**test_vld2_bf16: +** ... +** vld2.16 {d16-d17}, \[r3\] ... this is unstable codegen depending on the -O0 register allocator moving the ptr argument to r3 from its initial r0. This should really be r0 and the load instruction should load the low D regs. So let's add an -O2 to the dg-options and scan for the result of that. Otherwise this is ok. Thanks! Kyrill +** ... +*/ +bfloat16x4x2_t +test_vld2_bf16 (bfloat16_t * ptr) +{ + vld2_bf16 (ptr); +} + ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: ACLE intrinsics: BFloat16 load intrinsics for AArch32 2020-03-04 17:21 ` Kyrill Tkachov @ 2020-03-05 16:39 ` Delia Burduv 2020-03-06 10:45 ` Kyrill Tkachov 0 siblings, 1 reply; 9+ messages in thread From: Delia Burduv @ 2020-03-05 16:39 UTC (permalink / raw) To: Kyrill Tkachov, gcc-patches; +Cc: nickc, Richard Earnshaw, Ramana Radhakrishnan [-- Attachment #1: Type: text/plain, Size: 8066 bytes --] Hi, This is the latest version of the patch. I am forcing -mfloat-abi=hard because the code generated is slightly different depending on the float-abi used. Thanks, Delia On 3/4/20 5:20 PM, Kyrill Tkachov wrote: > Hi Delia, > > On 3/4/20 2:05 PM, Delia Burduv wrote: >> Hi, >> >> The previous version of this patch shared part of its code with the >> store intrinsics patch >> (https://gcc.gnu.org/ml/gcc-patches/2020-03/msg00145.html) so I removed >> any duplicated code. This patch now depends on the previously mentioned >> store intrinsics patch. >> >> Here is the latest version and the updated ChangeLog. >> >> gcc/ChangeLog: >> >> 2019-03-04 Delia Burduv <delia.burduv@arm.com> >> >> * config/arm/arm_neon.h (bfloat16_t): New typedef. >> (vld2_bf16): New. >> (vld2q_bf16): New. >> (vld3_bf16): New. >> (vld3q_bf16): New. >> (vld4_bf16): New. >> (vld4q_bf16): New. >> (vld2_dup_bf16): New. >> (vld2q_dup_bf16): New. >> (vld3_dup_bf16): New. >> (vld3q_dup_bf16): New. >> (vld4_dup_bf16): New. >> (vld4q_dup_bf16): New. >> * config/arm/arm_neon_builtins.def >> (vld2): Changed to VAR13 and added v4bf, v8bf >> (vld2_dup): Changed to VAR8 and added v4bf, v8bf >> (vld3): Changed to VAR13 and added v4bf, v8bf >> (vld3_dup): Changed to VAR8 and added v4bf, v8bf >> (vld4): Changed to VAR13 and added v4bf, v8bf >> (vld4_dup): Changed to VAR8 and added v4bf, v8bf >> * config/arm/iterators.md (VDXBF): New iterator. >> (VQ2BF): New iterator. >> *config/arm/neon.md (vld2): Used new iterators. >> (vld2_dup<mode>): Used new iterators. >> (vld2_dupv8bf): New. >> (vst3): Used new iterators. 
>> (vst3qa): Used new iterators. >> (vst3qb): Used new iterators. >> (vld3_dup<mode>): Used new iterators. >> (vld3_dupv8bf): New. >> (vst4): Used new iterators. >> (vst4qa): Used new iterators. >> (vst4qb): Used new iterators. >> (vld4_dup<mode>): Used new iterators. >> (vld4_dupv8bf): New. >> >> >> gcc/testsuite/ChangeLog: >> >> 2019-03-04 Delia Burduv <delia.burduv@arm.com> >> >> * gcc.target/arm/simd/bf16_vldn_1.c: New test. >> >> Thanks, >> Delia >> >> On 2/19/20 5:25 PM, Delia Burduv wrote: >> > >> > Hi, >> > >> > Here is the latest version of the patch. It just has some minor >> > formatting changes that were brought up by Richard Sandiford in the >> > AArch64 patches >> > >> > Thanks, >> > Delia >> > >> > On 1/22/20 5:31 PM, Delia Burduv wrote: >> >> Ping. >> >> >> >> I will change the tests to use the exact input and output registers as >> >> Richard Sandiford suggested for the AArch64 patches. >> >> >> >> On 12/20/19 6:48 PM, Delia Burduv wrote: >> >>> This patch adds the ARMv8.6 ACLE BFloat16 load intrinsics >> >>> vld<n>{q}_bf16 as part of the BFloat16 extension. >> >>> >> (https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics) >> >> >>> >> >>> The intrinsics are declared in arm_neon.h . >> >>> A new test is added to check assembler output. >> >>> >> >>> This patch depends on the Arm back-end patche. >> >>> (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html) >> >>> >> >>> Tested for regression on arm-none-eabi and armeb-none-eabi. I don't >> >>> have commit rights, so if this is ok can someone please commit it for >> >>> me? >> >>> >> >>> gcc/ChangeLog: >> >>> >> >>> 2019-11-14 Delia Burduv <delia.burduv@arm.com> >> >>> >> >>> * config/arm/arm_neon.h (bfloat16_t): New typedef. >> >>> (bfloat16x4x2_t): New typedef. >> >>> (bfloat16x8x2_t): New typedef. >> >>> (bfloat16x4x3_t): New typedef. >> >>> (bfloat16x8x3_t): New typedef. >> >>> (bfloat16x4x4_t): New typedef. >> >>> (bfloat16x8x4_t): New typedef. 
>> >>> (vld2_bf16): New. >> >>> (vld2q_bf16): New. >> >>> (vld3_bf16): New. >> >>> (vld3q_bf16): New. >> >>> (vld4_bf16): New. >> >>> (vld4q_bf16): New. >> >>> (vld2_dup_bf16): New. >> >>> (vld2q_dup_bf16): New. >> >>> (vld3_dup_bf16): New. >> >>> (vld3q_dup_bf16): New. >> >>> (vld4_dup_bf16): New. >> >>> (vld4q_dup_bf16): New. >> >>> * config/arm/arm-builtins.c (E_V2BFmode): New mode. >> >>> (VAR13): New. >> >>> (arm_simd_types[Bfloat16x2_t]):New type. >> >>> * config/arm/arm-modes.def (V2BF): New mode. >> >>> * config/arm/arm-simd-builtin-types.def >> >>> (Bfloat16x2_t): New entry. >> >>> * config/arm/arm_neon_builtins.def >> >>> (vld2): Changed to VAR13 and added v4bf, v8bf >> >>> (vld2_dup): Changed to VAR8 and added v4bf, v8bf >> >>> (vld3): Changed to VAR13 and added v4bf, v8bf >> >>> (vld3_dup): Changed to VAR8 and added v4bf, v8bf >> >>> (vld4): Changed to VAR13 and added v4bf, v8bf >> >>> (vld4_dup): Changed to VAR8 and added v4bf, v8bf >> >>> * config/arm/iterators.md (VDXBF): New iterator. >> >>> (VQ2BF): New iterator. >> >>> (V_elem): Added V4BF, V8BF. >> >>> (V_sz_elem): Added V4BF, V8BF. >> >>> (V_mode_nunits): Added V4BF, V8BF. >> >>> (q): Added V4BF, V8BF. >> >>> *config/arm/neon.md (vld2): Used new iterators. >> >>> (vld2_dup<mode>): Used new iterators. >> >>> (vld2_dupv8bf): New. >> >>> (vst3): Used new iterators. >> >>> (vst3qa): Used new iterators. >> >>> (vst3qb): Used new iterators. >> >>> (vld3_dup<mode>): Used new iterators. >> >>> (vld3_dupv8bf): New. >> >>> (vst4): Used new iterators. >> >>> (vst4qa): Used new iterators. >> >>> (vst4qb): Used new iterators. >> >>> (vld4_dup<mode>): Used new iterators. >> >>> (vld4_dupv8bf): New. >> >>> >> >>> >> >>> gcc/testsuite/ChangeLog: >> >>> >> >>> 2019-11-14 Delia Burduv <delia.burduv@arm.com> >> >>> >> >>> * gcc.target/arm/simd/bf16_vldn_1.c: New test. 
> > > diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c > b/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..7ff8b600827e5c2e313ce40d14382aa641b4bb31 > > --- /dev/null > +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c > @@ -0,0 +1,152 @@ > +/* { dg-do assemble } */ > +/* { dg-options "-save-temps" } */ > +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ > +/* { dg-add-options arm_v8_2a_bf16_neon } */ > +/* { dg-final { check-function-bodies "**" "" } } */ > > > I think this should include an optimisation option like -O2 because... > > + > +#include "arm_neon.h" > + > + > +/* > +**test_vld2_bf16: > +** ... > +** vld2.16 {d16-d17}, \[r3\] > > ... this is unstable codegen depending on the -O0 register allocator > moving the ptr argument to r3 from its initial r0. > This should really be r0 and the load instruction should load the low D > regs. > So let's add an -O2 to the dg-options and scan for the result of that. > > > Otherwise this is ok. > Thanks! > Kyrill > > > +** ... 
> +*/ > +bfloat16x4x2_t > +test_vld2_bf16 (bfloat16_t * ptr) > +{ > + vld2_bf16 (ptr); > +} > + > [-- Attachment #2: rb12473.patch --] [-- Type: text/x-patch, Size: 18332 bytes --] diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index 425a2a49b69d7e3070059dd0a79ae3d306400f4b..2573cca6bb64f5104a1efd1379ef956f56d0fe04 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -19504,6 +19504,114 @@ vst4q_bf16 (bfloat16_t * __ptr, bfloat16x8x4_t __val) return __builtin_neon_vst4v8bf (__ptr, __bu.__o); } +__extension__ extern __inline bfloat16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_bf16 (bfloat16_t const * __ptr) +{ + union { bfloat16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2v4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2v8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3v4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3v8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x4_t __i; 
__builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4v4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4v8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2_dupv4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2_dupv8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_dupv4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3_dupv8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = 
__builtin_neon_vld4_dupv4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4_dupv8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + #pragma GCC pop_options #ifdef __cplusplus diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index d85a2d4b1fcf9e851f215dfdd4b305e59ded651c..e3c1652b9e92ff5024225279f26c1ccb197dcd69 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -320,29 +320,29 @@ VAR12 (STORE1, vst1, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v2di) VAR12 (STORE1LANE, vst1_lane, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v2di) -VAR11 (LOAD1, vld2, - v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) +VAR13 (LOAD1, vld2, + v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v4bf, v8bf) VAR9 (LOAD1LANE, vld2_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR6 (LOAD1, vld2_dup, v8qi, v4hi, v4hf, v2si, v2sf, di) +VAR8 (LOAD1, vld2_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf) VAR13 (STORE1, vst2, v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf) VAR9 (STORE1LANE, vst2_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR11 (LOAD1, vld3, - v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) +VAR13 (LOAD1, vld3, + v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v4bf, v8bf) VAR9 (LOAD1LANE, vld3_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR6 (LOAD1, vld3_dup, v8qi, v4hi, v4hf, v2si, v2sf, di) +VAR8 (LOAD1, vld3_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf) VAR13 (STORE1, vst3, v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, 
v8bf, v4si, v4sf) VAR9 (STORE1LANE, vst3_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR11 (LOAD1, vld4, - v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) +VAR13 (LOAD1, vld4, + v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v4bf, v8bf) VAR9 (LOAD1LANE, vld4_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR6 (LOAD1, vld4_dup, v8qi, v4hi, v4hf, v2si, v2sf, di) +VAR8 (LOAD1, vld4_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf) VAR13 (STORE1, vst4, v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf) VAR9 (STORE1LANE, vst4_lane, diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 0c03e747c3643e018f4f62dda5e832dfb1af758f..7401f16ef59b9854bbc85f98cfdcdd7a8a600337 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -87,6 +87,9 @@ ;; Double-width vector modes plus 64-bit elements, including V4BF. (define_mode_iterator VDXBF [V8QI V4HI V4HF (V4BF "TARGET_BF16_SIMD") V2SI V2SF DI]) +;; Double-width vector modes plus 64-bit elements, V4BF and V8BF. +(define_mode_iterator VDXBF2 [V8QI V4HI V4HF V2SI V2SF DI (V4BF "TARGET_BF16_SIMD") (V8BF ("TARGET_BF16_SIMD"))]) + ;; Double-width vector modes plus 64-bit elements, ;; with V4BFmode added, suitable for moves. 
(define_mode_iterator VDXMOV [V8QI V4HI V4HF V4BF V2SI V2SF DI]) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index fcf59aee32a955b6bb3e7b98a4d880a0e631b4be..5117f78dd2dce442bc738de6082686421fcdca52 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -5428,7 +5428,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld2<mode>" [(set (match_operand:TI 0 "s_register_operand" "=w") (unspec:TI [(match_operand:TI 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD2))] "TARGET_NEON" { @@ -5453,7 +5453,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld2<mode>" [(set (match_operand:OI 0 "s_register_operand" "=w") (unspec:OI [(match_operand:OI 1 "neon_struct_operand" "Um") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD2))] "TARGET_NEON" "vld2.<V_sz_elem>\t%h0, %A1" @@ -5516,7 +5516,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld2_dup<mode>" [(set (match_operand:TI 0 "s_register_operand" "=w") (unspec:TI [(match_operand:<V_two_elem> 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD2_DUP))] "TARGET_NEON" { @@ -5531,6 +5531,27 @@ if (BYTES_BIG_ENDIAN) (const_string "neon_load1_1reg<q>")))] ) +(define_insn "neon_vld2_dupv8bf" + [(set (match_operand:OI 0 "s_register_operand" "=w") + (unspec:OI [(match_operand:V2BF 1 "neon_struct_operand" "Um") + (unspec:V8BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD2_DUP))] + "TARGET_BF16_SIMD" + { + rtx ops[5]; + int tabbase = REGNO (operands[0]); + + ops[4] = operands[1]; + ops[0] = gen_rtx_REG (V4BFmode, tabbase); + ops[1] = gen_rtx_REG (V4BFmode, tabbase + 2); + ops[2] = gen_rtx_REG (V4BFmode, tabbase + 4); + ops[3] = gen_rtx_REG (V4BFmode, tabbase + 6); + output_asm_insn ("vld2.16\t{%P0, %P1, %P2, %P3}, %A4", ops); + return ""; + } + [(set_attr 
"type" "neon_load2_all_lanes_q")] +) + (define_expand "vec_store_lanesti<mode>" [(set (match_operand:TI 0 "neon_struct_operand") (unspec:TI [(match_operand:TI 1 "s_register_operand") @@ -5637,7 +5658,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld3<mode>" [(set (match_operand:EI 0 "s_register_operand" "=w") (unspec:EI [(match_operand:EI 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3))] "TARGET_NEON" { @@ -5665,7 +5686,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vld3<mode>" [(match_operand:CI 0 "s_register_operand") (match_operand:CI 1 "neon_struct_operand") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_NEON" { rtx mem; @@ -5680,7 +5701,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld3qa<mode>" [(set (match_operand:CI 0 "s_register_operand" "=w") (unspec:CI [(match_operand:EI 1 "neon_struct_operand" "Um") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3A))] "TARGET_NEON" { @@ -5700,7 +5721,7 @@ if (BYTES_BIG_ENDIAN) [(set (match_operand:CI 0 "s_register_operand" "=w") (unspec:CI [(match_operand:EI 1 "neon_struct_operand" "Um") (match_operand:CI 2 "s_register_operand" "0") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3B))] "TARGET_NEON" { @@ -5777,7 +5798,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld3_dup<mode>" [(set (match_operand:EI 0 "s_register_operand" "=w") (unspec:EI [(match_operand:<V_three_elem> 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3_DUP))] "TARGET_NEON" { @@ -5800,6 +5821,26 @@ if (BYTES_BIG_ENDIAN) (const_string "neon_load3_all_lanes<q>") (const_string "neon_load1_1reg<q>")))]) +(define_insn "neon_vld3_dupv8bf" + [(set 
(match_operand:CI 0 "s_register_operand" "=w") + (unspec:CI [(match_operand:V2BF 1 "neon_struct_operand" "Um") + (unspec:V8BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD2_DUP))] + "TARGET_BF16_SIMD" + { + rtx ops[4]; + int tabbase = REGNO (operands[0]); + + ops[3] = operands[1]; + ops[0] = gen_rtx_REG (V4BFmode, tabbase); + ops[1] = gen_rtx_REG (V4BFmode, tabbase + 2); + ops[2] = gen_rtx_REG (V4BFmode, tabbase + 4); + output_asm_insn ("vld3.16\t{%P0[], %P1[], %P2[]}, %A3", ops); + return ""; + } + [(set_attr "type" "neon_load3_all_lanes_q")] +) + (define_expand "vec_store_lanesei<mode>" [(set (match_operand:EI 0 "neon_struct_operand") (unspec:EI [(match_operand:EI 1 "s_register_operand") @@ -5955,7 +5996,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld4<mode>" [(set (match_operand:OI 0 "s_register_operand" "=w") (unspec:OI [(match_operand:OI 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4))] "TARGET_NEON" { @@ -5983,7 +6024,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vld4<mode>" [(match_operand:XI 0 "s_register_operand") (match_operand:XI 1 "neon_struct_operand") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_NEON" { rtx mem; @@ -5998,7 +6039,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld4qa<mode>" [(set (match_operand:XI 0 "s_register_operand" "=w") (unspec:XI [(match_operand:OI 1 "neon_struct_operand" "Um") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4A))] "TARGET_NEON" { @@ -6019,7 +6060,7 @@ if (BYTES_BIG_ENDIAN) [(set (match_operand:XI 0 "s_register_operand" "=w") (unspec:XI [(match_operand:OI 1 "neon_struct_operand" "Um") (match_operand:XI 2 "s_register_operand" "0") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4B))] "TARGET_NEON" { @@ 
-6099,7 +6140,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld4_dup<mode>" [(set (match_operand:OI 0 "s_register_operand" "=w") (unspec:OI [(match_operand:<V_four_elem> 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4_DUP))] "TARGET_NEON" { @@ -6125,6 +6166,27 @@ if (BYTES_BIG_ENDIAN) (const_string "neon_load1_1reg<q>")))] ) +(define_insn "neon_vld4_dupv8bf" + [(set (match_operand:XI 0 "s_register_operand" "=w") + (unspec:XI [(match_operand:V2BF 1 "neon_struct_operand" "Um") + (unspec:V8BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD2_DUP))] + "TARGET_BF16_SIMD" + { + rtx ops[5]; + int tabbase = REGNO (operands[0]); + + ops[4] = operands[1]; + ops[0] = gen_rtx_REG (V4BFmode, tabbase); + ops[1] = gen_rtx_REG (V4BFmode, tabbase + 2); + ops[2] = gen_rtx_REG (V4BFmode, tabbase + 4); + ops[3] = gen_rtx_REG (V4BFmode, tabbase + 6); + output_asm_insn ("vld4.16\t{%P0[], %P1[], %P2[], %P3[]}, %A4", ops); + return ""; + } + [(set_attr "type" "neon_load4_all_lanes_q")] +) + (define_expand "vec_store_lanesoi<mode>" [(set (match_operand:OI 0 "neon_struct_operand") (unspec:OI [(match_operand:OI 1 "s_register_operand") diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c new file mode 100644 index 0000000000000000000000000000000000000000..222e7af945383bd93b6b280b516a56e684f1d651 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c @@ -0,0 +1,152 @@ +/* { dg-do assemble } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-save-temps -O2 -mfloat-abi=hard" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_neon.h" + + +/* +**test_vld2_bf16: +** ... 
+** vld2.16 {d0-d1}, \[r0\] +** bx lr +*/ +bfloat16x4x2_t +test_vld2_bf16 (bfloat16_t * ptr) +{ + return vld2_bf16 (ptr); +} + +/* +**test_vld2q_bf16: +** ... +** vld2.16 {d0-d3}, \[r0\] +** bx lr +*/ +bfloat16x8x2_t +test_vld2q_bf16 (bfloat16_t * ptr) +{ + return vld2q_bf16 (ptr); +} + +/* +**test_vld2_dup_bf16: +** ... +** vld2.16 {d0\[\], d1\[\]}, \[r0\] +** bx lr +*/ +bfloat16x4x2_t +test_vld2_dup_bf16 (bfloat16_t * ptr) +{ + return vld2_dup_bf16 (ptr); +} + +/* +**test_vld2q_dup_bf16: +** ... +** vld2.16 {d0, d1, d2, d3}, \[r0\] +** bx lr +*/ +bfloat16x8x2_t +test_vld2q_dup_bf16 (bfloat16_t * ptr) +{ + return vld2q_dup_bf16 (ptr); +} + +/* +**test_vld3_bf16: +** ... +** vld3.16 {d0-d2}, \[r0\] +** bx lr +*/ +bfloat16x4x3_t +test_vld3_bf16 (bfloat16_t * ptr) +{ + return vld3_bf16 (ptr); +} + +/* +**test_vld3q_bf16: +** ... +** vld3.16 {d1, d3, d5}, \[r0\] +** bx lr +*/ +bfloat16x8x3_t +test_vld3q_bf16 (bfloat16_t * ptr) +{ + return vld3q_bf16 (ptr); +} + +/* +**test_vld3_dup_bf16: +** ... +** vld3.16 {d0\[\], d1\[\], d2\[\]}, \[r0\] +** bx lr +*/ +bfloat16x4x3_t +test_vld3_dup_bf16 (bfloat16_t * ptr) +{ + return vld3_dup_bf16 (ptr); +} + +/* +**test_vld3q_dup_bf16: +** ... +** vld3.16 {d0\[\], d1\[\], d2\[\]}, \[r0\] +** bx lr +*/ +bfloat16x8x3_t +test_vld3q_dup_bf16 (bfloat16_t * ptr) +{ + return vld3q_dup_bf16 (ptr); +} + +/* +**test_vld4_bf16: +** ... +** vld4.16 {d0-d3}, \[r0\] +** bx lr +*/ +bfloat16x4x4_t +test_vld4_bf16 (bfloat16_t * ptr) +{ + return vld4_bf16 (ptr); +} + +/* +**test_vld4q_bf16: +** ... +** vld4.16 {d1, d3, d5, d7}, \[r0\] +** bx lr +*/ +bfloat16x8x4_t +test_vld4q_bf16 (bfloat16_t * ptr) +{ + return vld4q_bf16 (ptr); +} + +/* +**test_vld4_dup_bf16: +** ... +** vld4.16 {d0\[\], d1\[\], d2\[\], d3\[\]}, \[r0\] +** bx lr +*/ +bfloat16x4x4_t +test_vld4_dup_bf16 (bfloat16_t * ptr) +{ + return vld4_dup_bf16 (ptr); +} + +/* +**test_vld4q_dup_bf16: +** ... 
+** vld4.16 {d0\[\], d1\[\], d2\[\], d3\[\]}, \[r0\] +** bx lr +*/ +bfloat16x8x4_t +test_vld4q_dup_bf16 (bfloat16_t * ptr) +{ + return vld4q_dup_bf16 (ptr); +} ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: ACLE intrinsics: BFloat16 load intrinsics for AArch32 2020-03-05 16:39 ` Delia Burduv @ 2020-03-06 10:45 ` Kyrill Tkachov 2020-03-09 10:18 ` Christophe Lyon 0 siblings, 1 reply; 9+ messages in thread From: Kyrill Tkachov @ 2020-03-06 10:45 UTC (permalink / raw) To: Delia Burduv, gcc-patches; +Cc: nickc, Richard Earnshaw, Ramana Radhakrishnan Hi Delia, On 3/5/20 4:38 PM, Delia Burduv wrote: > Hi, > > This is the latest version of the patch. I am forcing -mfloat-abi=hard > because the code generated is slightly different depending on the > float-abi used. Thanks, I've pushed it with an updated ChangeLog. 2020-03-06 Delia Burduv <delia.burduv@arm.com> * config/arm/arm_neon.h (vld2_bf16): New. (vld2q_bf16): New. (vld3_bf16): New. (vld3q_bf16): New. (vld4_bf16): New. (vld4q_bf16): New. (vld2_dup_bf16): New. (vld2q_dup_bf16): New. (vld3_dup_bf16): New. (vld3q_dup_bf16): New. (vld4_dup_bf16): New. (vld4q_dup_bf16): New. * config/arm/arm_neon_builtins.def (vld2): Changed to VAR13 and added v4bf, v8bf (vld2_dup): Changed to VAR8 and added v4bf, v8bf (vld3): Changed to VAR13 and added v4bf, v8bf (vld3_dup): Changed to VAR8 and added v4bf, v8bf (vld4): Changed to VAR13 and added v4bf, v8bf (vld4_dup): Changed to VAR8 and added v4bf, v8bf * config/arm/iterators.md (VDXBF2): New iterator. * config/arm/neon.md (neon_vld2): Use new iterators. (neon_vld2_dup<mode>): Use new iterators. (neon_vld3<mode>): Likewise. (neon_vld3qa<mode>): Likewise. (neon_vld3qb<mode>): Likewise. (neon_vld3_dup<mode>): Likewise. (neon_vld4<mode>): Likewise. (neon_vld4qa<mode>): Likewise. (neon_vld4qb<mode>): Likewise. (neon_vld4_dup<mode>): Likewise. (neon_vld2_dupv8bf): New. (neon_vld3_dupv8bf): Likewise. (neon_vld4_dupv8bf): Likewise. 
Kyrill > > Thanks, > Delia > > On 3/4/20 5:20 PM, Kyrill Tkachov wrote: >> Hi Delia, >> >> On 3/4/20 2:05 PM, Delia Burduv wrote: >>> Hi, >>> >>> The previous version of this patch shared part of its code with the >>> store intrinsics patch >>> (https://gcc.gnu.org/ml/gcc-patches/2020-03/msg00145.html) so I removed >>> any duplicated code. This patch now depends on the previously mentioned >>> store intrinsics patch. >>> >>> Here is the latest version and the updated ChangeLog. >>> >>> gcc/ChangeLog: >>> >>> 2019-03-04 Delia Burduv <delia.burduv@arm.com> >>> >>> * config/arm/arm_neon.h (bfloat16_t): New typedef. >>> (vld2_bf16): New. >>> (vld2q_bf16): New. >>> (vld3_bf16): New. >>> (vld3q_bf16): New. >>> (vld4_bf16): New. >>> (vld4q_bf16): New. >>> (vld2_dup_bf16): New. >>> (vld2q_dup_bf16): New. >>> (vld3_dup_bf16): New. >>> (vld3q_dup_bf16): New. >>> (vld4_dup_bf16): New. >>> (vld4q_dup_bf16): New. >>> * config/arm/arm_neon_builtins.def >>> (vld2): Changed to VAR13 and added v4bf, v8bf >>> (vld2_dup): Changed to VAR8 and added v4bf, v8bf >>> (vld3): Changed to VAR13 and added v4bf, v8bf >>> (vld3_dup): Changed to VAR8 and added v4bf, v8bf >>> (vld4): Changed to VAR13 and added v4bf, v8bf >>> (vld4_dup): Changed to VAR8 and added v4bf, v8bf >>> * config/arm/iterators.md (VDXBF): New iterator. >>> (VQ2BF): New iterator. >>> *config/arm/neon.md (vld2): Used new iterators. >>> (vld2_dup<mode>): Used new iterators. >>> (vld2_dupv8bf): New. >>> (vst3): Used new iterators. >>> (vst3qa): Used new iterators. >>> (vst3qb): Used new iterators. >>> (vld3_dup<mode>): Used new iterators. >>> (vld3_dupv8bf): New. >>> (vst4): Used new iterators. >>> (vst4qa): Used new iterators. >>> (vst4qb): Used new iterators. >>> (vld4_dup<mode>): Used new iterators. >>> (vld4_dupv8bf): New. >>> >>> >>> gcc/testsuite/ChangeLog: >>> >>> 2019-03-04 Delia Burduv <delia.burduv@arm.com> >>> >>> * gcc.target/arm/simd/bf16_vldn_1.c: New test. 
>>> >>> Thanks, >>> Delia >>> >>> On 2/19/20 5:25 PM, Delia Burduv wrote: >>> > >>> > Hi, >>> > >>> > Here is the latest version of the patch. It just has some minor >>> > formatting changes that were brought up by Richard Sandiford in the >>> > AArch64 patches >>> > >>> > Thanks, >>> > Delia >>> > >>> > On 1/22/20 5:31 PM, Delia Burduv wrote: >>> >> Ping. >>> >> >>> >> I will change the tests to use the exact input and output >>> registers as >>> >> Richard Sandiford suggested for the AArch64 patches. >>> >> >>> >> On 12/20/19 6:48 PM, Delia Burduv wrote: >>> >>> This patch adds the ARMv8.6 ACLE BFloat16 load intrinsics >>> >>> vld<n>{q}_bf16 as part of the BFloat16 extension. >>> >>> >>> (https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics) >>> >>> >>> >>> >>> The intrinsics are declared in arm_neon.h . >>> >>> A new test is added to check assembler output. >>> >>> >>> >>> This patch depends on the Arm back-end patche. >>> >>> (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html) >>> >>> >>> >>> Tested for regression on arm-none-eabi and armeb-none-eabi. I don't >>> >>> have commit rights, so if this is ok can someone please commit >>> it for >>> >>> me? >>> >>> >>> >>> gcc/ChangeLog: >>> >>> >>> >>> 2019-11-14 Delia Burduv <delia.burduv@arm.com> >>> >>> >>> >>> * config/arm/arm_neon.h (bfloat16_t): New typedef. >>> >>> (bfloat16x4x2_t): New typedef. >>> >>> (bfloat16x8x2_t): New typedef. >>> >>> (bfloat16x4x3_t): New typedef. >>> >>> (bfloat16x8x3_t): New typedef. >>> >>> (bfloat16x4x4_t): New typedef. >>> >>> (bfloat16x8x4_t): New typedef. >>> >>> (vld2_bf16): New. >>> >>> (vld2q_bf16): New. >>> >>> (vld3_bf16): New. >>> >>> (vld3q_bf16): New. >>> >>> (vld4_bf16): New. >>> >>> (vld4q_bf16): New. >>> >>> (vld2_dup_bf16): New. >>> >>> (vld2q_dup_bf16): New. >>> >>> (vld3_dup_bf16): New. >>> >>> (vld3q_dup_bf16): New. >>> >>> (vld4_dup_bf16): New. >>> >>> (vld4q_dup_bf16): New. 
>>> >>> * config/arm/arm-builtins.c (E_V2BFmode): New mode. >>> >>> (VAR13): New. >>> >>> (arm_simd_types[Bfloat16x2_t]):New type. >>> >>> * config/arm/arm-modes.def (V2BF): New mode. >>> >>> * config/arm/arm-simd-builtin-types.def >>> >>> (Bfloat16x2_t): New entry. >>> >>> * config/arm/arm_neon_builtins.def >>> >>> (vld2): Changed to VAR13 and added v4bf, v8bf >>> >>> (vld2_dup): Changed to VAR8 and added v4bf, v8bf >>> >>> (vld3): Changed to VAR13 and added v4bf, v8bf >>> >>> (vld3_dup): Changed to VAR8 and added v4bf, v8bf >>> >>> (vld4): Changed to VAR13 and added v4bf, v8bf >>> >>> (vld4_dup): Changed to VAR8 and added v4bf, v8bf >>> >>> * config/arm/iterators.md (VDXBF): New iterator. >>> >>> (VQ2BF): New iterator. >>> >>> (V_elem): Added V4BF, V8BF. >>> >>> (V_sz_elem): Added V4BF, V8BF. >>> >>> (V_mode_nunits): Added V4BF, V8BF. >>> >>> (q): Added V4BF, V8BF. >>> >>> *config/arm/neon.md (vld2): Used new iterators. >>> >>> (vld2_dup<mode>): Used new iterators. >>> >>> (vld2_dupv8bf): New. >>> >>> (vst3): Used new iterators. >>> >>> (vst3qa): Used new iterators. >>> >>> (vst3qb): Used new iterators. >>> >>> (vld3_dup<mode>): Used new iterators. >>> >>> (vld3_dupv8bf): New. >>> >>> (vst4): Used new iterators. >>> >>> (vst4qa): Used new iterators. >>> >>> (vst4qb): Used new iterators. >>> >>> (vld4_dup<mode>): Used new iterators. >>> >>> (vld4_dupv8bf): New. >>> >>> >>> >>> >>> >>> gcc/testsuite/ChangeLog: >>> >>> >>> >>> 2019-11-14 Delia Burduv <delia.burduv@arm.com> >>> >>> >>> >>> * gcc.target/arm/simd/bf16_vldn_1.c: New test. 
>> >> >> diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c >> b/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c >> new file mode 100644 >> index >> 0000000000000000000000000000000000000000..7ff8b600827e5c2e313ce40d14382aa641b4bb31 >> >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c >> @@ -0,0 +1,152 @@ >> +/* { dg-do assemble } */ >> +/* { dg-options "-save-temps" } */ >> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ >> +/* { dg-add-options arm_v8_2a_bf16_neon } */ >> +/* { dg-final { check-function-bodies "**" "" } } */ >> >> >> I think this should include an optimisation option like -O2 because... >> >> + >> +#include "arm_neon.h" >> + >> + >> +/* >> +**test_vld2_bf16: >> +** ... >> +** vld2.16 {d16-d17}, \[r3\] >> >> ... this is unstable codegen depending on the -O0 register allocator >> moving the ptr argument to r3 from its initial r0. >> This should really be r0 and the load instruction should load the low >> D regs. >> So let's add an -O2 to the dg-options and scan for the result of that. >> >> >> Otherwise this is ok. >> Thanks! >> Kyrill >> >> >> +** ... >> +*/ >> +bfloat16x4x2_t >> +test_vld2_bf16 (bfloat16_t * ptr) >> +{ >> + vld2_bf16 (ptr); >> +} >> + >> ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: ACLE intrinsics: BFloat16 load intrinsics for AArch32 2020-03-06 10:45 ` Kyrill Tkachov @ 2020-03-09 10:18 ` Christophe Lyon 0 siblings, 0 replies; 9+ messages in thread From: Christophe Lyon @ 2020-03-09 10:18 UTC (permalink / raw) To: Kyrill Tkachov Cc: Delia Burduv, gcc-patches, nickc, Richard Earnshaw, Ramana Radhakrishnan On Fri, 6 Mar 2020 at 11:46, Kyrill Tkachov <kyrylo.tkachov@foss.arm.com> wrote: > > Hi Delia, > > On 3/5/20 4:38 PM, Delia Burduv wrote: > > Hi, > > > > This is the latest version of the patch. I am forcing -mfloat-abi=hard > > because the code generated is slightly different depending on the > > float-abi used. > > > Thanks, I've pushed it with an updated ChangeLog. > > 2020-03-06 Delia Burduv <delia.burduv@arm.com> > > * config/arm/arm_neon.h (vld2_bf16): New. > (vld2q_bf16): New. > (vld3_bf16): New. > (vld3q_bf16): New. > (vld4_bf16): New. > (vld4q_bf16): New. > (vld2_dup_bf16): New. > (vld2q_dup_bf16): New. > (vld3_dup_bf16): New. > (vld3q_dup_bf16): New. > (vld4_dup_bf16): New. > (vld4q_dup_bf16): New. > * config/arm/arm_neon_builtins.def > (vld2): Changed to VAR13 and added v4bf, v8bf > (vld2_dup): Changed to VAR8 and added v4bf, v8bf > (vld3): Changed to VAR13 and added v4bf, v8bf > (vld3_dup): Changed to VAR8 and added v4bf, v8bf > (vld4): Changed to VAR13 and added v4bf, v8bf > (vld4_dup): Changed to VAR8 and added v4bf, v8bf > * config/arm/iterators.md (VDXBF2): New iterator. > * config/arm/neon.md (neon_vld2): Use new iterators. > (neon_vld2_dup<mode>): Use new iterators. > (neon_vld3<mode>): Likewise. > (neon_vld3qa<mode>): Likewise. > (neon_vld3qb<mode>): Likewise. > (neon_vld3_dup<mode>): Likewise. > (neon_vld4<mode>): Likewise. > (neon_vld4qa<mode>): Likewise. > (neon_vld4qb<mode>): Likewise. > (neon_vld4_dup<mode>): Likewise. > (neon_vld2_dupv8bf): New. > (neon_vld3_dupv8bf): Likewise. > (neon_vld4_dupv8bf): Likewise. > > Kyrill Hi! There's a problem with the arm_neon.h update. 
on arm-none-linux-gnueabihf, there is a regression on g++.dg/other/pr54300.C and g++.dg/other/pr55073.C, because: FAIL: g++.dg/other/pr54300.C -std=gnu++98 (test for excess errors) Excess errors: /aci-gcc-fsf/builds/gcc-fsf-gccsrc/obj-arm-none-linux-gnueabihf/gcc3/gcc/include/arm_neon.h:19565:39: error: cannot convert 'const short int*' to 'const __bf16*' /aci-gcc-fsf/builds/gcc-fsf-gccsrc/obj-arm-none-linux-gnueabihf/gcc3/gcc/include/arm_neon.h:19574:39: error: cannot convert 'const short int*' to 'const __bf16*' [....] The same problem makes a lot (~365) of tests become unsupported on arm-none-linux-gnueabi: g++.dg/abi/mangle-arm-crypto.C g++.dg/abi/mangle-neon.C Can you fix it? Thanks Christophe > > > > > > Thanks, > > Delia > > > > On 3/4/20 5:20 PM, Kyrill Tkachov wrote: > >> Hi Delia, > >> > >> On 3/4/20 2:05 PM, Delia Burduv wrote: > >>> Hi, > >>> > >>> The previous version of this patch shared part of its code with the > >>> store intrinsics patch > >>> (https://gcc.gnu.org/ml/gcc-patches/2020-03/msg00145.html) so I removed > >>> any duplicated code. This patch now depends on the previously mentioned > >>> store intrinsics patch. > >>> > >>> Here is the latest version and the updated ChangeLog. > >>> > >>> gcc/ChangeLog: > >>> > >>> 2019-03-04 Delia Burduv <delia.burduv@arm.com> > >>> > >>> * config/arm/arm_neon.h (bfloat16_t): New typedef. > >>> (vld2_bf16): New. > >>> (vld2q_bf16): New. > >>> (vld3_bf16): New. > >>> (vld3q_bf16): New. > >>> (vld4_bf16): New. > >>> (vld4q_bf16): New. > >>> (vld2_dup_bf16): New. > >>> (vld2q_dup_bf16): New. > >>> (vld3_dup_bf16): New. > >>> (vld3q_dup_bf16): New. > >>> (vld4_dup_bf16): New. > >>> (vld4q_dup_bf16): New. 
> >>> * config/arm/arm_neon_builtins.def > >>> (vld2): Changed to VAR13 and added v4bf, v8bf > >>> (vld2_dup): Changed to VAR8 and added v4bf, v8bf > >>> (vld3): Changed to VAR13 and added v4bf, v8bf > >>> (vld3_dup): Changed to VAR8 and added v4bf, v8bf > >>> (vld4): Changed to VAR13 and added v4bf, v8bf > >>> (vld4_dup): Changed to VAR8 and added v4bf, v8bf > >>> * config/arm/iterators.md (VDXBF): New iterator. > >>> (VQ2BF): New iterator. > >>> *config/arm/neon.md (vld2): Used new iterators. > >>> (vld2_dup<mode>): Used new iterators. > >>> (vld2_dupv8bf): New. > >>> (vst3): Used new iterators. > >>> (vst3qa): Used new iterators. > >>> (vst3qb): Used new iterators. > >>> (vld3_dup<mode>): Used new iterators. > >>> (vld3_dupv8bf): New. > >>> (vst4): Used new iterators. > >>> (vst4qa): Used new iterators. > >>> (vst4qb): Used new iterators. > >>> (vld4_dup<mode>): Used new iterators. > >>> (vld4_dupv8bf): New. > >>> > >>> > >>> gcc/testsuite/ChangeLog: > >>> > >>> 2019-03-04 Delia Burduv <delia.burduv@arm.com> > >>> > >>> * gcc.target/arm/simd/bf16_vldn_1.c: New test. > >>> > >>> Thanks, > >>> Delia > >>> > >>> On 2/19/20 5:25 PM, Delia Burduv wrote: > >>> > > >>> > Hi, > >>> > > >>> > Here is the latest version of the patch. It just has some minor > >>> > formatting changes that were brought up by Richard Sandiford in the > >>> > AArch64 patches > >>> > > >>> > Thanks, > >>> > Delia > >>> > > >>> > On 1/22/20 5:31 PM, Delia Burduv wrote: > >>> >> Ping. > >>> >> > >>> >> I will change the tests to use the exact input and output > >>> registers as > >>> >> Richard Sandiford suggested for the AArch64 patches. > >>> >> > >>> >> On 12/20/19 6:48 PM, Delia Burduv wrote: > >>> >>> This patch adds the ARMv8.6 ACLE BFloat16 load intrinsics > >>> >>> vld<n>{q}_bf16 as part of the BFloat16 extension. > >>> >>> > >>> (https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics) > >>> > >>> >>> > >>> >>> The intrinsics are declared in arm_neon.h . 
> >>> >>> A new test is added to check assembler output. > >>> >>> > >>> >>> This patch depends on the Arm back-end patche. > >>> >>> (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html) > >>> >>> > >>> >>> Tested for regression on arm-none-eabi and armeb-none-eabi. I don't > >>> >>> have commit rights, so if this is ok can someone please commit > >>> it for > >>> >>> me? > >>> >>> > >>> >>> gcc/ChangeLog: > >>> >>> > >>> >>> 2019-11-14 Delia Burduv <delia.burduv@arm.com> > >>> >>> > >>> >>> * config/arm/arm_neon.h (bfloat16_t): New typedef. > >>> >>> (bfloat16x4x2_t): New typedef. > >>> >>> (bfloat16x8x2_t): New typedef. > >>> >>> (bfloat16x4x3_t): New typedef. > >>> >>> (bfloat16x8x3_t): New typedef. > >>> >>> (bfloat16x4x4_t): New typedef. > >>> >>> (bfloat16x8x4_t): New typedef. > >>> >>> (vld2_bf16): New. > >>> >>> (vld2q_bf16): New. > >>> >>> (vld3_bf16): New. > >>> >>> (vld3q_bf16): New. > >>> >>> (vld4_bf16): New. > >>> >>> (vld4q_bf16): New. > >>> >>> (vld2_dup_bf16): New. > >>> >>> (vld2q_dup_bf16): New. > >>> >>> (vld3_dup_bf16): New. > >>> >>> (vld3q_dup_bf16): New. > >>> >>> (vld4_dup_bf16): New. > >>> >>> (vld4q_dup_bf16): New. > >>> >>> * config/arm/arm-builtins.c (E_V2BFmode): New mode. > >>> >>> (VAR13): New. > >>> >>> (arm_simd_types[Bfloat16x2_t]):New type. > >>> >>> * config/arm/arm-modes.def (V2BF): New mode. > >>> >>> * config/arm/arm-simd-builtin-types.def > >>> >>> (Bfloat16x2_t): New entry. > >>> >>> * config/arm/arm_neon_builtins.def > >>> >>> (vld2): Changed to VAR13 and added v4bf, v8bf > >>> >>> (vld2_dup): Changed to VAR8 and added v4bf, v8bf > >>> >>> (vld3): Changed to VAR13 and added v4bf, v8bf > >>> >>> (vld3_dup): Changed to VAR8 and added v4bf, v8bf > >>> >>> (vld4): Changed to VAR13 and added v4bf, v8bf > >>> >>> (vld4_dup): Changed to VAR8 and added v4bf, v8bf > >>> >>> * config/arm/iterators.md (VDXBF): New iterator. > >>> >>> (VQ2BF): New iterator. > >>> >>> (V_elem): Added V4BF, V8BF. 
> >>> >>> (V_sz_elem): Added V4BF, V8BF. > >>> >>> (V_mode_nunits): Added V4BF, V8BF. > >>> >>> (q): Added V4BF, V8BF. > >>> >>> *config/arm/neon.md (vld2): Used new iterators. > >>> >>> (vld2_dup<mode>): Used new iterators. > >>> >>> (vld2_dupv8bf): New. > >>> >>> (vst3): Used new iterators. > >>> >>> (vst3qa): Used new iterators. > >>> >>> (vst3qb): Used new iterators. > >>> >>> (vld3_dup<mode>): Used new iterators. > >>> >>> (vld3_dupv8bf): New. > >>> >>> (vst4): Used new iterators. > >>> >>> (vst4qa): Used new iterators. > >>> >>> (vst4qb): Used new iterators. > >>> >>> (vld4_dup<mode>): Used new iterators. > >>> >>> (vld4_dupv8bf): New. > >>> >>> > >>> >>> > >>> >>> gcc/testsuite/ChangeLog: > >>> >>> > >>> >>> 2019-11-14 Delia Burduv <delia.burduv@arm.com> > >>> >>> > >>> >>> * gcc.target/arm/simd/bf16_vldn_1.c: New test. > >> > >> > >> diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c > >> b/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c > >> new file mode 100644 > >> index > >> 0000000000000000000000000000000000000000..7ff8b600827e5c2e313ce40d14382aa641b4bb31 > >> > >> --- /dev/null > >> +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c > >> @@ -0,0 +1,152 @@ > >> +/* { dg-do assemble } */ > >> +/* { dg-options "-save-temps" } */ > >> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ > >> +/* { dg-add-options arm_v8_2a_bf16_neon } */ > >> +/* { dg-final { check-function-bodies "**" "" } } */ > >> > >> > >> I think this should include an optimisation option like -O2 because... > >> > >> + > >> +#include "arm_neon.h" > >> + > >> + > >> +/* > >> +**test_vld2_bf16: > >> +** ... > >> +** vld2.16 {d16-d17}, \[r3\] > >> > >> ... this is unstable codegen depending on the -O0 register allocator > >> moving the ptr argument to r3 from its initial r0. > >> This should really be r0 and the load instruction should load the low > >> D regs. > >> So let's add an -O2 to the dg-options and scan for the result of that. 
> >> > >> > >> Otherwise this is ok. > >> Thanks! > >> Kyrill > >> > >> > >> +** ... > >> +*/ > >> +bfloat16x4x2_t > >> +test_vld2_bf16 (bfloat16_t * ptr) > >> +{ > >> + vld2_bf16 (ptr); > >> +} > >> + > >> ^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2020-03-09 10:18 UTC | newest] Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2019-12-20 19:04 ACLE intrinsics: BFloat16 load intrinsics for AArch32 Delia Burduv 2020-01-22 18:20 ` Delia Burduv 2020-01-28 17:18 ` Delia Burduv 2020-02-19 17:26 ` Delia Burduv 2020-03-04 14:05 ` Delia Burduv 2020-03-04 17:21 ` Kyrill Tkachov 2020-03-05 16:39 ` Delia Burduv 2020-03-06 10:45 ` Kyrill Tkachov 2020-03-09 10:18 ` Christophe Lyon
This is a public inbox; see the mirroring instructions for how to clone and mirror all data and code used for this inbox, as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).