diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 09297831cdcd6e695843c17b7724c114f3a129fe..62aa82af12c602b089966a35ef6645a4d01264ad 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -42,6 +42,18 @@ extern "C" {
 #include <arm_fp16.h>
 #include <stdint.h>
 
+/* For big-endian, GCC's vector indices are reversed within each 64
+   bits compared to the architectural lane indices used by Neon
+   intrinsics.  */
+#ifdef __ARM_BIG_ENDIAN
+#define __ARM_NUM_LANES(__v) (sizeof (__v) / sizeof (__v[0]))
+#define __arm_lane(__vec, __idx) (__idx ^ (__ARM_NUM_LANES(__vec) - 1))
+#define __arm_laneq(__vec, __idx) (__idx ^ (__ARM_NUM_LANES(__vec)/2 - 1))
+#else
+#define __arm_lane(__vec, __idx) __idx
+#define __arm_laneq(__vec, __idx) __idx
+#endif
+
 typedef __simd64_int8_t int8x8_t;
 typedef __simd64_int16_t int16x4_t;
 typedef __simd64_int32_t int32x2_t;
@@ -6144,18 +6156,6 @@ vget_lane_s32 (int32x2_t __a, const int __b)
    were marked always-inline so there were no call sites, the
    declaration would nonetheless raise an error.  Hence, we must use a
    macro instead.  */
- /* For big-endian, GCC's vector indices are reversed within each 64
-    bits compared to the architectural lane indices used by Neon
-    intrinsics.  */
-#ifdef __ARM_BIG_ENDIAN
-#define __ARM_NUM_LANES(__v) (sizeof (__v) / sizeof (__v[0]))
-#define __arm_lane(__vec, __idx) (__idx ^ (__ARM_NUM_LANES(__vec) - 1))
-#define __arm_laneq(__vec, __idx) (__idx ^ (__ARM_NUM_LANES(__vec)/2 - 1))
-#else
-#define __arm_lane(__vec, __idx) __idx
-#define __arm_laneq(__vec, __idx) __idx
-#endif
-
 #define vget_lane_f16(__v, __idx)		\
   __extension__					\
   ({						\
@@ -14476,6 +14476,15 @@ vreinterpret_p16_u32 (uint32x2_t __a)
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
 __extension__ extern __inline float16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_f16_bf16 (bfloat16x4_t __a)
+{
+  return (float16x4_t) __a;
+}
+#endif
+
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpret_f16_p8 (poly8x8_t __a)
 {
   return (float16x4_t) __a;
@@ -15688,6 +15697,15 @@ vreinterpretq_f16_p16 (poly16x8_t __a)
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
 __extension__ extern __inline float16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_f16_bf16 (bfloat16x8_t __a)
+{
+  return (float16x8_t) __a;
+}
+#endif
+
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_f16_f32 (float32x4_t __a)
 {
   return (float16x8_t) __a;
@@ -18750,6 +18768,496 @@ vcmlaq_rot270_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b,
 
 #pragma GCC push_options
 #pragma GCC target ("arch=armv8.2-a+bf16")
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcreate_bf16 (uint64_t __a)
+{
+  return (bfloat16x4_t) __a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdup_n_bf16 (bfloat16_t __a)
+{
+  return __builtin_neon_vdup_nv4bf (__a);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdupq_n_bf16 (bfloat16_t __a)
+{
+  return __builtin_neon_vdup_nv8bf (__a);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdup_lane_bf16 (bfloat16x4_t __a, const int __b)
+{
+  return __builtin_neon_vdup_lanev4bf (__a, __b);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdupq_lane_bf16 (bfloat16x4_t __a, const int __b)
+{
+  return __builtin_neon_vdup_lanev8bf (__a, __b);
+}
+
+#define vset_lane_bf16(__e, __v, __idx)			\
+  __extension__						\
+  ({							\
+    bfloat16_t __elem = (__e);				\
+    bfloat16x4_t __vec = (__v);				\
+    __builtin_arm_lane_check (4, __idx);		\
+    __vec[__arm_lane(__vec, __idx)] = __elem;		\
+    __vec;						\
+  })
+
+#define vsetq_lane_bf16(__e, __v, __idx)		\
+  __extension__						\
+  ({							\
+    bfloat16_t __elem = (__e);				\
+    bfloat16x8_t __vec = (__v);				\
+    __builtin_arm_lane_check (8, __idx);		\
+    __vec[__arm_laneq(__vec, __idx)] = __elem;		\
+    __vec;						\
+  })
+
+#define vget_lane_bf16(__v, __idx)			\
+  __extension__						\
+  ({							\
+    bfloat16x4_t __vec = (__v);				\
+    __builtin_arm_lane_check (4, __idx);		\
+    bfloat16_t __res = __vec[__arm_lane(__vec, __idx)];	\
+    __res;						\
+  })
+
+#define vgetq_lane_bf16(__v, __idx)			\
+  __extension__						\
+  ({							\
+    bfloat16x8_t __vec = (__v);				\
+    __builtin_arm_lane_check (8, __idx);		\
+    bfloat16_t __res = __vec[__arm_laneq(__vec, __idx)]; \
+    __res;						\
+  })
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdup_laneq_bf16 (bfloat16x8_t __a, const int __b)
+{
+  return vdup_n_bf16 (vgetq_lane_bf16 (__a, __b));
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdupq_laneq_bf16 (bfloat16x8_t __a, const int __b)
+{
+  return vdupq_n_bf16 (vgetq_lane_bf16 (__a, __b));
+}
+
+__extension__ extern __inline bfloat16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vduph_lane_bf16 (bfloat16x4_t __a, const int __b)
+{
+  return vget_lane_bf16 (__a, __b);
+}
+
+__extension__ extern __inline bfloat16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vduph_laneq_bf16 (bfloat16x8_t __a, const int __b)
+{
+  return vgetq_lane_bf16 (__a, __b);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vget_high_bf16 (bfloat16x8_t __a)
+{
+  return __builtin_neon_vget_highv8bf (__a);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vget_low_bf16 (bfloat16x8_t __a)
+{
+  return __builtin_neon_vget_lowv8bf (__a);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcombine_bf16 (bfloat16x4_t __a, bfloat16x4_t __b)
+{
+  return __builtin_neon_vcombinev4bf (__a, __b);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_u8 (uint8x8_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_u16 (uint16x4_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_u32 (uint32x2_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_u64 (uint64x1_t __a)
+{
+  return 
(bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_s8 (int8x8_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_s16 (int16x4_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_s32 (int32x2_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_s64 (int64x1_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_p8 (poly8x8_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_p16 (poly16x4_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_p64 (poly64x1_t __a) +{ + return (bfloat16x4_t)__a; +} + +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_f16 (float16x4_t __a) +{ + return (bfloat16x4_t)__a; +} +#endif + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_f32 (float32x2_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_u8 (uint8x16_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_u16 (uint16x8_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_u32 (uint32x4_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_u64 (uint64x2_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_s8 (int8x16_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_s16 (int16x8_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_s32 (int32x4_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_s64 (int64x2_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_p8 (poly8x16_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t 
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_p16 (poly16x8_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_p64 (poly64x2_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_p128 (poly128_t __a) +{ + return (bfloat16x8_t)__a; +} + +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_f16 (float16x8_t __a) +{ + return (bfloat16x8_t)__a; +} +#endif + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_f32 (float32x4_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_s8_bf16 (bfloat16x4_t __a) +{ + return (int8x8_t)__a; +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_s16_bf16 (bfloat16x4_t __a) +{ + return (int16x4_t)__a; +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_s32_bf16 (bfloat16x4_t __a) +{ + return (int32x2_t)__a; +} + +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_s64_bf16 (bfloat16x4_t __a) +{ + return (int64x1_t)__a; +} + +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_u8_bf16 (bfloat16x4_t __a) +{ + return (uint8x8_t)__a; +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_u16_bf16 (bfloat16x4_t __a) +{ + return (uint16x4_t)__a; +} + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_u32_bf16 (bfloat16x4_t __a) +{ + return (uint32x2_t)__a; +} + +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_u64_bf16 (bfloat16x4_t __a) +{ + return (uint64x1_t)__a; +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_f32_bf16 (bfloat16x4_t __a) +{ + return (float32x2_t)__a; +} + +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p8_bf16 (bfloat16x4_t __a) +{ + return (poly8x8_t)__a; +} + +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p16_bf16 (bfloat16x4_t __a) +{ + return (poly16x4_t)__a; +} + +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p64_bf16 (bfloat16x4_t __a) +{ + return (poly64x1_t)__a; +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_s8_bf16 (bfloat16x8_t __a) +{ + return (int8x16_t)__a; +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_s16_bf16 
(bfloat16x8_t __a) +{ + return (int16x8_t)__a; +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_s32_bf16 (bfloat16x8_t __a) +{ + return (int32x4_t)__a; +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_s64_bf16 (bfloat16x8_t __a) +{ + return (int64x2_t)__a; +} + +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_u8_bf16 (bfloat16x8_t __a) +{ + return (uint8x16_t)__a; +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_u16_bf16 (bfloat16x8_t __a) +{ + return (uint16x8_t)__a; +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_u32_bf16 (bfloat16x8_t __a) +{ + return (uint32x4_t)__a; +} + +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_u64_bf16 (bfloat16x8_t __a) +{ + return (uint64x2_t)__a; +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_f32_bf16 (bfloat16x8_t __a) +{ + return (float32x4_t)__a; +} + +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p8_bf16 (bfloat16x8_t __a) +{ + return (poly8x16_t)__a; +} + +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p16_bf16 (bfloat16x8_t __a) +{ + return (poly16x8_t)__a; +} + +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p64_bf16 (bfloat16x8_t __a) +{ + return (poly64x2_t)__a; +} + +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p128_bf16 (bfloat16x8_t __a) +{ + return (poly128_t)__a; +} + __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b) diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index 85aeaecf0dc7579f511d0979708635ed65399614..bf28b24b108a081a023aa76f70d4da8bc0cc2d7e 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -221,13 +221,13 @@ VAR10 (SETLANE, vset_lane, VAR5 (UNOP, vcreate, v8qi, v4hi, v2si, v2sf, di) VAR10 (UNOP, vdup_n, v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) -VAR2 (UNOP, vdup_n, v8hf, v4hf) +VAR4 (UNOP, vdup_n, v8hf, v4hf, v8bf, v4bf) VAR10 (GETLANE, vdup_lane, v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) -VAR2 (GETLANE, vdup_lane, v8hf, v4hf) -VAR6 (COMBINE, vcombine, v8qi, v4hi, v4hf, v2si, v2sf, di) -VAR6 (UNOP, vget_high, v16qi, v8hi, v8hf, v4si, v4sf, v2di) -VAR6 (UNOP, vget_low, v16qi, v8hi, v8hf, v4si, v4sf, v2di) +VAR4 (GETLANE, vdup_lane, v8hf, v4hf, v8bf, v4bf) +VAR7 (COMBINE, vcombine, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf) +VAR7 (UNOP, vget_high, v16qi, v8hi, v8hf, v8bf, v4si, v4sf, v2di) +VAR7 (UNOP, vget_low, v16qi, v8hi, v8hf, v8bf, v4si, v4sf, v2di) VAR3 (UNOP, vmovn, v8hi, v4si, v2di) VAR3 (UNOP, vqmovns, v8hi, v4si, v2di) VAR3 (UNOP, vqmovnu, v8hi, v4si, v2di) @@ -376,4 +376,4 @@ VAR2 (MAC_LANE_PAIR, vcmlaq_lane270, v4sf, 
v8hf) VAR2 (TERNOP, vbfdot, v2sf, v4sf) VAR2 (MAC_LANE_PAIR, vbfdot_lanev4bf, v2sf, v4sf) -VAR2 (MAC_LANE_PAIR, vbfdot_lanev8bf, v2sf, v4sf) \ No newline at end of file +VAR2 (MAC_LANE_PAIR, vbfdot_lanev8bf, v2sf, v4sf) diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index cf5bfb4c77a7be0400bada8c517b877537f4d2c6..1b6aada0d0879a7f521bf868ad2c19166962fff2 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -82,14 +82,14 @@ (define_mode_iterator VD_RE [V8QI V4HI V2SI V2SF DI]) ;; Double-width vector modes plus 64-bit elements. -(define_mode_iterator VDX [V8QI V4HI V4HF V2SI V2SF DI]) +(define_mode_iterator VDX [V8QI V4HI V4HF V4BF V2SI V2SF DI]) ;; Double-width vector modes plus 64-bit elements, ;; with V4BFmode added, suitable for moves. (define_mode_iterator VDXMOV [V8QI V4HI V4HF V4BF V2SI V2SF DI]) ;; Double-width vector modes, with V4HF - for vldN_lane and vstN_lane. -(define_mode_iterator VD_LANE [V8QI V4HI V4HF V2SI V2SF]) +(define_mode_iterator VD_LANE [V8QI V4HI V4HF V4BF V2SI V2SF]) ;; Double-width vector modes without floating-point elements. (define_mode_iterator VDI [V8QI V4HI V2SI]) @@ -104,7 +104,7 @@ (define_mode_iterator VQ_HS [V8HI V8HF V4SI V4SF]) ;; Quad-width vector modes plus 64-bit elements. -(define_mode_iterator VQX [V16QI V8HI V8HF V4SI V4SF V2DI]) +(define_mode_iterator VQX [V16QI V8HI V8HF V8BF V4SI V4SF V2DI]) ;; Quad-width vector modes without floating-point elements. (define_mode_iterator VQI [V16QI V8HI V4SI]) @@ -153,7 +153,7 @@ ;; Vector modes, including 64-bit integer elements. (define_mode_iterator VDQX [V8QI V16QI V4HI V8HI V2SI V4SI - V4HF V8HF V2SF V4SF DI V2DI]) + V4HF V8HF V4BF V8BF V2SF V4SF DI V2DI]) ;; Vector modes including 64-bit integer elements, but no floats. (define_mode_iterator VDQIX [V8QI V16QI V4HI V8HI V2SI V4SI DI V2DI]) @@ -518,6 +518,7 @@ (define_mode_attr V_elem [(V8QI "QI") (V16QI "QI") (V4HI "HI") (V8HI "HI") (V4HF "HF") (V8HF "HF") + (V4BF "BF") (V8BF "BF") (V2SI "SI") (V4SI "SI") (V2SF "SF") (V4SF "SF") (DI "DI") (V2DI "DI")]) @@ -526,6 +527,7 @@ (define_mode_attr V_elem_l [(V8QI "qi") (V16QI "qi") (V4HI "hi") (V8HI "hi") (V4HF "hf") (V8HF "hf") + (V4BF "bf") (V8BF "bf") (V2SI "si") (V4SI "si") (V2SF "sf") (V4SF "sf") (DI "di") (V2DI "di")]) @@ -543,6 +545,7 @@ (define_mode_attr V_two_elem [(V8QI "HI") (V16QI "HI") (V4HI "SI") (V8HI "SI") (V4HF "SF") (V8HF "SF") + (V4BF "BF") (V8BF "BF") (V2SI "V2SI") (V4SI "V2SI") (V2SF "V2SF") (V4SF "V2SF") (DI "V2DI") (V2DI "V2DI")]) @@ -563,6 +566,7 @@ (define_mode_attr V_three_elem [(V8QI "BLK") (V16QI "BLK") (V4HI "BLK") (V8HI "BLK") (V4HF "BLK") (V8HF "BLK") + (V4BF "BLK") (V8BF "BLK") (V2SI "BLK") (V4SI "BLK") (V2SF "BLK") (V4SF "BLK") (DI "EI") (V2DI "EI")]) @@ -571,6 +575,7 @@ (define_mode_attr V_four_elem [(V8QI "SI") (V16QI "SI") (V4HI "V4HI") (V8HI "V4HI") (V4HF "V4HF") (V8HF "V4HF") + (V4BF "V4BF") (V8BF "V4BF") (V2SI "V4SI") (V4SI "V4SI") (V2SF "V4SF") (V4SF "V4SF") (DI "OI") (V2DI "OI")]) @@ -579,6 +584,7 @@ (define_mode_attr V_reg [(V8QI "P") (V16QI "q") (V4HI "P") (V8HI "q") (V4HF "P") (V8HF "q") + (V4BF "P") (V8BF "q") (V2SI "P") (V4SI "q") (V2SF "P") (V4SF "q") (DI "P") (V2DI "q") @@ -609,7 +615,8 @@ (define_mode_attr V_HALF [(V16QI "V8QI") (V8HI "V4HI") (V8HF "V4HF") (V4SI "V2SI") (V4SF "V2SF") (V2DF "DF") - (V2DI "DI") (V4HF "HF")]) + (V2DI "DI") (V4HF "HF") + (V4BF "BF") (V8BF "V4BF")]) ;; Same, but lower-case. 
(define_mode_attr V_half [(V16QI "v8qi") (V8HI "v4hi")
@@ -620,7 +627,7 @@
 (define_mode_attr V_DOUBLE [(V8QI "V16QI") (V4HI "V8HI")
 			    (V2SI "V4SI") (V4HF "V8HF")
 			    (V2SF "V4SF") (DF "V2DF")
-			    (DI "V2DI")])
+			    (DI "V2DI") (V4BF "V8BF")])
 
 ;; Same, but lower-case.
 (define_mode_attr V_double [(V8QI "v16qi") (V4HI "v8hi")
@@ -639,6 +646,7 @@
 				(V4SI "V2SI") (V4SF "V2SF")
 				(V8QI "V8QI") (V4HI "V4HI")
 				(V2SI "V2SI") (V2SF "V2SF")
+				(V8BF "V4BF") (V4BF "V4BF")
 				(V8HF "V4HF") (V4HF "V4HF")])
 
 ;; Mode of result of comparison operations (and bit-select operand 1).
@@ -646,6 +654,7 @@
 			       (V4HI "V4HI") (V8HI "V8HI")
 			       (V2SI "V2SI") (V4SI "V4SI")
 			       (V4HF "V4HI") (V8HF "V8HI")
+			       (V4BF "V4HI") (V8BF "V8HI")
 			       (V2SF "V2SI") (V4SF "V4SI")
 			       (DI "DI") (V2DI "V2DI")])
 
@@ -687,6 +696,7 @@
 			   (V4HI "u16") (V8HI "u16")
 			   (V2SI "32") (V4SI "32")
 			   (V4HF "u16") (V8HF "u16")
+			   (V4BF "u16") (V8BF "u16")
 			   (V2SF "32") (V4SF "32")])
 
 (define_mode_attr V_sz_elem [(V8QI "8") (V16QI "8")
@@ -694,6 +704,7 @@
 			     (V2SI "32") (V4SI "32")
 			     (DI "64") (V2DI "64")
 			     (V4HF "16") (V8HF "16")
+			     (V4BF "16") (V8BF "16")
 			     (V2SF "32") (V4SF "32")])
 
 (define_mode_attr V_elem_ch [(V8QI "b") (V16QI "b")
@@ -764,10 +775,12 @@
 			  (V2SI "true") (V4SI "false")
 			  (V2SF "true") (V4SF "false")
 			  (DI "true") (V2DI "false")
+			  (V4BF "true") (V8BF "false")
 			  (V4HF "true") (V8HF "false")])
 
 (define_mode_attr V_mode_nunits [(V8QI "8") (V16QI "16")
 				 (V4HF "4") (V8HF "8")
+				 (V4BF "4") (V8BF "8")
 				 (V4HI "4") (V8HI "8")
 				 (V2SI "2") (V4SI "4")
 				 (V2SF "2") (V4SF "4")
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 21701b34fcd2c86bfd310904c7ceca0ce9fb047e..e732600719e2c0df35e1ec0a4ed1cb235dc25726 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -3704,6 +3704,22 @@ if (BYTES_BIG_ENDIAN)
   [(set_attr "type" "neon_from_gp_q")]
 )
 
+(define_insn "neon_vdup_nv4bf"
+  [(set (match_operand:V4BF 0 "s_register_operand" "=w")
+        (vec_duplicate:V4BF (match_operand:BF 1 "s_register_operand" "r")))]
+  "TARGET_NEON"
+  "vdup.16\t%P0, %1"
+  [(set_attr "type" "neon_from_gp")]
+)
+
+(define_insn "neon_vdup_nv8bf"
+  [(set (match_operand:V8BF 0 "s_register_operand" "=w")
+        (vec_duplicate:V8BF (match_operand:BF 1 "s_register_operand" "r")))]
+  "TARGET_NEON"
+  "vdup.16\t%q0, %1"
+  [(set_attr "type" "neon_from_gp_q")]
+)
+
 (define_insn "neon_vdup_n<mode>"
   [(set (match_operand:V32 0 "s_register_operand" "=w,w")
         (vec_duplicate:V32 (match_operand:<V_elem> 1 "s_register_operand" "r,t")))]
@@ -3737,7 +3753,7 @@
 
 (define_insn "neon_vdup_lane<mode>_internal"
   [(set (match_operand:VDQW 0 "s_register_operand" "=w")
-	(vec_duplicate:VDQW 
+	(vec_duplicate:VDQW
 	  (vec_select:<V_elem>
 	    (match_operand:<V_double_vector_mode> 1 "s_register_operand" "w")
 	    (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
@@ -3758,12 +3774,12 @@ if (BYTES_BIG_ENDIAN)
 )
 
 (define_insn "neon_vdup_lane<mode>_internal"
- [(set (match_operand:VH 0 "s_register_operand" "=w")
-   (vec_duplicate:VH
+ [(set (match_operand:VHFBF 0 "s_register_operand" "=w")
+   (vec_duplicate:VHFBF
     (vec_select:<V_elem>
      (match_operand:<V_double_vector_mode> 1 "s_register_operand" "w")
      (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
- "TARGET_NEON && TARGET_FP16"
+ "TARGET_NEON && (TARGET_FP16 || TARGET_BF16_SIMD)"
 {
   if (BYTES_BIG_ENDIAN)
     {
@@ -3799,10 +3815,10 @@ })
 })
 
 (define_expand "neon_vdup_lane<mode>"
-  [(match_operand:VH 0 "s_register_operand")
+  [(match_operand:VHFBF 0 "s_register_operand")
   (match_operand:<V_double_vector_mode> 1 "s_register_operand")
   (match_operand:SI 2 "immediate_operand")]
-  "TARGET_NEON && TARGET_FP16"
+  "TARGET_NEON && (TARGET_FP16 || TARGET_BF16_SIMD)"
 {
   if (BYTES_BIG_ENDIAN)
     {
@@ -6599,4 +6615,4 @@ if (BYTES_BIG_ENDIAN)
     }
 }
  [(set_attr "type" "neon_dot")]
-)
\ No newline at end of file
+)
diff --git a/gcc/testsuite/gcc.target/arm/bf16_dup.c b/gcc/testsuite/gcc.target/arm/bf16_dup.c
new file mode 100644
index 0000000000000000000000000000000000000000..94be99a254bdaccc88a000615a4fa706bf324248
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/bf16_dup.c
@@ -0,0 +1,96 @@
+/* { dg-do assemble { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps -march=armv8.2-a+bf16+fp16 -mfloat-abi=softfp" } */
+
+#include "arm_neon.h"
+
+float32x2_t
+test_vbfdot_vcreate (float32x2_t r, uint64_t a, uint64_t b)
+{
+  bfloat16x4_t _a = vcreate_bf16(a);
+  bfloat16x4_t _b = vcreate_bf16(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+/* { dg-final { scan-assembler {vdot.bf16\td[0-9]+, d[0-9]+, d[0-9]+} } } */
+
+bfloat16x8_t test_vcombine_bf16 (bfloat16x4_t a, bfloat16x4_t b)
+{
+  return vcombine_bf16 (a, b);
+}
+
+bfloat16x4_t test_vget_high_bf16 (bfloat16x8_t a)
+{
+  return vget_high_bf16 (a);
+}
+
+bfloat16x4_t test_vget_low_bf16 (bfloat16x8_t a)
+{
+  return vget_low_bf16 (a);
+}
+
+bfloat16_t test_vget_lane_bf16 (bfloat16x4_t a)
+{
+  return vget_lane_bf16 (a, 1);
+}
+
+bfloat16_t test_vgetq_lane_bf16 (bfloat16x8_t a)
+{
+  return vgetq_lane_bf16 (a, 7);
+}
+
+bfloat16x4_t test_vset_lane_bf16 (bfloat16_t a, bfloat16x4_t b)
+{
+  return vset_lane_bf16 (a, b, 1);
+}
+
+bfloat16x8_t test_vsetq_lane_bf16 (bfloat16_t a, bfloat16x8_t b)
+{
+  return vsetq_lane_bf16 (a, b, 7);
+}
+
+bfloat16x4_t vdup_test (bfloat16_t a)
+{
+  return vdup_n_bf16 (a);
+}
+/* { dg-final { scan-assembler {vdup\.16\td[0-9]+, r[0-9]+} } } */
+
+bfloat16x8_t vdupq_test (bfloat16_t a)
+{
+  return vdupq_n_bf16 (a);
+}
+/* { dg-final { scan-assembler {vdup\.16\tq[0-9]+, r[0-9]+} } } */
+
+
+bfloat16x4_t test_vdup_lane_bf16 (bfloat16x4_t a)
+{
+  return vdup_lane_bf16 (a, 1);
+}
+/* { dg-final { scan-assembler-times {vdup\.16\td[0-9]+, d[0-9]+\[1\]} 1 } } */
+
+bfloat16x8_t test_vdupq_lane_bf16 (bfloat16x4_t a)
+{
+  return vdupq_lane_bf16 (a, 1);
+}
+/* { dg-final { scan-assembler-times {vdup\.16\tq[0-9]+, d[0-9]+\[1\]} 1 } } */
+
+bfloat16x4_t test_vdup_laneq_bf16 (bfloat16x8_t a)
+{
+  return vdup_laneq_bf16 (a, 3);
+}
+
+bfloat16x8_t test_vdupq_laneq_bf16 (bfloat16x8_t a)
+{
+  return vdupq_laneq_bf16 (a, 3);
+}
+
+bfloat16_t test_vduph_lane_bf16 (bfloat16x4_t a)
+{
+  return vduph_lane_bf16 (a, 1);
+}
+
+bfloat16_t test_vduph_laneq_bf16 (bfloat16x8_t a)
+{
+  return vduph_laneq_bf16 (a, 7);
+}
diff --git a/gcc/testsuite/gcc.target/arm/bf16_reinterpret.c b/gcc/testsuite/gcc.target/arm/bf16_reinterpret.c
new file mode 100644
index 0000000000000000000000000000000000000000..e7d30a95fbc3ceaf4a92057a10e6be4a34e1957c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/bf16_reinterpret.c
@@ -0,0 +1,435 @@
+/* { dg-do assemble { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps -march=armv8.2-a+fp16+bf16 -mfloat-abi=hard -mfpu=crypto-neon-fp-armv8" } */
+
+#include <arm_neon.h>
+
+float32x2_t
+test_vbfdot_f32_s8 (float32x2_t r, int8x8_t a, int8x8_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_s8(a);
+  bfloat16x4_t _b = vreinterpret_bf16_s8(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_s16 (float32x2_t r, int16x4_t a, int16x4_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_s16(a);
+  bfloat16x4_t 
_b = vreinterpret_bf16_s16(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_s32 (float32x2_t r, int32x2_t a, int32x2_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_s32(a); + bfloat16x4_t _b = vreinterpret_bf16_s32(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_s64 (float32x2_t r, int64x1_t a, int64x1_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_s64(a); + bfloat16x4_t _b = vreinterpret_bf16_s64(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_u8 (float32x2_t r, uint8x8_t a, uint8x8_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_u8(a); + bfloat16x4_t _b = vreinterpret_bf16_u8(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_u16 (float32x2_t r, uint16x4_t a, uint16x4_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_u16(a); + bfloat16x4_t _b = vreinterpret_bf16_u16(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_u32 (float32x2_t r, uint32x2_t a, uint32x2_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_u32(a); + bfloat16x4_t _b = vreinterpret_bf16_u32(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_u64 (float32x2_t r, uint64x1_t a, uint64x1_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_u64(a); + bfloat16x4_t _b = vreinterpret_bf16_u64(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_p8 (float32x2_t r, poly8x8_t a, poly8x8_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_p8(a); + bfloat16x4_t _b = vreinterpret_bf16_p8(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_p16 (float32x2_t r, poly16x4_t a, poly16x4_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_p16(a); + bfloat16x4_t _b = vreinterpret_bf16_p16(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_p64 (float32x2_t r, poly64x1_t a, poly64x1_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_p64(a); + bfloat16x4_t _b = vreinterpret_bf16_p64(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_f16 (float32x2_t r, float16x4_t a, float16x4_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_f16(a); + bfloat16x4_t _b = vreinterpret_bf16_f16(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_f32 (float32x2_t r, float32x2_t a, float32x2_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_f32(a); + bfloat16x4_t _b = vreinterpret_bf16_f32(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_s8 (float32x4_t r, int8x16_t a, int8x16_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_s8(a); + bfloat16x8_t _b = vreinterpretq_bf16_s8(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_s16 (float32x4_t r, int16x8_t a, int16x8_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_s16(a); + bfloat16x8_t _b = vreinterpretq_bf16_s16(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_s32 (float32x4_t r, int32x4_t a, int32x4_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_s32(a); + bfloat16x8_t _b = vreinterpretq_bf16_s32(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_s64 (float32x4_t r, int64x2_t a, int64x2_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_s64(a); + bfloat16x8_t _b = vreinterpretq_bf16_s64(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_u8 (float32x4_t r, uint8x16_t a, uint8x16_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_u8(a); + bfloat16x8_t _b = vreinterpretq_bf16_u8(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_u16 (float32x4_t r, uint16x8_t a, uint16x8_t b) 
+{ + bfloat16x8_t _a = vreinterpretq_bf16_u16(a); + bfloat16x8_t _b = vreinterpretq_bf16_u16(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_u32 (float32x4_t r, uint32x4_t a, uint32x4_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_u32(a); + bfloat16x8_t _b = vreinterpretq_bf16_u32(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_u64 (float32x4_t r, uint64x2_t a, uint64x2_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_u64(a); + bfloat16x8_t _b = vreinterpretq_bf16_u64(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_p8 (float32x4_t r, poly8x16_t a, poly8x16_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_p8(a); + bfloat16x8_t _b = vreinterpretq_bf16_p8(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_p16 (float32x4_t r, poly16x8_t a, poly16x8_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_p16(a); + bfloat16x8_t _b = vreinterpretq_bf16_p16(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_p64 (float32x4_t r, poly64x2_t a, poly64x2_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_p64(a); + bfloat16x8_t _b = vreinterpretq_bf16_p64(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_p128 (float32x4_t r, poly128_t a, poly128_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_p128(a); + bfloat16x8_t _b = vreinterpretq_bf16_p128(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_f16 (float32x4_t r, float16x8_t a, float16x8_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_f16(a); + bfloat16x8_t _b = vreinterpretq_bf16_f16(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_f32 (float32x4_t r, float32x4_t a, float32x4_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_f32(a); + bfloat16x8_t _b = vreinterpretq_bf16_f32(b); + + return vbfdotq_f32 (r, _a, _b); +} + +/* { dg-final { scan-assembler-times {\tvdot.bf16\td[0-9]+, d[0-9]+, d[0-9]+\n} 13 } } */ +/* { dg-final { scan-assembler-times {\tvdot.bf16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 14 } } */ + +int8x8_t test_vreinterpret_s8_bf16 (bfloat16x4_t a, int8x8_t b) +{ + int8x8_t _a = vreinterpret_s8_bf16 (a); + return vadd_s8 (_a, b); +} + +int16x4_t test_vreinterpret_s16_bf16 (bfloat16x4_t a, int16x4_t b) +{ + int16x4_t _a = vreinterpret_s16_bf16 (a); + return vadd_s16 (_a, b); +} + +int32x2_t test_vreinterpret_s32_bf16 (bfloat16x4_t a, int32x2_t b) +{ + int32x2_t _a = vreinterpret_s32_bf16 (a); + return vadd_s32 (_a, b); +} + +int64x1_t test_vreinterpret_s64_bf16 (bfloat16x4_t a, int64x1_t b) +{ + int64x1_t _a = vreinterpret_s64_bf16 (a); + return vrshl_s64 (_a, b); +} + +uint8x8_t test_vreinterpret_u8_bf16 (bfloat16x4_t a, uint8x8_t b) +{ + uint8x8_t _a = vreinterpret_u8_bf16 (a); + return vadd_u8 (_a, b); +} + +uint16x4_t test_vreinterpret_u16_bf16 (bfloat16x4_t a, uint16x4_t b) +{ + uint16x4_t _a = vreinterpret_u16_bf16 (a); + return vadd_u16 (_a, b); +} + +uint32x2_t test_vreinterpret_u32_bf16 (bfloat16x4_t a, uint32x2_t b) +{ + uint32x2_t _a = vreinterpret_u32_bf16 (a); + return vadd_u32 (_a, b); +} + +uint64x1_t test_vreinterpret_u64_bf16 (bfloat16x4_t a, int64x1_t b) +{ + uint64x1_t _a = vreinterpret_u64_bf16 (a); + return vrshl_u64 (_a, b); +} + +poly8x8x2_t test_vreinterpret_p8_bf16 (bfloat16x4_t a, poly8x8_t b) +{ + poly8x8_t _a = vreinterpret_p8_bf16 (a); + return vzip_p8 (_a, b); +} + +poly16x4x2_t test_vreinterpret_p16_bf16 (bfloat16x4_t a, poly16x4_t b) +{ + poly16x4_t _a = vreinterpret_p16_bf16 (a); + return vzip_p16 (_a, b); +} + +poly64x1_t 
test_vreinterpret_p64_bf16 (bfloat16x4_t a, poly64x1_t b) +{ + poly64x1_t _a = vreinterpret_p64_bf16 (a); + return vsli_n_p64 (_a, b, 3); +} + +float32x2_t test_vreinterpret_f32_bf16 (bfloat16x4_t a, float32x2_t b) +{ + float32x2_t _a = vreinterpret_f32_bf16 (a); + return vsub_f32 (_a, b); +} + +int8x16_t test_vreinterpretq_s8_bf16 (bfloat16x8_t a, int8x16_t b) +{ + int8x16_t _a = vreinterpretq_s8_bf16 (a); + return vaddq_s8 (_a, b); +} + +int16x8_t test_vreinterpretq_s16_bf16 (bfloat16x8_t a, int16x8_t b) +{ + int16x8_t _a = vreinterpretq_s16_bf16 (a); + return vaddq_s16 (_a, b); +} + +int32x4_t test_vreinterpretq_s32_bf16 (bfloat16x8_t a, int32x4_t b) +{ + int32x4_t _a = vreinterpretq_s32_bf16 (a); + return vaddq_s32 (_a, b); +} + +int64x2_t test_vreinterpretq_s64_bf16 (bfloat16x8_t a, int64x2_t b) +{ + int64x2_t _a = vreinterpretq_s64_bf16 (a); + return vaddq_s64 (_a, b); +} + +uint8x16_t test_vreinterpretq_u8_bf16 (bfloat16x8_t a, uint8x16_t b) +{ + uint8x16_t _a = vreinterpretq_u8_bf16 (a); + return vaddq_u8 (_a, b); +} + +uint16x8_t test_vreinterpretq_u16_bf16 (bfloat16x8_t a, uint16x8_t b) +{ + uint16x8_t _a = vreinterpretq_u16_bf16 (a); + return vaddq_u16 (_a, b); +} + +uint32x4_t test_vreinterpretq_u32_bf16 (bfloat16x8_t a, uint32x4_t b) +{ + uint32x4_t _a = vreinterpretq_u32_bf16 (a); + return vaddq_u32 (_a, b); +} + +uint64x2_t test_vreinterpretq_u64_bf16 (bfloat16x8_t a, uint64x2_t b) +{ + uint64x2_t _a = vreinterpretq_u64_bf16 (a); + return vaddq_u64 (_a, b); +} + +poly8x16x2_t test_vreinterpretq_p8_bf16 (bfloat16x8_t a, poly8x16_t b) +{ + poly8x16_t _a = vreinterpretq_p8_bf16 (a); + return vzipq_p8 (_a, b); +} + +poly16x8x2_t test_vreinterpretq_p16_bf16 (bfloat16x8_t a, poly16x8_t b) +{ + poly16x8_t _a = vreinterpretq_p16_bf16 (a); + return vzipq_p16 (_a, b); +} + +poly64x2_t test_vreinterpretq_p64_bf16 (bfloat16x8_t a, poly64x2_t b) +{ + poly64x2_t _a = vreinterpretq_p64_bf16 (a); + return vsliq_n_p64 (_a, b, 3); +} + +poly128_t test_vreinterpretq_p128_bf16 (bfloat16x8_t a, poly16x8_t b) +{ + poly128_t _a = vreinterpretq_p128_bf16 (a); + return _a; +} + +float32x4_t test_vreinterpretq_f32_bf16 (bfloat16x8_t a, float32x4_t b) +{ + float32x4_t _a = vreinterpretq_f32_bf16 (a); + return vsubq_f32 (_a, b); +} + +float16x4_t test_vreinterpret_f16_bf16 (bfloat16x4_t a) +{ + return vreinterpret_f16_bf16 (a); +} + +float16x8_t test_vreinterpretq_f16_bf16 (bfloat16x8_t a) +{ + return vreinterpretq_f16_bf16 (a); +} + +/* { dg-final { scan-assembler-times {\tvadd.i8\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tvadd.i16\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tvadd.i32\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tvadd.i8\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tvadd.i16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tvadd.i32\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tvadd.i64\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */ + +/* { dg-final { scan-assembler {\tvsub.f32\td[0-9]+, d[0-9]+, d[0-9]+\n} } } */ +/* { dg-final { scan-assembler {\tvsub.f32\tq[0-9]+, q[0-9]+, q[0-9]+\n} } } */ + +/* { dg-final { scan-assembler {\tvzip.8\td[0-9]+, d[0-9]+\n} } } */ +/* { dg-final { scan-assembler {\tvzip.16\td[0-9]+, d[0-9]+\n} } } */ +/* { dg-final { scan-assembler {\tvzip.8\tq[0-9]+, q[0-9]+\n} } } */ +/* { dg-final { scan-assembler {\tvzip.16\tq[0-9]+, q[0-9]+\n} } } */ + +/* { dg-final { 
scan-assembler {\tvrshl.s64\td[0-9]+, d[0-9]+, d[0-9]+\n} } } */ +/* { dg-final { scan-assembler {\tvrshl.u64\td[0-9]+, d[0-9]+, d[0-9]+\n} } } */ + +/* { dg-final { scan-assembler {\tvsli.64\td[0-9]+, d[0-9]+, #3\n} } } */ +/* { dg-final { scan-assembler {\tvsli.64\tq[0-9]+, q[0-9]+, #3\n} } } */
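Note (not part of the patch): the big-endian lane remapping that the vget_lane_bf16/vset_lane_bf16 macros above rely on can be hard to review in the abstract. The following host-side sketch mirrors the arithmetic of __arm_lane and __arm_laneq under local stand-in names (LANE_D/LANE_Q are introduced here for illustration only, not ACLE names) and prints the architectural-lane to GCC-index mapping for bfloat16x4_t and bfloat16x8_t; it compiles as plain C on any host.

/* Illustration only -- reproduces the __arm_lane/__arm_laneq macros
   added to arm_neon.h above, as used on __ARM_BIG_ENDIAN.  */
#include <stdio.h>

/* D-register (64-bit) vector: __arm_lane XORs the index against the
   full lane count minus one, e.g. 4 lanes of bfloat16.  */
#define LANE_D(nlanes, idx) ((idx) ^ ((nlanes) - 1))
/* Q-register (128-bit) vector: __arm_laneq reverses within each
   64-bit half, i.e. XORs against half the lane count minus one.  */
#define LANE_Q(nlanes, idx) ((idx) ^ ((nlanes) / 2 - 1))

int
main (void)
{
  /* bfloat16x4_t: architectural lanes 0..3 map to GCC indices 3..0.  */
  for (int i = 0; i < 4; i++)
    printf ("v4bf lane %d -> GCC index %d\n", i, LANE_D (4, i));
  /* bfloat16x8_t: lanes 0..3 -> 3..0 and 4..7 -> 7..4, since the
     reversal happens within each 64-bit register half.  */
  for (int i = 0; i < 8; i++)
    printf ("v8bf lane %d -> GCC index %d\n", i, LANE_Q (8, i));
  return 0;
}

So, for example, vgetq_lane_bf16 (v, 5) on big-endian reads GCC vector element 6 (5 ^ 3), while on little-endian both macros are the identity and the intrinsic lane index is used directly.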