public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc(refs/vendors/ix86/heads/ise046)] i386: Add intrinsic for vector __bf16
@ 2022-10-20 8:48 hongtao Liu
0 siblings, 0 replies; only message in thread
From: hongtao Liu @ 2022-10-20 8:48 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:2814e18775a7a46893970e410afaa91607e7d974
commit 2814e18775a7a46893970e410afaa91607e7d974
Author: konglin1 <lingling.kong@intel.com>
Date: Tue Sep 20 15:37:51 2022 +0800
i386: Add intrinsic for vector __bf16
gcc/ChangeLog:
* config/i386/avx512fp16intrin.h: New intrinsic.
(_mm_load_sbf16): Ditto.
(_mm_mask_load_sbf16): Ditto.
(_mm_maskz_load_sbf16): Ditto.
(_mm_mask_store_sbf16): Ditto.
(_mm_mask_move_sbf16): Ditto.
(_mm_maskz_move_sbf16): Ditto.
* config/i386/avx512bf16intrin.h: New intrinsic.
(_mm_setzero_pbf16): Ditto.
(_mm256_setzero_pbf16): Ditto.
(_mm512_setzero_pbf16): Ditto.
(_mm512_undefined_pbf16): Ditto.
(_mm512_set1_pbf16): Ditto.
(_mm512_set_pbf16): Ditto.
(_mm512_setr_pbf16): Ditto.
(_mm_castpbf16_ps): Ditto.
(_mm256_castpbf16_ps): Ditto.
(_mm512_castpbf16_ps): Ditto.
(_mm_castpbf16_pd): Ditto.
(_mm256_castpbf16_pd): Ditto.
(_mm512_castpbf16_pd): Ditto.
(_mm_castpbf16_si128): Ditto.
(_mm256_castpbf16_si256): Ditto.
(_mm512_castpbf16_si512): Ditto.
(_mm_castps_pbf16): Ditto.
(_mm256_castps_pbf16): Ditto.
(_mm512_castps_pbf16): Ditto.
(_mm_castpd_pbf16): Ditto.
(_mm256_castpd_pbf16): Ditto.
(_mm512_castpd_pbf16): Ditto.
(_mm_castsi128_pbf16): Ditto.
(_mm256_castsi256_pbf16): Ditto.
(_mm512_castsi512_pbf16): Ditto.
(_mm256_castpbf16256_pbf16128): Ditto.
(_mm512_castpbf16512_pbf16128): Ditto.
(_mm512_castpbf16512_pbf16256): Ditto.
(_mm256_castpbf16128_pbf16256): Ditto.
(_mm512_castpbf16128_pbf16512): Ditto.
(_mm512_castpbf16256_pbf16512): Ditto.
(_mm256_zextpbf16128_pbf16256): Ditto.
(_mm512_zextpbf16128_pbf16512): Ditto.
(_mm512_zextpbf16256_pbf16512): Ditto.
(_mm512_abs_pbf16): Ditto.
(_mm512_load_pbf16): Ditto.
(_mm256_load_pbf16): Ditto.
(_mm_load_pbf16): Ditto.
(_mm512_loadu_pbf16): Ditto.
(_mm256_loadu_pbf16): Ditto.
(_mm_loadu_pbf16): Ditto.
(_mm_store_sbf16): Ditto.
(_mm512_store_pbf16): Ditto.
(_mm256_store_pbf16): Ditto.
(_mm_store_pbf16): Ditto.
(_mm512_storeu_pbf16): Ditto.
(_mm256_storeu_pbf16): Ditto.
(_mm_storeu_pbf16): Ditto.
(_mm_move_sbf16): Ditto.
(_mm512_mask_blend_pbf16): Ditto.
(_mm512_permutex2var_pbf16): Ditto.
(_mm512_permutexvar_pbf16): Ditto.
(_mm512_bcstnebf16_ps): Ditto.
(_mm512_mask_bcstnebf16_ps): Ditto.
(_mm512_bcstnesh_ps): Ditto.
(_mm512_mask_bcstnesh_ps): Ditto.
(_mm512_maskz_bcstnesh_ps): Ditto.
(_mm512_cvtne2ps_ph): Ditto.
(_mm512_mask_cvtne2ps_ph): Ditto.
(_mm512_cvtne_round2ps_ph): Ditto.
(_mm512_mask_cvtne_round2ps_ph): Ditto.
(_mm512_cvtneebf16_ps): Ditto.
(_mm512_mask_cvtneebf16_ps): Ditto.
(_mm512_maskz_cvtneebf16_ps): Ditto.
(_mm512_cvtneeph_ps): Ditto.
(_mm512_mask_cvtneeph_ps): Ditto.
(_mm512_cvtneobf16_ps): Ditto.
(_mm512_mask_cvtneobf16_ps): Ditto.
(_mm512_maskz_cvtneobf16_ps): Ditto.
(_mm512_cvtneoph_ps): Ditto.
(_mm512_mask_cvtneoph_ps): Ditto.
* config/i386/avx512bf16vlintrin.h (__attribute__): Ditto.
(_mm_cvtsbf16_bf16): Ditto.
(_mm256_cvtsbf16_bf16): Ditto.
(_mm256_undefined_pbf16): Ditto.
(_mm_undefined_pbf16): Ditto.
(_mm_set_sbf16): Ditto.
(_mm_set1_pbf16): Ditto.
(_mm256_set1_pbf16): Ditto.
(_mm_set_pbf16): Ditto.
(_mm256_set_pbf16): Ditto.
(_mm_setr_pbf16): Ditto.
(_mm256_setr_pbf16): Ditto.
(_mm256_abs_pbf16): Ditto.
(_mm_abs_pbf16): Ditto.
(_mm_mask_blend_pbf16): Ditto.
(_mm256_mask_blend_pbf16): Ditto.
(_mm_permutex2var_pbf16): Ditto.
(_mm256_permutex2var_pbf16): Ditto.
(_mm_permutexvar_pbf16): Ditto.
(_mm256_permutexvar_pbf16): Ditto.
(_mm_cvtneebf16_ps): Change bf16 mode.
(_mm256_cvtneebf16_ps): Ditto.
(_mm_cvtneobf16_ps): Ditto.
(_mm256_cvtneobf16_ps): Ditto.
(_mm_mask_cvtneebf16_ps): Ditto.
(_mm_maskz_cvtneebf16_ps): Ditto.
(_mm256_mask_cvtneebf16_ps): Ditto.
(_mm256_maskz_cvtneebf16_ps): Ditto.
(_mm_mask_cvtneobf16_ps): Ditto.
(_mm_maskz_cvtneobf16_ps): Ditto.
(_mm256_mask_cvtneobf16_ps): Ditto.
(_mm256_maskz_cvtneobf16_ps): Ditto.
* config/i386/immintrin.h: Add SSE2 depend for avx512bf16.
Diff:
---
gcc/config/i386/avx512bf16intrin.h | 418 +++++++++++++++++++++++++++++++++++
gcc/config/i386/avx512bf16vlintrin.h | 177 +++++++++++++++
gcc/config/i386/avx512fp16intrin.h | 70 ++++++
gcc/config/i386/immintrin.h | 2 +
4 files changed, 667 insertions(+)
diff --git a/gcc/config/i386/avx512bf16intrin.h b/gcc/config/i386/avx512bf16intrin.h
index b6e9ddad157..d09a59c1509 100644
--- a/gcc/config/i386/avx512bf16intrin.h
+++ b/gcc/config/i386/avx512bf16intrin.h
@@ -51,6 +51,424 @@ _mm_cvtsbh_ss (__bfloat16 __A)
return __tmp.a;
}
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_setzero_pbf16 (void)
+{
+ return (__m512bf16)(__v32bf) _mm512_setzero_ps ();
+}
+
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_undefined_pbf16 (void)
+{
+ __m512bf16 __Y = __Y;
+ return __Y;
+}
+
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set1_pbf16 (__bf16 __h)
+{
+ return (__m512bf16)(__v32bf) {__h, __h, __h, __h, __h, __h, __h, __h,
+ __h, __h, __h, __h, __h, __h, __h, __h,
+ __h, __h, __h, __h, __h, __h, __h, __h,
+ __h, __h, __h, __h, __h, __h, __h, __h};
+}
+
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set_pbf16 (__bf16 __h1, __bf16 __h2, __bf16 __h3, __bf16 __h4,
+ __bf16 __h5, __bf16 __h6, __bf16 __h7, __bf16 __h8,
+ __bf16 __h9, __bf16 __h10, __bf16 __h11, __bf16 __h12,
+ __bf16 __h13, __bf16 __h14, __bf16 __h15, __bf16 __h16,
+ __bf16 __h17, __bf16 __h18, __bf16 __h19, __bf16 __h20,
+ __bf16 __h21, __bf16 __h22, __bf16 __h23, __bf16 __h24,
+ __bf16 __h25, __bf16 __h26, __bf16 __h27, __bf16 __h28,
+ __bf16 __h29, __bf16 __h30, __bf16 __h31, __bf16 __h32)
+{
+ return
+ (__m512bf16)(__v32bf) {__h32, __h31, __h30, __h29, __h28, __h27, __h26,
+ __h25, __h24, __h23, __h22, __h21, __h20, __h19,
+ __h18, __h17, __h16, __h15, __h14, __h13, __h12,
+ __h11, __h10, __h9, __h8, __h7, __h6, __h5,
+ __h4, __h3, __h2, __h1};
+}
+
+#define _mm512_setr_pbf16(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, \
+ h13, h14, h15, h16, h17, h18, h19, h20, h21, h22, \
+ h23, h24, h25, h26, h27, h28, h29, h30, h31, h32) \
+ _mm512_set_pbf16 ((h32), (h31), (h30), (h29), (h28), (h27), (h26), (h25), \
+ (h24), (h23), (h22), (h21), (h20), (h19), (h18), (h17), \
+ (h16), (h15), (h14), (h13), (h12), (h11), (h10), (h9), \
+ (h8), (h7), (h6), (h5), (h4), (h3), (h2), (h1))
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castpbf16_ps (__m128bf16 __a)
+{
+ return (__m128) __a;
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castpbf16_ps (__m256bf16 __a)
+{
+ return (__m256) __a;
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpbf16_ps (__m512bf16 __a)
+{
+ return (__m512) __a;
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castpbf16_pd (__m128bf16 __a)
+{
+ return (__m128d) __a;
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castpbf16_pd (__m256bf16 __a)
+{
+ return (__m256d) __a;
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpbf16_pd (__m512bf16 __a)
+{
+ return (__m512d) __a;
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castpbf16_si128 (__m128bf16 __a)
+{
+ return (__m128i) __a;
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castpbf16_si256 (__m256bf16 __a)
+{
+ return (__m256i) __a;
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpbf16_si512 (__m512bf16 __a)
+{
+ return (__m512i) __a;
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castps_pbf16 (__m128 __a)
+{
+ return (__m128bf16) __a;
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castps_pbf16 (__m256 __a)
+{
+ return (__m256bf16) __a;
+}
+
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castps_pbf16 (__m512 __a)
+{
+ return (__m512bf16) __a;
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castpd_pbf16 (__m128d __a)
+{
+ return (__m128bf16) __a;
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castpd_pbf16 (__m256d __a)
+{
+ return (__m256bf16) __a;
+}
+
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpd_pbf16 (__m512d __a)
+{
+ return (__m512bf16) __a;
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castsi128_pbf16 (__m128i __a)
+{
+ return (__m128bf16) __a;
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castsi256_pbf16 (__m256i __a)
+{
+ return (__m256bf16) __a;
+}
+
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castsi512_pbf16 (__m512i __a)
+{
+ return (__m512bf16) __a;
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castpbf16256_pbf16128 (__m256bf16 __a)
+{
+ return __builtin_shufflevector (__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpbf16512_pbf16128 (__m512bf16 __a)
+{
+ return __builtin_shufflevector (__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpbf16512_pbf16256 (__m512bf16 __a)
+{
+ return __builtin_shufflevector (__a, __a, 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castpbf16128_pbf16256 (__m128bf16 __a)
+{
+ return __builtin_shufflevector (__a, __a, 0, 1, 2, 3, 4, 5, 6, 7,
+ -1, -1, -1, -1, -1, -1, -1, -1);
+}
+
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpbf16128_pbf16512 (__m128bf16 __a)
+{
+ return __builtin_shufflevector (__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+}
+
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpbf16256_pbf16512 (__m256bf16 __a)
+{
+ return __builtin_shufflevector (__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+ 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_zextpbf16128_pbf16256 (__m128bf16 __A)
+{
+ return (__m256bf16) _mm256_insertf128_ps (_mm256_setzero_ps (),
+ (__m128) __A, 0);
+}
+
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_zextpbf16128_pbf16512 (__m128bf16 __A)
+{
+ return (__m512bf16) _mm512_insertf32x4 (_mm512_setzero_ps (),
+ (__m128) __A, 0);
+}
+
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_zextpbf16256_pbf16512 (__m256bf16 __A)
+{
+ return (__m512bf16) _mm512_insertf64x4 (_mm512_setzero_pd (),
+ (__m256d) __A, 0);
+}
+
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_abs_pbf16 (__m512bf16 __A)
+{
+ return
+ (__m512bf16) _mm512_and_epi32 (_mm512_set1_epi32 (0x7FFF7FFF),
+ (__m512i) __A);
+}
+
+// loads with vmovsh if avx512fp16 enable:
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_load_pbf16 (void const *__p)
+{
+ return *(const __m512bf16 *) __p;
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_load_pbf16 (void const *__p)
+{
+ return *(const __m256bf16 *) __p;
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_pbf16 (void const *__p)
+{
+ return *(const __m128bf16 *) __p;
+}
+
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_loadu_pbf16 (void const *__p)
+{
+ struct __loadu_pbf16
+ {
+ __m512bf16_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_pbf16 *) __p)->__v;
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu_pbf16 (void const *__p)
+{
+ struct __loadu_pbf16
+ {
+ __m256bf16_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_pbf16 *) __p)->__v;
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_pbf16 (void const *__p)
+{
+ struct __loadu_pbf16
+ {
+ __m128bf16_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_pbf16 *) __p)->__v;
+}
+
+// stores with vmovsh if avx512fp16 enable:
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_sbf16 (void *__dp, __m128bf16 __a)
+{
+ struct __mm_store_sbf16_struct
+ {
+ __bf16 __u;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __mm_store_sbf16_struct *) __dp)->__u = __a[0];
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_store_pbf16 (void *__P, __m512bf16 __A)
+{
+ *(__m512bf16 *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_store_pbf16 (void *__P, __m256bf16 __A)
+{
+ *(__m256bf16 *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_pbf16 (void *__P, __m128bf16 __A)
+{
+ *(__m128bf16 *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_storeu_pbf16 (void *__P, __m512bf16 __A)
+{
+ struct __storeu_pbf16 {
+ __m512bf16_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_pbf16 *) __P)->__v = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu_pbf16 (void *__P, __m256bf16 __A)
+{
+ struct __storeu_pbf16
+ {
+ __m256bf16_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_pbf16 *) __P)->__v = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_pbf16 (void *__P, __m128bf16 __A)
+{
+ struct __storeu_pbf16
+ {
+ __m128bf16_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_pbf16 *) __P)->__v = __A;
+}
+
+// moves with vmovsh if enable avx512fp16:
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_move_sbf16 (__m128bf16 __a, __m128bf16 __b)
+{
+ __a[0] = __b[0];
+ return __a;
+}
+
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_blend_pbf16 (__mmask32 __U, __m512bf16 __A, __m512bf16 __W)
+{
+ return (__m512bf16) __builtin_ia32_movdquhi512_mask ((__v32hi) __W,
+ (__v32hi) __A,
+ (__mmask32) __U);
+}
+
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex2var_pbf16 (__m512bf16 __A, __m512i __I, __m512bf16 __B)
+{
+ return (__m512bf16) __builtin_ia32_vpermi2varhi512_mask ((__v32hi) __A,
+ (__v32hi) __I,
+ (__v32hi) __B,
+ (__mmask32)-1);
+}
+
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutexvar_pbf16 (__m512i __A, __m512bf16 __B)
+{
+ return (__m512bf16) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
+ (__v32hi) __A,
+ (__v32hi)
+ (_mm512_setzero_si512 ()),
+ (__mmask32)-1);
+}
+
/* vcvtne2ps2bf16 */
extern __inline __m512bh
diff --git a/gcc/config/i386/avx512bf16vlintrin.h b/gcc/config/i386/avx512bf16vlintrin.h
index 969335ff358..732623a94a2 100644
--- a/gcc/config/i386/avx512bf16vlintrin.h
+++ b/gcc/config/i386/avx512bf16vlintrin.h
@@ -44,6 +44,183 @@ typedef short __m256bh __attribute__ ((__vector_size__ (32), __may_alias__));
typedef short __m128bh __attribute__ ((__vector_size__ (16), __may_alias__));
typedef unsigned short __bfloat16;
+
+extern __inline __bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsbf16_bf16 (__m128bf16 __a)
+{
+ return __a[0];
+}
+
+extern __inline __bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtsbf16_bf16 (__m256bf16 __a)
+{
+ return __a[0];
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_undefined_pbf16 (void)
+{
+ __m256bf16 __Y = __Y;
+ return __Y;
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_undefined_pbf16 (void)
+{
+ __m128bf16 __Y = __Y;
+ return __Y;
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_pbf16 (void)
+{
+ return (__m128bf16)(__v8bf) _mm_setzero_ps ();
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setzero_pbf16 (void)
+{
+ return (__m256bf16)(__v16bf) _mm256_setzero_ps ();
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_sbf16 (__bf16 bf)
+{
+ return (__v8bf)
+ __builtin_shufflevector ((__v8bf){bf, bf, bf, bf, bf, bf, bf, bf},
+ (__v8bf) _mm_setzero_pbf16 (), 0,
+ 8, 8, 8, 8, 8, 8, 8);
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_pbf16 (__bf16 bf)
+{
+ return (__m128bf16)(__v8bf) {bf, bf, bf, bf, bf, bf, bf, bf};
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set1_pbf16 (__bf16 bf)
+{
+ return (__m256bf16)(__v16bf) {bf, bf, bf, bf, bf, bf, bf, bf,
+ bf, bf, bf, bf, bf, bf, bf, bf};
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pbf16 (__bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4,
+ __bf16 bf5, __bf16 bf6, __bf16 bf7, __bf16 bf8)
+{
+ return (__m128bf16)(__v8bf) {bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8};
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_pbf16 (__bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4,
+ __bf16 bf5, __bf16 bf6, __bf16 bf7, __bf16 bf8,
+ __bf16 bf9, __bf16 bf10, __bf16 bf11, __bf16 bf12,
+ __bf16 bf13, __bf16 bf14, __bf16 bf15, __bf16 bf16)
+{
+ return (__m256bf16)(__v16bf) {bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8,
+ bf9, bf10, bf11, bf12, bf13, bf14,
+ bf15, bf16};
+}
+
+#define _mm_setr_pbf16(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8) \
+ _mm_set_pbf16 ((bf8), (bf7), (bf6), (bf5), (bf4), (bf3), (bf2), (bf1))
+
+#define _mm256_setr_pbf16(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, bf9, bf10, \
+ bf11, bf12, bf13, bf14, bf15, bf16) \
+ _mm256_set_pbf16 ((bf16), (bf15), (bf14), (bf13), (bf12), (bf11), (bf10), \
+ (bf9), (bf8), (bf7), (bf6), (bf5), (bf4), (bf3), (bf2), \
+ (bf1))
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_abs_pbf16 (__m256bf16 __A)
+{
+ return (__m256bf16) _mm256_and_si256 (_mm256_set1_epi32 (0x7FFF7FFF),
+ (__m256i)__A);
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_pbf16 (__m128bf16 __A)
+{
+ return (__m128bf16) _mm_and_si128 (_mm_set1_epi32 (0x7FFF7FFF),
+ (__m128i)__A);
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_pbf16 (__mmask8 __U, __m128bf16 __A, __m128bf16 __W)
+{
+ return (__m128bf16)
+ __builtin_ia32_movdquhi128_mask ((__v8hi) __W,
+ (__v8hi) __A,
+ (__mmask8) __U);
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_pbf16 (__mmask16 __U, __m256bf16 __A, __m256bf16 __W)
+{
+ return (__m256bf16)
+ __builtin_ia32_movdquhi256_mask ((__v16hi) __W,
+ (__v16hi) __A,
+ (__mmask16) __U);
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutex2var_pbf16 (__m128bf16 __A, __m128i __I, __m128bf16 __B)
+{
+ return (__m128bf16)
+ __builtin_ia32_vpermi2varhi128_mask ((__v8hi) __A,
+ (__v8hi) __I,
+ (__v8hi) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutex2var_pbf16 (__m256bf16 __A, __m256i __I, __m256bf16 __B)
+{
+ return (__m256bf16) __builtin_ia32_vpermi2varhi256_mask ((__v16hi) __A,
+ (__v16hi) __I,
+ (__v16hi) __B,
+ (__mmask16)-1);
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutexvar_pbf16 (__m128i __A, __m128bf16 __B)
+{
+ return (__m128bf16) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
+ (__v8hi) __A,
+ (__v8hi)
+ (_mm_setzero_si128 ()),
+ (__mmask8) -1);
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutexvar_pbf16 (__m256i __A, __m256bf16 __B)
+{
+ return (__m256bf16) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
+ (__v16hi) __A,
+ (__v16hi)
+ (_mm256_setzero_si256 ()),
+ (__mmask16) -1);
+}
/* vcvtne2ps2bf16 */
extern __inline __m256bh
diff --git a/gcc/config/i386/avx512fp16intrin.h b/gcc/config/i386/avx512fp16intrin.h
index 75f7475ad18..82b814abde2 100644
--- a/gcc/config/i386/avx512fp16intrin.h
+++ b/gcc/config/i386/avx512fp16intrin.h
@@ -53,6 +53,18 @@ typedef _Float16 __m256h_u __attribute__ ((__vector_size__ (32), \
typedef _Float16 __m512h_u __attribute__ ((__vector_size__ (64), \
__may_alias__, __aligned__ (1)));
+
+/* Internal data types for implementing the bf16 intrinsics. */
+typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64)));
+typedef __bf16 __m512bf16 __attribute__((__vector_size__(64), __aligned__(64)));
+typedef __bf16 __m512bf16_u __attribute__((__vector_size__(64), __aligned__(1)));
+typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
+typedef __bf16 __m128bf16 __attribute__((__vector_size__(16), __aligned__(16)));
+typedef __bf16 __m128bf16_u __attribute__((__vector_size__(16), __aligned__(1)));
+typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32)));
+typedef __bf16 __m256bf16 __attribute__((__vector_size__(32), __aligned__(32)));
+typedef __bf16 __m256bf16_u __attribute__((__vector_size__(32), __aligned__(1)));
+
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ph (_Float16 __A7, _Float16 __A6, _Float16 __A5,
@@ -2771,6 +2783,44 @@ _mm_mask_store_sh (_Float16 const* __A, __mmask8 __B, __m128h __C)
__builtin_ia32_storesh_mask (__A, __C, __B);
}
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_sbf16 (void const *__dp)
+{
+ return (__m128bf16)
+ __builtin_ia32_loadsh_mask ((_Float16 const*) __dp,
+ _mm_setzero_ph(),
+ (__mmask8) -1);
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_load_sbf16 (__m128bf16 __A, __mmask8 __B, const void *__C)
+{
+ return (__m128bf16)
+ __builtin_ia32_loadsh_mask ((_Float16 const*) __C,
+ (__v8hf) __A,
+ (__mmask8) __B);
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_load_sbf16 (__mmask8 __A, const void *__B)
+{
+ return (__m128bf16)
+ __builtin_ia32_loadsh_mask ((_Float16 const*) __B,
+ _mm_setzero_ph(),
+ (__mmask8) __A);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_store_sbf16 (const void *__A, __mmask8 __B, __m128bf16 __C)
+{
+ __builtin_ia32_storesh_mask ((_Float16 const*) __A,
+ (__v8hf) __C, (__mmask8) __B);
+}
+
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_sh (__m128h __A, __m128h __B)
@@ -2793,6 +2843,26 @@ _mm_maskz_move_sh (__mmask8 __A, __m128h __B, __m128h __C)
return __builtin_ia32_vmovsh_mask (__B, __C, _mm_setzero_ph (), __A);
}
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_move_sbf16 (__m128bf16 __A, __mmask8 __B,
+ __m128bf16 __C, __m128bf16 __D)
+{
+ return (__m128bf16)
+ __builtin_ia32_vmovsh_mask ((__v8hf) __C, (__v8hf) __D,
+ (__v8hf) __A, (__mmask8) __B);
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_move_sbf16 (__mmask8 __A, __m128bf16 __B, __m128bf16 __C)
+{
+ return (__m128bf16)
+ __builtin_ia32_vmovsh_mask ((__v8hf) __B, (__v8hf) __C,
+ _mm_setzero_ph(),
+ (__mmask8) __A);
+}
+
/* Intrinsics vcvtph2dq. */
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h
index ddea249d09b..c62d50f1951 100644
--- a/gcc/config/i386/immintrin.h
+++ b/gcc/config/i386/immintrin.h
@@ -118,9 +118,11 @@
#include <vpclmulqdqintrin.h>
+#ifdef __SSE2__
#include <avx512bf16vlintrin.h>
#include <avx512bf16intrin.h>
+#endif
#include <amxtileintrin.h>
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2022-10-20 8:48 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-20 8:48 [gcc(refs/vendors/ix86/heads/ise046)] i386: Add intrinsic for vector __bf16 hongtao Liu
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).