* [PATCH] i386: using __bf16 for AVX512BF16 intrinsics
[not found] <20221028060808.1637178-1-lingling.kong@intel.com>
@ 2022-10-28 6:20 ` Kong, Lingling
2022-10-28 6:29 ` Hongtao Liu
0 siblings, 1 reply; 2+ messages in thread
From: Kong, Lingling @ 2022-10-28 6:20 UTC (permalink / raw)
To: Liu, Hongtao, gcc-patches
Hi,
Previously we used unsigned short to represent bf16. That was not a good representation, and at the time the front end didn't support the bf16 type.
Now that we have introduced __bf16 into the x86 psABI, we can switch the intrinsics to the new type.
Ok for trunk ?
Thanks,
Lingling
gcc/ChangeLog:
* config/i386/avx512bf16intrin.h (__attribute__): Change short to bf16.
(_mm_cvtsbh_ss): Ditto.
(_mm512_cvtne2ps_pbh): Ditto.
(_mm512_mask_cvtne2ps_pbh): Ditto.
(_mm512_maskz_cvtne2ps_pbh): Ditto.
* config/i386/avx512bf16vlintrin.h (__attribute__): Ditto.
(_mm256_cvtne2ps_pbh): Ditto.
(_mm256_mask_cvtne2ps_pbh): Ditto.
(_mm256_maskz_cvtne2ps_pbh): Ditto.
(_mm_cvtne2ps_pbh): Ditto.
(_mm_mask_cvtne2ps_pbh): Ditto.
(_mm_maskz_cvtne2ps_pbh): Ditto.
(_mm_cvtness_sbh): Ditto.
* config/i386/i386-builtin-types.def (V8BF): Add new
DEF_VECTOR_TYPE for BFmode.
(V16BF): Ditto.
(V32BF): Ditto.
* config/i386/i386-builtin.def (BDESC): Fixed builtins.
* config/i386/i386-expand.cc (ix86_expand_args_builtin): Changed
avx512bf16 ix86_builtin_func_type included HI to BF.
* config/i386/immintrin.h: Add SSE2 depend for avx512bf16.
* config/i386/sse.md (TARGET_AVX512VL): Changed HI vector to BF
vector.
(avx512f_cvtneps2bf16_v4sf): New define_expand.
(*avx512f_cvtneps2bf16_v4sf): New define_insn.
(avx512f_cvtneps2bf16_v4sf_maskz): Ditto.
(avx512f_cvtneps2bf16_v4sf_mask): Ditto.
(avx512f_cvtneps2bf16_v4sf_mask_1): Ditto.
gcc/testsuite/ChangeLog:
* gcc.target/i386/avx512bf16-cvtsbh2ss-1.c: Add fpmath option.
* gcc.target/i386/avx512bf16-vdpbf16ps-2.c: Fixed
scan-assembler.
* gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c: Add x/y suffix
for vcvtneps2bf16.
* gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c: Ditto.
---
gcc/config/i386/avx512bf16intrin.h | 12 +--
gcc/config/i386/avx512bf16vlintrin.h | 29 ++---
gcc/config/i386/i386-builtin-types.def | 51 ++++-----
gcc/config/i386/i386-builtin.def | 54 +++++-----
gcc/config/i386/i386-expand.cc | 48 ++++-----
gcc/config/i386/immintrin.h | 2 +
gcc/config/i386/sse.md | 101 ++++++++++++++----
.../gcc.target/i386/avx512bf16-cvtsbh2ss-1.c | 2 +-
.../gcc.target/i386/avx512bf16-vdpbf16ps-2.c | 2 +-
.../i386/avx512bf16vl-cvtness2sbh-1.c | 2 +-
.../i386/avx512bf16vl-vcvtneps2bf16-1.c | 12 +--
11 files changed, 189 insertions(+), 126 deletions(-)
diff --git a/gcc/config/i386/avx512bf16intrin.h b/gcc/config/i386/avx512bf16intrin.h
index b6e9ddad157..ea1d0125b3f 100644
--- a/gcc/config/i386/avx512bf16intrin.h
+++ b/gcc/config/i386/avx512bf16intrin.h
@@ -35,16 +35,16 @@
#endif /* __AVX512BF16__ */
/* Internal data types for implementing the intrinsics. */
-typedef short __v32bh __attribute__ ((__vector_size__ (64)));
+typedef __bf16 __v32bf __attribute__ ((__vector_size__ (64)));
/* The Intel API is flexible enough that we must allow aliasing with other
vector types, and their scalar components. */
-typedef short __m512bh __attribute__ ((__vector_size__ (64), __may_alias__));
+typedef __bf16 __m512bh __attribute__ ((__vector_size__ (64), __may_alias__));
/* Convert One BF16 Data to One Single Float Data. */
extern __inline float
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsbh_ss (__bfloat16 __A)
+_mm_cvtsbh_ss (__bf16 __A)
{
union{ float a; unsigned int b;} __tmp;
__tmp.b = ((unsigned int)(__A)) << 16;
@@ -57,21 +57,21 @@ extern __inline __m512bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtne2ps_pbh (__m512 __A, __m512 __B)
{
- return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi(__A, __B);
+ return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf(__A, __B);
}
extern __inline __m512bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtne2ps_pbh (__m512bh __A, __mmask32 __B, __m512 __C, __m512 __D)
{
- return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_mask(__C, __D, __A, __B);
+ return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf_mask(__C, __D, __A, __B);
}
extern __inline __m512bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtne2ps_pbh (__mmask32 __A, __m512 __B, __m512 __C)
{
- return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_maskz(__B, __C, __A);
+ return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf_maskz(__B, __C, __A);
}
/* vcvtneps2bf16 */
diff --git a/gcc/config/i386/avx512bf16vlintrin.h b/gcc/config/i386/avx512bf16vlintrin.h
index 969335ff358..56c28f14cf6 100644
--- a/gcc/config/i386/avx512bf16vlintrin.h
+++ b/gcc/config/i386/avx512bf16vlintrin.h
@@ -35,57 +35,58 @@
#endif /* __AVX512BF16__ */
/* Internal data types for implementing the intrinsics. */
-typedef short __v16bh __attribute__ ((__vector_size__ (32)));
-typedef short __v8bh __attribute__ ((__vector_size__ (16)));
+typedef __bf16 __v16bf __attribute__ ((__vector_size__ (32)));
+typedef __bf16 __v8bf __attribute__ ((__vector_size__ (16)));
/* The Intel API is flexible enough that we must allow aliasing with other
vector types, and their scalar components. */
-typedef short __m256bh __attribute__ ((__vector_size__ (32), __may_alias__));
-typedef short __m128bh __attribute__ ((__vector_size__ (16), __may_alias__));
+typedef __bf16 __m256bh __attribute__ ((__vector_size__ (32), __may_alias__));
+typedef __bf16 __m128bh __attribute__ ((__vector_size__ (16), __may_alias__));
+
+typedef __bf16 __bfloat16;
-typedef unsigned short __bfloat16;
/* vcvtne2ps2bf16 */
extern __inline __m256bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtne2ps_pbh (__m256 __A, __m256 __B)
{
- return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi(__A, __B);
+ return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf(__A, __B);
}
extern __inline __m256bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvtne2ps_pbh (__m256bh __A, __mmask16 __B, __m256 __C, __m256 __D)
{
- return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_mask(__C, __D, __A, __B);
+ return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf_mask(__C, __D, __A, __B);
}
extern __inline __m256bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_cvtne2ps_pbh (__mmask16 __A, __m256 __B, __m256 __C)
{
- return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_maskz(__B, __C, __A);
+ return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf_maskz(__B, __C, __A);
}
extern __inline __m128bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtne2ps_pbh (__m128 __A, __m128 __B)
{
- return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi(__A, __B);
+ return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf(__A, __B);
}
extern __inline __m128bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvtne2ps_pbh (__m128bh __A, __mmask8 __B, __m128 __C, __m128 __D)
{
- return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_mask(__C, __D, __A, __B);
+ return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf_mask(__C, __D, __A, __B);
}
extern __inline __m128bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_cvtne2ps_pbh (__mmask8 __A, __m128 __B, __m128 __C)
{
- return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_maskz(__B, __C, __A);
+ return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf_maskz(__B, __C, __A);
}
/* vcvtneps2bf16 */
@@ -176,13 +177,13 @@ _mm_maskz_dpbf16_ps (__mmask8 __A, __m128 __B, __m128bh __C, __m128bh __D)
return (__m128)__builtin_ia32_dpbf16ps_v4sf_maskz(__B, __C, __D, __A);
}
-extern __inline __bfloat16
+extern __inline __bf16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtness_sbh (float __A)
{
__v4sf __V = {__A, 0, 0, 0};
- __v8hi __R = __builtin_ia32_cvtneps2bf16_v4sf_mask ((__v4sf)__V,
- (__v8hi)_mm_undefined_si128 (), (__mmask8)-1);
+ __v8bf __R = __builtin_ia32_cvtneps2bf16_v4sf_mask ((__v4sf)__V,
+ (__v8bf)_mm_undefined_si128 (), (__mmask8)-1);
return __R[0];
}
diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
index 63a360b0f8b..aedae2d7750 100644
--- a/gcc/config/i386/i386-builtin-types.def
+++ b/gcc/config/i386/i386-builtin-types.def
@@ -87,6 +87,7 @@ DEF_VECTOR_TYPE (V8QI, QI)
DEF_VECTOR_TYPE (V2DF, DOUBLE)
DEF_VECTOR_TYPE (V4SF, FLOAT)
DEF_VECTOR_TYPE (V8HF, FLOAT16)
+DEF_VECTOR_TYPE (V8BF, BFLOAT16)
DEF_VECTOR_TYPE (V2DI, DI)
DEF_VECTOR_TYPE (V4SI, SI)
DEF_VECTOR_TYPE (V8HI, HI)
@@ -100,6 +101,7 @@ DEF_VECTOR_TYPE (V16UQI, UQI, V16QI)
DEF_VECTOR_TYPE (V4DF, DOUBLE)
DEF_VECTOR_TYPE (V8SF, FLOAT)
DEF_VECTOR_TYPE (V16HF, FLOAT16)
+DEF_VECTOR_TYPE (V16BF, BFLOAT16)
DEF_VECTOR_TYPE (V4DI, DI)
DEF_VECTOR_TYPE (V8SI, SI)
DEF_VECTOR_TYPE (V16HI, HI)
@@ -111,6 +113,7 @@ DEF_VECTOR_TYPE (V16UHI, UHI, V16HI)
# AVX512F vectors
DEF_VECTOR_TYPE (V32SF, FLOAT)
DEF_VECTOR_TYPE (V32HF, FLOAT16)
+DEF_VECTOR_TYPE (V32BF, BFLOAT16)
DEF_VECTOR_TYPE (V16SF, FLOAT)
DEF_VECTOR_TYPE (V8DF, DOUBLE)
DEF_VECTOR_TYPE (V8DI, DI)
@@ -1273,30 +1276,30 @@ DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI, UHI)
DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI, UHI)
# BF16 builtins
-DEF_FUNCTION_TYPE (V32HI, V16SF, V16SF)
-DEF_FUNCTION_TYPE (V32HI, V16SF, V16SF, V32HI, USI)
-DEF_FUNCTION_TYPE (V32HI, V16SF, V16SF, USI)
-DEF_FUNCTION_TYPE (V16HI, V8SF, V8SF)
-DEF_FUNCTION_TYPE (V16HI, V8SF, V8SF, V16HI, UHI)
-DEF_FUNCTION_TYPE (V16HI, V8SF, V8SF, UHI)
-DEF_FUNCTION_TYPE (V8HI, V4SF, V4SF)
-DEF_FUNCTION_TYPE (V8HI, V4SF, V4SF, V8HI, UQI)
-DEF_FUNCTION_TYPE (V8HI, V4SF, V4SF, UQI)
-DEF_FUNCTION_TYPE (V16HI, V16SF)
-DEF_FUNCTION_TYPE (V16HI, V16SF, V16HI, UHI)
-DEF_FUNCTION_TYPE (V16HI, V16SF, UHI)
-DEF_FUNCTION_TYPE (V8HI, V8SF)
-DEF_FUNCTION_TYPE (V8HI, V8SF, V8HI, UQI)
-DEF_FUNCTION_TYPE (V8HI, V8SF, UQI)
-DEF_FUNCTION_TYPE (V8HI, V4SF)
-DEF_FUNCTION_TYPE (V8HI, V4SF, V8HI, UQI)
-DEF_FUNCTION_TYPE (V8HI, V4SF, UQI)
-DEF_FUNCTION_TYPE (V16SF, V16SF, V32HI, V32HI)
-DEF_FUNCTION_TYPE (V16SF, V16SF, V32HI, V32HI, UHI)
-DEF_FUNCTION_TYPE (V8SF, V8SF, V16HI, V16HI)
-DEF_FUNCTION_TYPE (V8SF, V8SF, V16HI, V16HI, UQI)
-DEF_FUNCTION_TYPE (V4SF, V4SF, V8HI, V8HI)
-DEF_FUNCTION_TYPE (V4SF, V4SF, V8HI, V8HI, UQI)
+DEF_FUNCTION_TYPE (V32BF, V16SF, V16SF)
+DEF_FUNCTION_TYPE (V32BF, V16SF, V16SF, V32BF, USI)
+DEF_FUNCTION_TYPE (V32BF, V16SF, V16SF, USI)
+DEF_FUNCTION_TYPE (V16BF, V8SF, V8SF)
+DEF_FUNCTION_TYPE (V16BF, V8SF, V8SF, V16BF, UHI)
+DEF_FUNCTION_TYPE (V16BF, V8SF, V8SF, UHI)
+DEF_FUNCTION_TYPE (V8BF, V4SF, V4SF)
+DEF_FUNCTION_TYPE (V8BF, V4SF, V4SF, V8BF, UQI)
+DEF_FUNCTION_TYPE (V8BF, V4SF, V4SF, UQI)
+DEF_FUNCTION_TYPE (V16BF, V16SF)
+DEF_FUNCTION_TYPE (V16BF, V16SF, V16BF, UHI)
+DEF_FUNCTION_TYPE (V16BF, V16SF, UHI)
+DEF_FUNCTION_TYPE (V8BF, V8SF)
+DEF_FUNCTION_TYPE (V8BF, V8SF, V8BF, UQI)
+DEF_FUNCTION_TYPE (V8BF, V8SF, UQI)
+DEF_FUNCTION_TYPE (V8BF, V4SF)
+DEF_FUNCTION_TYPE (V8BF, V4SF, V8BF, UQI)
+DEF_FUNCTION_TYPE (V8BF, V4SF, UQI)
+DEF_FUNCTION_TYPE (V16SF, V16SF, V32BF, V32BF)
+DEF_FUNCTION_TYPE (V16SF, V16SF, V32BF, V32BF, UHI)
+DEF_FUNCTION_TYPE (V8SF, V8SF, V16BF, V16BF)
+DEF_FUNCTION_TYPE (V8SF, V8SF, V16BF, V16BF, UQI)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V8BF, V8BF)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V8BF, V8BF, UQI)
# KEYLOCKER builtins
DEF_FUNCTION_TYPE (UINT, UINT, V2DI, V2DI, PVOID)
diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index e35306e27d0..5802e2049a8 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -2779,33 +2779,33 @@ BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v32qi, "__builtin_ia32_vae
BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v64qi, "__builtin_ia32_vaesenclast_v64qi", IX86_BUILTIN_VAESENCLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI)
/* BF16 */
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi, "__builtin_ia32_cvtne2ps2bf16_v32hi", IX86_BUILTIN_CVTNE2PS2HI16_V32HI, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi_mask, "__builtin_ia32_cvtne2ps2bf16_v32hi_mask", IX86_BUILTIN_CVTNE2PS2HI16_V32HI_MASK, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF_V32HI_USI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi_maskz, "__builtin_ia32_cvtne2ps2bf16_v32hi_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V32HI_MASKZ, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF_USI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16hi, "__builtin_ia32_cvtne2ps2bf16_v16hi", IX86_BUILTIN_CVTNE2PS2HI16_V16HI, UNKNOWN, (int) V16HI_FTYPE_V8SF_V8SF)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16hi_mask, "__builtin_ia32_cvtne2ps2bf16_v16hi_mask", IX86_BUILTIN_CVTNE2PS2HI16_V16HI_MASK, UNKNOWN, (int) V16HI_FTYPE_V8SF_V8SF_V16HI_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16hi_maskz, "__builtin_ia32_cvtne2ps2bf16_v16hi_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V16HI_MASKZ, UNKNOWN, (int) V16HI_FTYPE_V8SF_V8SF_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8hi, "__builtin_ia32_cvtne2ps2bf16_v8hi", IX86_BUILTIN_CVTNE2PS2HI16_V8HI, UNKNOWN, (int) V8HI_FTYPE_V4SF_V4SF)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8hi_mask, "__builtin_ia32_cvtne2ps2bf16_v8hi_mask", IX86_BUILTIN_CVTNE2PS2HI16_V8HI_MASK, UNKNOWN, (int) V8HI_FTYPE_V4SF_V4SF_V8HI_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8hi_maskz, "__builtin_ia32_cvtne2ps2bf16_v8hi_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V8HI_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V4SF_V4SF_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf, "__builtin_ia32_cvtneps2bf16_v16sf", IX86_BUILTIN_CVTNEPS2HI16_V16SF, UNKNOWN, (int) V16HI_FTYPE_V16SF)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_mask, "__builtin_ia32_cvtneps2bf16_v16sf_mask", IX86_BUILTIN_CVTNEPS2HI16_V16SF_MASK, UNKNOWN, (int) V16HI_FTYPE_V16SF_V16HI_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_maskz, "__builtin_ia32_cvtneps2bf16_v16sf_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V16SF_MASKZ, UNKNOWN, (int) V16HI_FTYPE_V16SF_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf, "__builtin_ia32_cvtneps2bf16_v8sf", IX86_BUILTIN_CVTNEPS2HI16_V8SF, UNKNOWN, (int) V8HI_FTYPE_V8SF)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_mask, "__builtin_ia32_cvtneps2bf16_v8sf_mask", IX86_BUILTIN_CVTNEPS2HI16_V8SF_MASK, UNKNOWN, (int) V8HI_FTYPE_V8SF_V8HI_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_maskz, "__builtin_ia32_cvtneps2bf16_v8sf_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V8SF_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V8SF_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf, "__builtin_ia32_cvtneps2bf16_v4sf", IX86_BUILTIN_CVTNEPS2HI16_V4SF, UNKNOWN, (int) V8HI_FTYPE_V4SF)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_mask, "__builtin_ia32_cvtneps2bf16_v4sf_mask", IX86_BUILTIN_CVTNEPS2HI16_V4SF_MASK, UNKNOWN, (int) V8HI_FTYPE_V4SF_V8HI_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_maskz, "__builtin_ia32_cvtneps2bf16_v4sf_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V4SF_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V4SF_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf, "__builtin_ia32_dpbf16ps_v16sf", IX86_BUILTIN_DPHI16PS_V16SF, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32HI_V32HI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_mask, "__builtin_ia32_dpbf16ps_v16sf_mask", IX86_BUILTIN_DPHI16PS_V16SF_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32HI_V32HI_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_maskz, "__builtin_ia32_dpbf16ps_v16sf_maskz", IX86_BUILTIN_DPHI16PS_V16SF_MASKZ, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32HI_V32HI_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf, "__builtin_ia32_dpbf16ps_v8sf", IX86_BUILTIN_DPHI16PS_V8SF, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16HI_V16HI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_mask, "__builtin_ia32_dpbf16ps_v8sf_mask", IX86_BUILTIN_DPHI16PS_V8SF_MASK, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16HI_V16HI_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_maskz, "__builtin_ia32_dpbf16ps_v8sf_maskz", IX86_BUILTIN_DPHI16PS_V8SF_MASKZ, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16HI_V16HI_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf, "__builtin_ia32_dpbf16ps_v4sf", IX86_BUILTIN_DPHI16PS_V4SF, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_mask, "__builtin_ia32_dpbf16ps_v4sf_mask", IX86_BUILTIN_DPHI16PS_V4SF_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, "__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPHI16PS_V4SF_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32bf, "__builtin_ia32_cvtne2ps2bf16_v32bf", IX86_BUILTIN_CVTNE2PS2BF16_V32BF, UNKNOWN, (int) V32BF_FTYPE_V16SF_V16SF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32bf_mask, "__builtin_ia32_cvtne2ps2bf16_v32bf_mask", IX86_BUILTIN_CVTNE2PS2BF16_V32BF_MASK, UNKNOWN, (int) V32BF_FTYPE_V16SF_V16SF_V32BF_USI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32bf_maskz, "__builtin_ia32_cvtne2ps2bf16_v32bf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V32BF_MASKZ, UNKNOWN, (int) V32BF_FTYPE_V16SF_V16SF_USI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16bf, "__builtin_ia32_cvtne2ps2bf16_v16bf", IX86_BUILTIN_CVTNE2PS2BF16_V16BF, UNKNOWN, (int) V16BF_FTYPE_V8SF_V8SF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16bf_mask, "__builtin_ia32_cvtne2ps2bf16_v16bf_mask", IX86_BUILTIN_CVTNE2PS2BF16_V16BF_MASK, UNKNOWN, (int) V16BF_FTYPE_V8SF_V8SF_V16BF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16bf_maskz, "__builtin_ia32_cvtne2ps2bf16_v16bf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V16BF_MASKZ, UNKNOWN, (int) V16BF_FTYPE_V8SF_V8SF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8bf, "__builtin_ia32_cvtne2ps2bf16_v8bf", IX86_BUILTIN_CVTNE2PS2BF16_V8BF, UNKNOWN, (int) V8BF_FTYPE_V4SF_V4SF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8bf_mask, "__builtin_ia32_cvtne2ps2bf16_v8bf_mask", IX86_BUILTIN_CVTNE2PS2BF16_V8BF_MASK, UNKNOWN, (int) V8BF_FTYPE_V4SF_V4SF_V8BF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8bf_maskz, "__builtin_ia32_cvtne2ps2bf16_v8bf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V8BF_MASKZ, UNKNOWN, (int) V8BF_FTYPE_V4SF_V4SF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf, "__builtin_ia32_cvtneps2bf16_v16sf", IX86_BUILTIN_CVTNEPS2BF16_V16SF, UNKNOWN, (int) V16BF_FTYPE_V16SF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_mask, "__builtin_ia32_cvtneps2bf16_v16sf_mask", IX86_BUILTIN_CVTNEPS2BF16_V16SF_MASK, UNKNOWN, (int) V16BF_FTYPE_V16SF_V16BF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_maskz, "__builtin_ia32_cvtneps2bf16_v16sf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V16SF_MASKZ, UNKNOWN, (int) V16BF_FTYPE_V16SF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf, "__builtin_ia32_cvtneps2bf16_v8sf", IX86_BUILTIN_CVTNEPS2BF16_V8SF, UNKNOWN, (int) V8BF_FTYPE_V8SF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_mask, "__builtin_ia32_cvtneps2bf16_v8sf_mask", IX86_BUILTIN_CVTNEPS2BF16_V8SF_MASK, UNKNOWN, (int) V8BF_FTYPE_V8SF_V8BF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_maskz, "__builtin_ia32_cvtneps2bf16_v8sf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V8SF_MASKZ, UNKNOWN, (int) V8BF_FTYPE_V8SF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf, "__builtin_ia32_cvtneps2bf16_v4sf", IX86_BUILTIN_CVTNEPS2BF16_V4SF, UNKNOWN, (int) V8BF_FTYPE_V4SF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_mask, "__builtin_ia32_cvtneps2bf16_v4sf_mask", IX86_BUILTIN_CVTNEPS2BF16_V4SF_MASK, UNKNOWN, (int) V8BF_FTYPE_V4SF_V8BF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_maskz, "__builtin_ia32_cvtneps2bf16_v4sf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V4SF_MASKZ, UNKNOWN, (int) V8BF_FTYPE_V4SF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf, "__builtin_ia32_dpbf16ps_v16sf", IX86_BUILTIN_DPBF16PS_V16SF, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32BF_V32BF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_mask, "__builtin_ia32_dpbf16ps_v16sf_mask", IX86_BUILTIN_DPBF16PS_V16SF_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32BF_V32BF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_maskz, "__builtin_ia32_dpbf16ps_v16sf_maskz", IX86_BUILTIN_DPBF16PS_V16SF_MASKZ, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32BF_V32BF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf, "__builtin_ia32_dpbf16ps_v8sf", IX86_BUILTIN_DPBF16PS_V8SF, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16BF_V16BF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_mask, "__builtin_ia32_dpbf16ps_v8sf_mask", IX86_BUILTIN_DPBF16PS_V8SF_MASK, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16BF_V16BF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_maskz, "__builtin_ia32_dpbf16ps_v8sf_maskz", IX86_BUILTIN_DPBF16PS_V8SF_MASKZ, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16BF_V16BF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf, "__builtin_ia32_dpbf16ps_v4sf", IX86_BUILTIN_DPBF16PS_V4SF, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8BF_V8BF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_mask, "__builtin_ia32_dpbf16ps_v4sf_mask", IX86_BUILTIN_DPBF16PS_V4SF_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8BF_V8BF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, "__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPBF16PS_V4SF_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8BF_V8BF_UQI)
/* AVX512FP16. */
BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv8hf3_mask, "__builtin_ia32_addph128_mask", IX86_BUILTIN_ADDPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 5d9e5a12f7e..8e1ef0b4c4a 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -10462,9 +10462,9 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case V8DF_FTYPE_V2DF:
case V8DF_FTYPE_V8DF:
case V4DI_FTYPE_V4DI:
- case V16HI_FTYPE_V16SF:
- case V8HI_FTYPE_V8SF:
- case V8HI_FTYPE_V4SF:
+ case V16BF_FTYPE_V16SF:
+ case V8BF_FTYPE_V8SF:
+ case V8BF_FTYPE_V4SF:
nargs = 1;
break;
case V4SF_FTYPE_V4SF_VEC_MERGE:
@@ -10592,12 +10592,12 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case USI_FTYPE_USI_USI:
case UDI_FTYPE_UDI_UDI:
case V16SI_FTYPE_V8DF_V8DF:
- case V32HI_FTYPE_V16SF_V16SF:
- case V16HI_FTYPE_V8SF_V8SF:
- case V8HI_FTYPE_V4SF_V4SF:
- case V16HI_FTYPE_V16SF_UHI:
- case V8HI_FTYPE_V8SF_UQI:
- case V8HI_FTYPE_V4SF_UQI:
+ case V32BF_FTYPE_V16SF_V16SF:
+ case V16BF_FTYPE_V8SF_V8SF:
+ case V8BF_FTYPE_V4SF_V4SF:
+ case V16BF_FTYPE_V16SF_UHI:
+ case V8BF_FTYPE_V8SF_UQI:
+ case V8BF_FTYPE_V4SF_UQI:
nargs = 2;
break;
case V2DI_FTYPE_V2DI_INT_CONVERT:
@@ -10803,15 +10803,15 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case V16HI_FTYPE_V16HI_V16HI_V16HI:
case V8SI_FTYPE_V8SI_V8SI_V8SI:
case V8HI_FTYPE_V8HI_V8HI_V8HI:
- case V32HI_FTYPE_V16SF_V16SF_USI:
- case V16HI_FTYPE_V8SF_V8SF_UHI:
- case V8HI_FTYPE_V4SF_V4SF_UQI:
- case V16HI_FTYPE_V16SF_V16HI_UHI:
- case V8HI_FTYPE_V8SF_V8HI_UQI:
- case V8HI_FTYPE_V4SF_V8HI_UQI:
- case V16SF_FTYPE_V16SF_V32HI_V32HI:
- case V8SF_FTYPE_V8SF_V16HI_V16HI:
- case V4SF_FTYPE_V4SF_V8HI_V8HI:
+ case V32BF_FTYPE_V16SF_V16SF_USI:
+ case V16BF_FTYPE_V8SF_V8SF_UHI:
+ case V8BF_FTYPE_V4SF_V4SF_UQI:
+ case V16BF_FTYPE_V16SF_V16BF_UHI:
+ case V8BF_FTYPE_V8SF_V8BF_UQI:
+ case V8BF_FTYPE_V4SF_V8BF_UQI:
+ case V16SF_FTYPE_V16SF_V32BF_V32BF:
+ case V8SF_FTYPE_V8SF_V16BF_V16BF:
+ case V4SF_FTYPE_V4SF_V8BF_V8BF:
nargs = 3;
break;
case V32QI_FTYPE_V32QI_V32QI_INT:
@@ -10958,9 +10958,9 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
- case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
- case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
- case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
+ case V32BF_FTYPE_V16SF_V16SF_V32BF_USI:
+ case V16BF_FTYPE_V8SF_V8SF_V16BF_UHI:
+ case V8BF_FTYPE_V4SF_V4SF_V8BF_UQI:
nargs = 4;
break;
case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
@@ -10998,9 +10998,9 @@ ix86_expand_args_builtin (const struct builtin_description *d,
break;
case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
- case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
- case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
- case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
+ case V16SF_FTYPE_V16SF_V32BF_V32BF_UHI:
+ case V8SF_FTYPE_V8SF_V16BF_V16BF_UQI:
+ case V4SF_FTYPE_V4SF_V8BF_V8BF_UQI:
nargs = 4;
break;
case UQI_FTYPE_V8DI_V8DI_INT_UQI:
diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h
index ddea249d09b..c62d50f1951 100644
--- a/gcc/config/i386/immintrin.h
+++ b/gcc/config/i386/immintrin.h
@@ -118,9 +118,11 @@
#include <vpclmulqdqintrin.h>
+#ifdef __SSE2__
#include <avx512bf16vlintrin.h>
#include <avx512bf16intrin.h>
+#endif
#include <amxtileintrin.h>
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index f4b5506703f..fba81a93c1a 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -187,8 +187,6 @@
UNSPEC_VP2INTERSECT
;; For AVX512BF16 support
- UNSPEC_VCVTNE2PS2BF16
- UNSPEC_VCVTNEPS2BF16
UNSPEC_VDPBF16PS
;; For AVX512FP16 suppport
@@ -28918,41 +28916,101 @@
"vp2intersectd\t{%2, %1, %0|%0, %1, %2}"
[(set_attr ("prefix") ("evex"))])
-(define_mode_iterator BF16 [V32HI (V16HI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL")])
+(define_mode_iterator VF_AVX512BF16VL
+ [V32BF (V16BF "TARGET_AVX512VL") (V8BF "TARGET_AVX512VL")])
;; Converting from BF to SF
(define_mode_attr bf16_cvt_2sf
- [(V32HI "V16SF") (V16HI "V8SF") (V8HI "V4SF")])
+ [(V32BF "V16SF") (V16BF "V8SF") (V8BF "V4SF")])
;; Converting from SF to BF
(define_mode_attr sf_cvt_bf16
- [(V4SF "V8HI") (V8SF "V8HI") (V16SF "V16HI")])
+ [(V8SF "V8BF") (V16SF "V16BF")])
;; Mapping from BF to SF
(define_mode_attr sf_bf16
- [(V4SF "V8HI") (V8SF "V16HI") (V16SF "V32HI")])
+ [(V4SF "V8BF") (V8SF "V16BF") (V16SF "V32BF")])
(define_expand "avx512f_cvtne2ps2bf16_<mode>_maskz"
- [(match_operand:BF16 0 "register_operand")
+ [(match_operand:VF_AVX512BF16VL 0 "register_operand")
(match_operand:<bf16_cvt_2sf> 1 "register_operand")
- (match_operand:<bf16_cvt_2sf> 2 "register_operand")
+ (match_operand:<bf16_cvt_2sf> 2 "nonimmediate_operand")
(match_operand:<avx512fmaskmode> 3 "register_operand")]
"TARGET_AVX512BF16"
{
- emit_insn (gen_avx512f_cvtne2ps2bf16_<mode>_mask(operands[0], operands[1],
- operands[2], CONST0_RTX(<MODE>mode), operands[3]));
+ emit_insn (gen_avx512f_cvtne2ps2bf16_<mode>_mask(operands[0], operands[2],
+ operands[1], CONST0_RTX(<MODE>mode), operands[3]));
DONE;
})
(define_insn "avx512f_cvtne2ps2bf16_<mode><mask_name>"
- [(set (match_operand:BF16 0 "register_operand" "=v")
- (unspec:BF16
- [(match_operand:<bf16_cvt_2sf> 1 "register_operand" "v")
- (match_operand:<bf16_cvt_2sf> 2 "register_operand" "v")]
- UNSPEC_VCVTNE2PS2BF16))]
+ [(set (match_operand:VF_AVX512BF16VL 0 "register_operand" "=v")
+ (vec_concat:VF_AVX512BF16VL
+ (float_truncate:<ssehalfvecmode>
+ (match_operand:<bf16_cvt_2sf> 2 "nonimmediate_operand" "vm"))
+ (float_truncate:<ssehalfvecmode>
+ (match_operand:<bf16_cvt_2sf> 1 "register_operand" "v"))))]
"TARGET_AVX512BF16"
"vcvtne2ps2bf16\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}")
+(define_expand "avx512f_cvtneps2bf16_v4sf"
+ [(set (match_operand:V8BF 0 "register_operand")
+ (vec_concat:V8BF
+ (float_truncate:V4BF
+ (match_operand:V4SF 1 "nonimmediate_operand"))
+ (match_dup 2)))]
+ "TARGET_AVX512BF16 && TARGET_AVX512VL"
+ "operands[2] = CONST0_RTX (V4BFmode);")
+
+(define_insn "*avx512f_cvtneps2bf16_v4sf"
+ [(set (match_operand:V8BF 0 "register_operand" "=v")
+ (vec_concat:V8BF
+ (float_truncate:V4BF
+ (match_operand:V4SF 1 "nonimmediate_operand" "vm"))
+ (match_operand:V4BF 2 "const0_operand")))]
+ "TARGET_AVX512BF16 && TARGET_AVX512VL"
+ "vcvtneps2bf16{x}\t{%1, %0|%0, %1}")
+
+(define_expand "avx512f_cvtneps2bf16_v4sf_maskz"
+ [(match_operand:V8BF 0 "register_operand")
+ (match_operand:V4SF 1 "nonimmediate_operand")
+ (match_operand:QI 2 "register_operand")]
+ "TARGET_AVX512BF16 && TARGET_AVX512VL"
+{
+ emit_insn (gen_avx512f_cvtneps2bf16_v4sf_mask_1(operands[0], operands[1],
+ CONST0_RTX(V8BFmode), operands[2], CONST0_RTX(V4BFmode)));
+ DONE;
+})
+
+(define_expand "avx512f_cvtneps2bf16_v4sf_mask"
+ [(match_operand:V8BF 0 "register_operand")
+ (match_operand:V4SF 1 "nonimmediate_operand")
+ (match_operand:V8BF 2 "nonimm_or_0_operand")
+ (match_operand:QI 3 "register_operand")]
+ "TARGET_AVX512BF16 && TARGET_AVX512VL"
+{
+ emit_insn (gen_avx512f_cvtneps2bf16_v4sf_mask_1(operands[0], operands[1],
+ operands[2], operands[3], CONST0_RTX(V4BFmode)));
+ DONE;
+})
+
+(define_insn "avx512f_cvtneps2bf16_v4sf_mask_1"
+ [(set (match_operand:V8BF 0 "register_operand" "=v")
+ (vec_concat:V8BF
+ (vec_merge:V4BF
+ (float_truncate:V4BF
+ (match_operand:V4SF 1 "nonimmediate_operand" "vm"))
+ (vec_select:V4BF
+ (match_operand:V8BF 2 "nonimm_or_0_operand" "0C")
+ (parallel [(const_int 0) (const_int 1)
+ (const_int 2) (const_int 3)]))
+ (match_operand:QI 3 "register_operand" "Yk"))
+ (match_operand:V4BF 4 "const0_operand")))]
+ "TARGET_AVX512BF16 && TARGET_AVX512VL"
+ "vcvtneps2bf16{x}\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}")
+
+(define_mode_iterator VF1_AVX512_256 [V16SF (V8SF "TARGET_AVX512VL")])
+
(define_expand "avx512f_cvtneps2bf16_<mode>_maskz"
[(match_operand:<sf_cvt_bf16> 0 "register_operand")
- (match_operand:VF1_AVX512VL 1 "register_operand")
+ (match_operand:VF1_AVX512_256 1 "nonimmediate_operand")
(match_operand:<avx512fmaskmode> 2 "register_operand")]
"TARGET_AVX512BF16"
{
@@ -28963,11 +29021,10 @@
(define_insn "avx512f_cvtneps2bf16_<mode><mask_name>"
[(set (match_operand:<sf_cvt_bf16> 0 "register_operand" "=v")
- (unspec:<sf_cvt_bf16>
- [(match_operand:VF1_AVX512VL 1 "register_operand" "v")]
- UNSPEC_VCVTNEPS2BF16))]
+ (float_truncate:<sf_cvt_bf16>
+ (match_operand:VF1_AVX512_256 1 "nonimmediate_operand" "vm")))]
"TARGET_AVX512BF16"
- "vcvtneps2bf16\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}")
+ "vcvtneps2bf16<qq2phsuff>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}")
(define_expand "avx512f_dpbf16ps_<mode>_maskz"
[(match_operand:VF1_AVX512VL 0 "register_operand")
@@ -28987,7 +29044,7 @@
(unspec:VF1_AVX512VL
[(match_operand:VF1_AVX512VL 1 "register_operand" "0")
(match_operand:<sf_bf16> 2 "register_operand" "v")
- (match_operand:<sf_bf16> 3 "register_operand" "v")]
+ (match_operand:<sf_bf16> 3 "nonimmediate_operand" "vm")]
UNSPEC_VDPBF16PS))]
"TARGET_AVX512BF16"
"vdpbf16ps\t{%3, %2, %0<maskz_half_operand4>|%0<maskz_half_operand4>, %2, %3}")
@@ -28998,7 +29055,7 @@
(unspec:VF1_AVX512VL
[(match_operand:VF1_AVX512VL 1 "register_operand" "0")
(match_operand:<sf_bf16> 2 "register_operand" "v")
- (match_operand:<sf_bf16> 3 "register_operand" "v")]
+ (match_operand:<sf_bf16> 3 "nonimmediate_operand" "vm")]
UNSPEC_VDPBF16PS)
(match_dup 1)
(match_operand:<avx512fmaskhalfmode> 4 "register_operand" "Yk")))]
diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16-cvtsbh2ss-1.c b/gcc/testsuite/gcc.target/i386/avx512bf16-cvtsbh2ss-1.c
index 831abd37d80..8e929e6f159 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bf16-cvtsbh2ss-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bf16-cvtsbh2ss-1.c
@@ -1,6 +1,6 @@
/* { dg-do compile } */
/* { dg-options "-mavx512bf16 -O2" } */
-/* { dg-additional-options "-fno-PIE" { target ia32 } } */
+/* { dg-additional-options "-fno-PIE -mfpmath=sse" { target ia32 } } */
/* { dg-final { scan-assembler-times "sall\[ \\t\]+\[^\{\n\]*16" 1 } } */
/* { dg-final { scan-assembler-times "movl" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c b/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c
index b64ad7b84dd..02ebdd8cf5b 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c
@@ -1,6 +1,6 @@
/* { dg-do compile } */
/* { dg-options "-mavx512bf16 -O2" } */
-/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
#include <immintrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c b/gcc/testsuite/gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c
index 8f21b1bfdae..b71addd6301 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c
@@ -1,6 +1,6 @@
/* { dg-do compile } */
/* { dg-options "-mavx512bf16 -mavx512vl -O2" } */
-/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16x\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
#include <immintrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c b/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c
index 0969ae1b35e..d3a9bdf8c34 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c
@@ -1,11 +1,11 @@
/* { dg-do compile } */
/* { dg-options "-mavx512bf16 -mavx512vl -O2" } */
-/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16y\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16y\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16y\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16x\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16x\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16x\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
#include <immintrin.h>
--
2.27.0
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [PATCH] i386: using __bf16 for AVX512BF16 intrinsics
2022-10-28 6:20 ` [PATCH] i386: using __bf16 for AVX512BF16 intrinsics Kong, Lingling
@ 2022-10-28 6:29 ` Hongtao Liu
0 siblings, 0 replies; 2+ messages in thread
From: Hongtao Liu @ 2022-10-28 6:29 UTC (permalink / raw)
To: Kong, Lingling; +Cc: Liu, Hongtao, gcc-patches
On Fri, Oct 28, 2022 at 2:20 PM Kong, Lingling via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi,
>
> Previously we used unsigned short to represent bf16. That was not a good representation, and at the time the front end didn't support the bf16 type.
> Now we introduced __bf16 to X86 psABI. So we can switch intrinsics to the new type.
>
> Ok for trunk?
LGTM, but please don't commit it until next week to leave some time
for others to take a look.
Also please update GCC13 doc for it.
https://gcc.gnu.org/gcc-13/changes.html.
>
> Thanks,
> Lingling
>
> gcc/ChangeLog:
>
> * config/i386/avx512bf16intrin.h (__attribute__): Change short to bf16.
> (_mm_cvtsbh_ss): Ditto.
> (_mm512_cvtne2ps_pbh): Ditto.
> (_mm512_mask_cvtne2ps_pbh): Ditto.
> (_mm512_maskz_cvtne2ps_pbh): Ditto.
> * config/i386/avx512bf16vlintrin.h (__attribute__): Ditto.
> (_mm256_cvtne2ps_pbh): Ditto.
> (_mm256_mask_cvtne2ps_pbh): Ditto.
> (_mm256_maskz_cvtne2ps_pbh): Ditto.
> (_mm_cvtne2ps_pbh): Ditto.
> (_mm_mask_cvtne2ps_pbh): Ditto.
> (_mm_maskz_cvtne2ps_pbh): Ditto.
> (_mm_cvtness_sbh): Ditto.
> * config/i386/i386-builtin-types.def (V8BF): Add new
> DEF_VECTOR_TYPE for BFmode.
> (V16BF): Ditto.
> (V32BF): Ditto.
> * config/i386/i386-builtin.def (BDESC): Fixed builtins.
> * config/i386/i386-expand.cc (ix86_expand_args_builtin): Changed
> avx512bf16 ix86_builtin_func_type included HI to BF.
> * config/i386/immintrin.h: Add SSE2 depend for avx512bf16.
> * config/i386/sse.md (TARGET_AVX512VL): Changed HI vector to BF
> vector.
> (avx512f_cvtneps2bf16_v4sf): New define_expand.
> (*avx512f_cvtneps2bf16_v4sf): New define_insn.
> (avx512f_cvtneps2bf16_v4sf_maskz): Ditto.
> (avx512f_cvtneps2bf16_v4sf_mask): Ditto.
> (avx512f_cvtneps2bf16_v4sf_mask_1): Ditto.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/avx512bf16-cvtsbh2ss-1.c: Add fpmath option.
> * gcc.target/i386/avx512bf16-vdpbf16ps-2.c: Fixed
> scan-assembler.
> * gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c: Add x/y suffix
> for vcvtneps2bf16.
> * gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c: Ditto.
> ---
> gcc/config/i386/avx512bf16intrin.h | 12 +--
> gcc/config/i386/avx512bf16vlintrin.h | 29 ++---
> gcc/config/i386/i386-builtin-types.def | 51 ++++-----
> gcc/config/i386/i386-builtin.def | 54 +++++-----
> gcc/config/i386/i386-expand.cc | 48 ++++-----
> gcc/config/i386/immintrin.h | 2 +
> gcc/config/i386/sse.md | 101 ++++++++++++++----
> .../gcc.target/i386/avx512bf16-cvtsbh2ss-1.c | 2 +-
> .../gcc.target/i386/avx512bf16-vdpbf16ps-2.c | 2 +-
> .../i386/avx512bf16vl-cvtness2sbh-1.c | 2 +-
> .../i386/avx512bf16vl-vcvtneps2bf16-1.c | 12 +--
> 11 files changed, 189 insertions(+), 126 deletions(-)
>
> diff --git a/gcc/config/i386/avx512bf16intrin.h b/gcc/config/i386/avx512bf16intrin.h
> index b6e9ddad157..ea1d0125b3f 100644
> --- a/gcc/config/i386/avx512bf16intrin.h
> +++ b/gcc/config/i386/avx512bf16intrin.h
> @@ -35,16 +35,16 @@
> #endif /* __AVX512BF16__ */
>
> /* Internal data types for implementing the intrinsics. */
> -typedef short __v32bh __attribute__ ((__vector_size__ (64)));
> +typedef __bf16 __v32bf __attribute__ ((__vector_size__ (64)));
>
> /* The Intel API is flexible enough that we must allow aliasing with other
> vector types, and their scalar components. */
> -typedef short __m512bh __attribute__ ((__vector_size__ (64), __may_alias__));
> +typedef __bf16 __m512bh __attribute__ ((__vector_size__ (64), __may_alias__));
>
> /* Convert One BF16 Data to One Single Float Data. */
> extern __inline float
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_cvtsbh_ss (__bfloat16 __A)
> +_mm_cvtsbh_ss (__bf16 __A)
> {
> union{ float a; unsigned int b;} __tmp;
> __tmp.b = ((unsigned int)(__A)) << 16;
> @@ -57,21 +57,21 @@ extern __inline __m512bh
> __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_cvtne2ps_pbh (__m512 __A, __m512 __B)
> {
> - return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi(__A, __B);
> + return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf(__A, __B);
> }
>
> extern __inline __m512bh
> __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_mask_cvtne2ps_pbh (__m512bh __A, __mmask32 __B, __m512 __C, __m512 __D)
> {
> - return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_mask(__C, __D, __A, __B);
> + return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf_mask(__C, __D, __A, __B);
> }
>
> extern __inline __m512bh
> __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_maskz_cvtne2ps_pbh (__mmask32 __A, __m512 __B, __m512 __C)
> {
> - return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_maskz(__B, __C, __A);
> + return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf_maskz(__B, __C, __A);
> }
>
> /* vcvtneps2bf16 */
> diff --git a/gcc/config/i386/avx512bf16vlintrin.h b/gcc/config/i386/avx512bf16vlintrin.h
> index 969335ff358..56c28f14cf6 100644
> --- a/gcc/config/i386/avx512bf16vlintrin.h
> +++ b/gcc/config/i386/avx512bf16vlintrin.h
> @@ -35,57 +35,58 @@
> #endif /* __AVX512BF16__ */
>
> /* Internal data types for implementing the intrinsics. */
> -typedef short __v16bh __attribute__ ((__vector_size__ (32)));
> -typedef short __v8bh __attribute__ ((__vector_size__ (16)));
> +typedef __bf16 __v16bf __attribute__ ((__vector_size__ (32)));
> +typedef __bf16 __v8bf __attribute__ ((__vector_size__ (16)));
>
> /* The Intel API is flexible enough that we must allow aliasing with other
> vector types, and their scalar components. */
> -typedef short __m256bh __attribute__ ((__vector_size__ (32), __may_alias__));
> -typedef short __m128bh __attribute__ ((__vector_size__ (16), __may_alias__));
> +typedef __bf16 __m256bh __attribute__ ((__vector_size__ (32), __may_alias__));
> +typedef __bf16 __m128bh __attribute__ ((__vector_size__ (16), __may_alias__));
> +
> +typedef __bf16 __bfloat16;
>
> -typedef unsigned short __bfloat16;
> /* vcvtne2ps2bf16 */
>
> extern __inline __m256bh
> __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> _mm256_cvtne2ps_pbh (__m256 __A, __m256 __B)
> {
> - return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi(__A, __B);
> + return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf(__A, __B);
> }
>
> extern __inline __m256bh
> __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> _mm256_mask_cvtne2ps_pbh (__m256bh __A, __mmask16 __B, __m256 __C, __m256 __D)
> {
> - return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_mask(__C, __D, __A, __B);
> + return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf_mask(__C, __D, __A, __B);
> }
>
> extern __inline __m256bh
> __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> _mm256_maskz_cvtne2ps_pbh (__mmask16 __A, __m256 __B, __m256 __C)
> {
> - return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_maskz(__B, __C, __A);
> + return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf_maskz(__B, __C, __A);
> }
>
> extern __inline __m128bh
> __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> _mm_cvtne2ps_pbh (__m128 __A, __m128 __B)
> {
> - return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi(__A, __B);
> + return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf(__A, __B);
> }
>
> extern __inline __m128bh
> __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> _mm_mask_cvtne2ps_pbh (__m128bh __A, __mmask8 __B, __m128 __C, __m128 __D)
> {
> - return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_mask(__C, __D, __A, __B);
> + return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf_mask(__C, __D, __A, __B);
> }
>
> extern __inline __m128bh
> __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> _mm_maskz_cvtne2ps_pbh (__mmask8 __A, __m128 __B, __m128 __C)
> {
> - return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_maskz(__B, __C, __A);
> + return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf_maskz(__B, __C, __A);
> }
>
> /* vcvtneps2bf16 */
> @@ -176,13 +177,13 @@ _mm_maskz_dpbf16_ps (__mmask8 __A, __m128 __B, __m128bh __C, __m128bh __D)
> return (__m128)__builtin_ia32_dpbf16ps_v4sf_maskz(__B, __C, __D, __A);
> }
>
> -extern __inline __bfloat16
> +extern __inline __bf16
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm_cvtness_sbh (float __A)
> {
> __v4sf __V = {__A, 0, 0, 0};
> - __v8hi __R = __builtin_ia32_cvtneps2bf16_v4sf_mask ((__v4sf)__V,
> - (__v8hi)_mm_undefined_si128 (), (__mmask8)-1);
> + __v8bf __R = __builtin_ia32_cvtneps2bf16_v4sf_mask ((__v4sf)__V,
> + (__v8bf)_mm_undefined_si128 (), (__mmask8)-1);
> return __R[0];
> }
>
> diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
> index 63a360b0f8b..aedae2d7750 100644
> --- a/gcc/config/i386/i386-builtin-types.def
> +++ b/gcc/config/i386/i386-builtin-types.def
> @@ -87,6 +87,7 @@ DEF_VECTOR_TYPE (V8QI, QI)
> DEF_VECTOR_TYPE (V2DF, DOUBLE)
> DEF_VECTOR_TYPE (V4SF, FLOAT)
> DEF_VECTOR_TYPE (V8HF, FLOAT16)
> +DEF_VECTOR_TYPE (V8BF, BFLOAT16)
> DEF_VECTOR_TYPE (V2DI, DI)
> DEF_VECTOR_TYPE (V4SI, SI)
> DEF_VECTOR_TYPE (V8HI, HI)
> @@ -100,6 +101,7 @@ DEF_VECTOR_TYPE (V16UQI, UQI, V16QI)
> DEF_VECTOR_TYPE (V4DF, DOUBLE)
> DEF_VECTOR_TYPE (V8SF, FLOAT)
> DEF_VECTOR_TYPE (V16HF, FLOAT16)
> +DEF_VECTOR_TYPE (V16BF, BFLOAT16)
> DEF_VECTOR_TYPE (V4DI, DI)
> DEF_VECTOR_TYPE (V8SI, SI)
> DEF_VECTOR_TYPE (V16HI, HI)
> @@ -111,6 +113,7 @@ DEF_VECTOR_TYPE (V16UHI, UHI, V16HI)
> # AVX512F vectors
> DEF_VECTOR_TYPE (V32SF, FLOAT)
> DEF_VECTOR_TYPE (V32HF, FLOAT16)
> +DEF_VECTOR_TYPE (V32BF, BFLOAT16)
> DEF_VECTOR_TYPE (V16SF, FLOAT)
> DEF_VECTOR_TYPE (V8DF, DOUBLE)
> DEF_VECTOR_TYPE (V8DI, DI)
> @@ -1273,30 +1276,30 @@ DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI, UHI)
> DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI, UHI)
>
> # BF16 builtins
> -DEF_FUNCTION_TYPE (V32HI, V16SF, V16SF)
> -DEF_FUNCTION_TYPE (V32HI, V16SF, V16SF, V32HI, USI)
> -DEF_FUNCTION_TYPE (V32HI, V16SF, V16SF, USI)
> -DEF_FUNCTION_TYPE (V16HI, V8SF, V8SF)
> -DEF_FUNCTION_TYPE (V16HI, V8SF, V8SF, V16HI, UHI)
> -DEF_FUNCTION_TYPE (V16HI, V8SF, V8SF, UHI)
> -DEF_FUNCTION_TYPE (V8HI, V4SF, V4SF)
> -DEF_FUNCTION_TYPE (V8HI, V4SF, V4SF, V8HI, UQI)
> -DEF_FUNCTION_TYPE (V8HI, V4SF, V4SF, UQI)
> -DEF_FUNCTION_TYPE (V16HI, V16SF)
> -DEF_FUNCTION_TYPE (V16HI, V16SF, V16HI, UHI)
> -DEF_FUNCTION_TYPE (V16HI, V16SF, UHI)
> -DEF_FUNCTION_TYPE (V8HI, V8SF)
> -DEF_FUNCTION_TYPE (V8HI, V8SF, V8HI, UQI)
> -DEF_FUNCTION_TYPE (V8HI, V8SF, UQI)
> -DEF_FUNCTION_TYPE (V8HI, V4SF)
> -DEF_FUNCTION_TYPE (V8HI, V4SF, V8HI, UQI)
> -DEF_FUNCTION_TYPE (V8HI, V4SF, UQI)
> -DEF_FUNCTION_TYPE (V16SF, V16SF, V32HI, V32HI)
> -DEF_FUNCTION_TYPE (V16SF, V16SF, V32HI, V32HI, UHI)
> -DEF_FUNCTION_TYPE (V8SF, V8SF, V16HI, V16HI)
> -DEF_FUNCTION_TYPE (V8SF, V8SF, V16HI, V16HI, UQI)
> -DEF_FUNCTION_TYPE (V4SF, V4SF, V8HI, V8HI)
> -DEF_FUNCTION_TYPE (V4SF, V4SF, V8HI, V8HI, UQI)
> +DEF_FUNCTION_TYPE (V32BF, V16SF, V16SF)
> +DEF_FUNCTION_TYPE (V32BF, V16SF, V16SF, V32BF, USI)
> +DEF_FUNCTION_TYPE (V32BF, V16SF, V16SF, USI)
> +DEF_FUNCTION_TYPE (V16BF, V8SF, V8SF)
> +DEF_FUNCTION_TYPE (V16BF, V8SF, V8SF, V16BF, UHI)
> +DEF_FUNCTION_TYPE (V16BF, V8SF, V8SF, UHI)
> +DEF_FUNCTION_TYPE (V8BF, V4SF, V4SF)
> +DEF_FUNCTION_TYPE (V8BF, V4SF, V4SF, V8BF, UQI)
> +DEF_FUNCTION_TYPE (V8BF, V4SF, V4SF, UQI)
> +DEF_FUNCTION_TYPE (V16BF, V16SF)
> +DEF_FUNCTION_TYPE (V16BF, V16SF, V16BF, UHI)
> +DEF_FUNCTION_TYPE (V16BF, V16SF, UHI)
> +DEF_FUNCTION_TYPE (V8BF, V8SF)
> +DEF_FUNCTION_TYPE (V8BF, V8SF, V8BF, UQI)
> +DEF_FUNCTION_TYPE (V8BF, V8SF, UQI)
> +DEF_FUNCTION_TYPE (V8BF, V4SF)
> +DEF_FUNCTION_TYPE (V8BF, V4SF, V8BF, UQI)
> +DEF_FUNCTION_TYPE (V8BF, V4SF, UQI)
> +DEF_FUNCTION_TYPE (V16SF, V16SF, V32BF, V32BF)
> +DEF_FUNCTION_TYPE (V16SF, V16SF, V32BF, V32BF, UHI)
> +DEF_FUNCTION_TYPE (V8SF, V8SF, V16BF, V16BF)
> +DEF_FUNCTION_TYPE (V8SF, V8SF, V16BF, V16BF, UQI)
> +DEF_FUNCTION_TYPE (V4SF, V4SF, V8BF, V8BF)
> +DEF_FUNCTION_TYPE (V4SF, V4SF, V8BF, V8BF, UQI)
>
> # KEYLOCKER builtins
> DEF_FUNCTION_TYPE (UINT, UINT, V2DI, V2DI, PVOID)
> diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
> index e35306e27d0..5802e2049a8 100644
> --- a/gcc/config/i386/i386-builtin.def
> +++ b/gcc/config/i386/i386-builtin.def
> @@ -2779,33 +2779,33 @@ BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v32qi, "__builtin_ia32_vae
> BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v64qi, "__builtin_ia32_vaesenclast_v64qi", IX86_BUILTIN_VAESENCLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI)
>
> /* BF16 */
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi, "__builtin_ia32_cvtne2ps2bf16_v32hi", IX86_BUILTIN_CVTNE2PS2HI16_V32HI, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi_mask, "__builtin_ia32_cvtne2ps2bf16_v32hi_mask", IX86_BUILTIN_CVTNE2PS2HI16_V32HI_MASK, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF_V32HI_USI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi_maskz, "__builtin_ia32_cvtne2ps2bf16_v32hi_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V32HI_MASKZ, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF_USI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16hi, "__builtin_ia32_cvtne2ps2bf16_v16hi", IX86_BUILTIN_CVTNE2PS2HI16_V16HI, UNKNOWN, (int) V16HI_FTYPE_V8SF_V8SF)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16hi_mask, "__builtin_ia32_cvtne2ps2bf16_v16hi_mask", IX86_BUILTIN_CVTNE2PS2HI16_V16HI_MASK, UNKNOWN, (int) V16HI_FTYPE_V8SF_V8SF_V16HI_UHI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16hi_maskz, "__builtin_ia32_cvtne2ps2bf16_v16hi_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V16HI_MASKZ, UNKNOWN, (int) V16HI_FTYPE_V8SF_V8SF_UHI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8hi, "__builtin_ia32_cvtne2ps2bf16_v8hi", IX86_BUILTIN_CVTNE2PS2HI16_V8HI, UNKNOWN, (int) V8HI_FTYPE_V4SF_V4SF)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8hi_mask, "__builtin_ia32_cvtne2ps2bf16_v8hi_mask", IX86_BUILTIN_CVTNE2PS2HI16_V8HI_MASK, UNKNOWN, (int) V8HI_FTYPE_V4SF_V4SF_V8HI_UQI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8hi_maskz, "__builtin_ia32_cvtne2ps2bf16_v8hi_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V8HI_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V4SF_V4SF_UQI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf, "__builtin_ia32_cvtneps2bf16_v16sf", IX86_BUILTIN_CVTNEPS2HI16_V16SF, UNKNOWN, (int) V16HI_FTYPE_V16SF)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_mask, "__builtin_ia32_cvtneps2bf16_v16sf_mask", IX86_BUILTIN_CVTNEPS2HI16_V16SF_MASK, UNKNOWN, (int) V16HI_FTYPE_V16SF_V16HI_UHI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_maskz, "__builtin_ia32_cvtneps2bf16_v16sf_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V16SF_MASKZ, UNKNOWN, (int) V16HI_FTYPE_V16SF_UHI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf, "__builtin_ia32_cvtneps2bf16_v8sf", IX86_BUILTIN_CVTNEPS2HI16_V8SF, UNKNOWN, (int) V8HI_FTYPE_V8SF)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_mask, "__builtin_ia32_cvtneps2bf16_v8sf_mask", IX86_BUILTIN_CVTNEPS2HI16_V8SF_MASK, UNKNOWN, (int) V8HI_FTYPE_V8SF_V8HI_UQI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_maskz, "__builtin_ia32_cvtneps2bf16_v8sf_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V8SF_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V8SF_UQI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf, "__builtin_ia32_cvtneps2bf16_v4sf", IX86_BUILTIN_CVTNEPS2HI16_V4SF, UNKNOWN, (int) V8HI_FTYPE_V4SF)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_mask, "__builtin_ia32_cvtneps2bf16_v4sf_mask", IX86_BUILTIN_CVTNEPS2HI16_V4SF_MASK, UNKNOWN, (int) V8HI_FTYPE_V4SF_V8HI_UQI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_maskz, "__builtin_ia32_cvtneps2bf16_v4sf_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V4SF_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V4SF_UQI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf, "__builtin_ia32_dpbf16ps_v16sf", IX86_BUILTIN_DPHI16PS_V16SF, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32HI_V32HI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_mask, "__builtin_ia32_dpbf16ps_v16sf_mask", IX86_BUILTIN_DPHI16PS_V16SF_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32HI_V32HI_UHI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_maskz, "__builtin_ia32_dpbf16ps_v16sf_maskz", IX86_BUILTIN_DPHI16PS_V16SF_MASKZ, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32HI_V32HI_UHI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf, "__builtin_ia32_dpbf16ps_v8sf", IX86_BUILTIN_DPHI16PS_V8SF, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16HI_V16HI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_mask, "__builtin_ia32_dpbf16ps_v8sf_mask", IX86_BUILTIN_DPHI16PS_V8SF_MASK, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16HI_V16HI_UQI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_maskz, "__builtin_ia32_dpbf16ps_v8sf_maskz", IX86_BUILTIN_DPHI16PS_V8SF_MASKZ, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16HI_V16HI_UQI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf, "__builtin_ia32_dpbf16ps_v4sf", IX86_BUILTIN_DPHI16PS_V4SF, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_mask, "__builtin_ia32_dpbf16ps_v4sf_mask", IX86_BUILTIN_DPHI16PS_V4SF_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, "__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPHI16PS_V4SF_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32bf, "__builtin_ia32_cvtne2ps2bf16_v32bf", IX86_BUILTIN_CVTNE2PS2BF16_V32BF, UNKNOWN, (int) V32BF_FTYPE_V16SF_V16SF)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32bf_mask, "__builtin_ia32_cvtne2ps2bf16_v32bf_mask", IX86_BUILTIN_CVTNE2PS2BF16_V32BF_MASK, UNKNOWN, (int) V32BF_FTYPE_V16SF_V16SF_V32BF_USI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32bf_maskz, "__builtin_ia32_cvtne2ps2bf16_v32bf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V32BF_MASKZ, UNKNOWN, (int) V32BF_FTYPE_V16SF_V16SF_USI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16bf, "__builtin_ia32_cvtne2ps2bf16_v16bf", IX86_BUILTIN_CVTNE2PS2BF16_V16BF, UNKNOWN, (int) V16BF_FTYPE_V8SF_V8SF)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16bf_mask, "__builtin_ia32_cvtne2ps2bf16_v16bf_mask", IX86_BUILTIN_CVTNE2PS2BF16_V16BF_MASK, UNKNOWN, (int) V16BF_FTYPE_V8SF_V8SF_V16BF_UHI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16bf_maskz, "__builtin_ia32_cvtne2ps2bf16_v16bf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V16BF_MASKZ, UNKNOWN, (int) V16BF_FTYPE_V8SF_V8SF_UHI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8bf, "__builtin_ia32_cvtne2ps2bf16_v8bf", IX86_BUILTIN_CVTNE2PS2BF16_V8BF, UNKNOWN, (int) V8BF_FTYPE_V4SF_V4SF)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8bf_mask, "__builtin_ia32_cvtne2ps2bf16_v8bf_mask", IX86_BUILTIN_CVTNE2PS2BF16_V8BF_MASK, UNKNOWN, (int) V8BF_FTYPE_V4SF_V4SF_V8BF_UQI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8bf_maskz, "__builtin_ia32_cvtne2ps2bf16_v8bf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V8BF_MASKZ, UNKNOWN, (int) V8BF_FTYPE_V4SF_V4SF_UQI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf, "__builtin_ia32_cvtneps2bf16_v16sf", IX86_BUILTIN_CVTNEPS2BF16_V16SF, UNKNOWN, (int) V16BF_FTYPE_V16SF)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_mask, "__builtin_ia32_cvtneps2bf16_v16sf_mask", IX86_BUILTIN_CVTNEPS2BF16_V16SF_MASK, UNKNOWN, (int) V16BF_FTYPE_V16SF_V16BF_UHI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_maskz, "__builtin_ia32_cvtneps2bf16_v16sf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V16SF_MASKZ, UNKNOWN, (int) V16BF_FTYPE_V16SF_UHI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf, "__builtin_ia32_cvtneps2bf16_v8sf", IX86_BUILTIN_CVTNEPS2BF16_V8SF, UNKNOWN, (int) V8BF_FTYPE_V8SF)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_mask, "__builtin_ia32_cvtneps2bf16_v8sf_mask", IX86_BUILTIN_CVTNEPS2BF16_V8SF_MASK, UNKNOWN, (int) V8BF_FTYPE_V8SF_V8BF_UQI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_maskz, "__builtin_ia32_cvtneps2bf16_v8sf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V8SF_MASKZ, UNKNOWN, (int) V8BF_FTYPE_V8SF_UQI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf, "__builtin_ia32_cvtneps2bf16_v4sf", IX86_BUILTIN_CVTNEPS2BF16_V4SF, UNKNOWN, (int) V8BF_FTYPE_V4SF)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_mask, "__builtin_ia32_cvtneps2bf16_v4sf_mask", IX86_BUILTIN_CVTNEPS2BF16_V4SF_MASK, UNKNOWN, (int) V8BF_FTYPE_V4SF_V8BF_UQI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_maskz, "__builtin_ia32_cvtneps2bf16_v4sf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V4SF_MASKZ, UNKNOWN, (int) V8BF_FTYPE_V4SF_UQI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf, "__builtin_ia32_dpbf16ps_v16sf", IX86_BUILTIN_DPBF16PS_V16SF, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32BF_V32BF)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_mask, "__builtin_ia32_dpbf16ps_v16sf_mask", IX86_BUILTIN_DPBF16PS_V16SF_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32BF_V32BF_UHI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_maskz, "__builtin_ia32_dpbf16ps_v16sf_maskz", IX86_BUILTIN_DPBF16PS_V16SF_MASKZ, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32BF_V32BF_UHI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf, "__builtin_ia32_dpbf16ps_v8sf", IX86_BUILTIN_DPBF16PS_V8SF, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16BF_V16BF)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_mask, "__builtin_ia32_dpbf16ps_v8sf_mask", IX86_BUILTIN_DPBF16PS_V8SF_MASK, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16BF_V16BF_UQI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_maskz, "__builtin_ia32_dpbf16ps_v8sf_maskz", IX86_BUILTIN_DPBF16PS_V8SF_MASKZ, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16BF_V16BF_UQI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf, "__builtin_ia32_dpbf16ps_v4sf", IX86_BUILTIN_DPBF16PS_V4SF, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8BF_V8BF)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_mask, "__builtin_ia32_dpbf16ps_v4sf_mask", IX86_BUILTIN_DPBF16PS_V4SF_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8BF_V8BF_UQI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, "__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPBF16PS_V4SF_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8BF_V8BF_UQI)
>
> /* AVX512FP16. */
> BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv8hf3_mask, "__builtin_ia32_addph128_mask", IX86_BUILTIN_ADDPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index 5d9e5a12f7e..8e1ef0b4c4a 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -10462,9 +10462,9 @@ ix86_expand_args_builtin (const struct builtin_description *d,
> case V8DF_FTYPE_V2DF:
> case V8DF_FTYPE_V8DF:
> case V4DI_FTYPE_V4DI:
> - case V16HI_FTYPE_V16SF:
> - case V8HI_FTYPE_V8SF:
> - case V8HI_FTYPE_V4SF:
> + case V16BF_FTYPE_V16SF:
> + case V8BF_FTYPE_V8SF:
> + case V8BF_FTYPE_V4SF:
> nargs = 1;
> break;
> case V4SF_FTYPE_V4SF_VEC_MERGE:
> @@ -10592,12 +10592,12 @@ ix86_expand_args_builtin (const struct builtin_description *d,
> case USI_FTYPE_USI_USI:
> case UDI_FTYPE_UDI_UDI:
> case V16SI_FTYPE_V8DF_V8DF:
> - case V32HI_FTYPE_V16SF_V16SF:
> - case V16HI_FTYPE_V8SF_V8SF:
> - case V8HI_FTYPE_V4SF_V4SF:
> - case V16HI_FTYPE_V16SF_UHI:
> - case V8HI_FTYPE_V8SF_UQI:
> - case V8HI_FTYPE_V4SF_UQI:
> + case V32BF_FTYPE_V16SF_V16SF:
> + case V16BF_FTYPE_V8SF_V8SF:
> + case V8BF_FTYPE_V4SF_V4SF:
> + case V16BF_FTYPE_V16SF_UHI:
> + case V8BF_FTYPE_V8SF_UQI:
> + case V8BF_FTYPE_V4SF_UQI:
> nargs = 2;
> break;
> case V2DI_FTYPE_V2DI_INT_CONVERT:
> @@ -10803,15 +10803,15 @@ ix86_expand_args_builtin (const struct builtin_description *d,
> case V16HI_FTYPE_V16HI_V16HI_V16HI:
> case V8SI_FTYPE_V8SI_V8SI_V8SI:
> case V8HI_FTYPE_V8HI_V8HI_V8HI:
> - case V32HI_FTYPE_V16SF_V16SF_USI:
> - case V16HI_FTYPE_V8SF_V8SF_UHI:
> - case V8HI_FTYPE_V4SF_V4SF_UQI:
> - case V16HI_FTYPE_V16SF_V16HI_UHI:
> - case V8HI_FTYPE_V8SF_V8HI_UQI:
> - case V8HI_FTYPE_V4SF_V8HI_UQI:
> - case V16SF_FTYPE_V16SF_V32HI_V32HI:
> - case V8SF_FTYPE_V8SF_V16HI_V16HI:
> - case V4SF_FTYPE_V4SF_V8HI_V8HI:
> + case V32BF_FTYPE_V16SF_V16SF_USI:
> + case V16BF_FTYPE_V8SF_V8SF_UHI:
> + case V8BF_FTYPE_V4SF_V4SF_UQI:
> + case V16BF_FTYPE_V16SF_V16BF_UHI:
> + case V8BF_FTYPE_V8SF_V8BF_UQI:
> + case V8BF_FTYPE_V4SF_V8BF_UQI:
> + case V16SF_FTYPE_V16SF_V32BF_V32BF:
> + case V8SF_FTYPE_V8SF_V16BF_V16BF:
> + case V4SF_FTYPE_V4SF_V8BF_V8BF:
> nargs = 3;
> break;
> case V32QI_FTYPE_V32QI_V32QI_INT:
> @@ -10958,9 +10958,9 @@ ix86_expand_args_builtin (const struct builtin_description *d,
> case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
> case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
> case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
> - case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
> - case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
> - case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
> + case V32BF_FTYPE_V16SF_V16SF_V32BF_USI:
> + case V16BF_FTYPE_V8SF_V8SF_V16BF_UHI:
> + case V8BF_FTYPE_V4SF_V4SF_V8BF_UQI:
> nargs = 4;
> break;
> case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
> @@ -10998,9 +10998,9 @@ ix86_expand_args_builtin (const struct builtin_description *d,
> break;
> case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
> case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
> - case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
> - case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
> - case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
> + case V16SF_FTYPE_V16SF_V32BF_V32BF_UHI:
> + case V8SF_FTYPE_V8SF_V16BF_V16BF_UQI:
> + case V4SF_FTYPE_V4SF_V8BF_V8BF_UQI:
> nargs = 4;
> break;
> case UQI_FTYPE_V8DI_V8DI_INT_UQI:
> diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h
> index ddea249d09b..c62d50f1951 100644
> --- a/gcc/config/i386/immintrin.h
> +++ b/gcc/config/i386/immintrin.h
> @@ -118,9 +118,11 @@
>
> #include <vpclmulqdqintrin.h>
>
> +#ifdef __SSE2__
> #include <avx512bf16vlintrin.h>
>
> #include <avx512bf16intrin.h>
> +#endif
>
> #include <amxtileintrin.h>
>
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index f4b5506703f..fba81a93c1a 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -187,8 +187,6 @@
> UNSPEC_VP2INTERSECT
>
> ;; For AVX512BF16 support
> - UNSPEC_VCVTNE2PS2BF16
> - UNSPEC_VCVTNEPS2BF16
> UNSPEC_VDPBF16PS
>
> ;; For AVX512FP16 suppport
> @@ -28918,41 +28916,101 @@
> "vp2intersectd\t{%2, %1, %0|%0, %1, %2}"
> [(set_attr ("prefix") ("evex"))])
>
> -(define_mode_iterator BF16 [V32HI (V16HI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL")])
> +(define_mode_iterator VF_AVX512BF16VL
> + [V32BF (V16BF "TARGET_AVX512VL") (V8BF "TARGET_AVX512VL")])
> ;; Converting from BF to SF
> (define_mode_attr bf16_cvt_2sf
> - [(V32HI "V16SF") (V16HI "V8SF") (V8HI "V4SF")])
> + [(V32BF "V16SF") (V16BF "V8SF") (V8BF "V4SF")])
> ;; Converting from SF to BF
> (define_mode_attr sf_cvt_bf16
> - [(V4SF "V8HI") (V8SF "V8HI") (V16SF "V16HI")])
> + [(V8SF "V8BF") (V16SF "V16BF")])
> ;; Mapping from BF to SF
> (define_mode_attr sf_bf16
> - [(V4SF "V8HI") (V8SF "V16HI") (V16SF "V32HI")])
> + [(V4SF "V8BF") (V8SF "V16BF") (V16SF "V32BF")])
>
> (define_expand "avx512f_cvtne2ps2bf16_<mode>_maskz"
> - [(match_operand:BF16 0 "register_operand")
> + [(match_operand:VF_AVX512BF16VL 0 "register_operand")
> (match_operand:<bf16_cvt_2sf> 1 "register_operand")
> - (match_operand:<bf16_cvt_2sf> 2 "register_operand")
> + (match_operand:<bf16_cvt_2sf> 2 "nonimmediate_operand")
> (match_operand:<avx512fmaskmode> 3 "register_operand")]
> "TARGET_AVX512BF16"
> {
> - emit_insn (gen_avx512f_cvtne2ps2bf16_<mode>_mask(operands[0], operands[1],
> - operands[2], CONST0_RTX(<MODE>mode), operands[3]));
> + emit_insn (gen_avx512f_cvtne2ps2bf16_<mode>_mask(operands[0], operands[2],
> + operands[1], CONST0_RTX(<MODE>mode), operands[3]));
> DONE;
> })
>
> (define_insn "avx512f_cvtne2ps2bf16_<mode><mask_name>"
> - [(set (match_operand:BF16 0 "register_operand" "=v")
> - (unspec:BF16
> - [(match_operand:<bf16_cvt_2sf> 1 "register_operand" "v")
> - (match_operand:<bf16_cvt_2sf> 2 "register_operand" "v")]
> - UNSPEC_VCVTNE2PS2BF16))]
> + [(set (match_operand:VF_AVX512BF16VL 0 "register_operand" "=v")
> + (vec_concat:VF_AVX512BF16VL
> + (float_truncate:<ssehalfvecmode>
> + (match_operand:<bf16_cvt_2sf> 2 "nonimmediate_operand" "vm"))
> + (float_truncate:<ssehalfvecmode>
> + (match_operand:<bf16_cvt_2sf> 1 "register_operand" "v"))))]
> "TARGET_AVX512BF16"
> "vcvtne2ps2bf16\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}")
>
> +(define_expand "avx512f_cvtneps2bf16_v4sf"
> + [(set (match_operand:V8BF 0 "register_operand")
> + (vec_concat:V8BF
> + (float_truncate:V4BF
> + (match_operand:V4SF 1 "nonimmediate_operand"))
> + (match_dup 2)))]
> + "TARGET_AVX512BF16 && TARGET_AVX512VL"
> + "operands[2] = CONST0_RTX (V4BFmode);")
> +
> +(define_insn "*avx512f_cvtneps2bf16_v4sf"
> + [(set (match_operand:V8BF 0 "register_operand" "=v")
> + (vec_concat:V8BF
> + (float_truncate:V4BF
> + (match_operand:V4SF 1 "nonimmediate_operand" "vm"))
> + (match_operand:V4BF 2 "const0_operand")))]
> + "TARGET_AVX512BF16 && TARGET_AVX512VL"
> + "vcvtneps2bf16{x}\t{%1, %0|%0, %1}")
> +
> +(define_expand "avx512f_cvtneps2bf16_v4sf_maskz"
> + [(match_operand:V8BF 0 "register_operand")
> + (match_operand:V4SF 1 "nonimmediate_operand")
> + (match_operand:QI 2 "register_operand")]
> + "TARGET_AVX512BF16 && TARGET_AVX512VL"
> +{
> + emit_insn (gen_avx512f_cvtneps2bf16_v4sf_mask_1(operands[0], operands[1],
> + CONST0_RTX(V8BFmode), operands[2], CONST0_RTX(V4BFmode)));
> + DONE;
> +})
> +
> +(define_expand "avx512f_cvtneps2bf16_v4sf_mask"
> + [(match_operand:V8BF 0 "register_operand")
> + (match_operand:V4SF 1 "nonimmediate_operand")
> + (match_operand:V8BF 2 "nonimm_or_0_operand")
> + (match_operand:QI 3 "register_operand")]
> + "TARGET_AVX512BF16 && TARGET_AVX512VL"
> +{
> + emit_insn (gen_avx512f_cvtneps2bf16_v4sf_mask_1(operands[0], operands[1],
> + operands[2], operands[3], CONST0_RTX(V4BFmode)));
> + DONE;
> +})
> +
> +(define_insn "avx512f_cvtneps2bf16_v4sf_mask_1"
> + [(set (match_operand:V8BF 0 "register_operand" "=v")
> + (vec_concat:V8BF
> + (vec_merge:V4BF
> + (float_truncate:V4BF
> + (match_operand:V4SF 1 "nonimmediate_operand" "vm"))
> + (vec_select:V4BF
> + (match_operand:V8BF 2 "nonimm_or_0_operand" "0C")
> + (parallel [(const_int 0) (const_int 1)
> + (const_int 2) (const_int 3)]))
> + (match_operand:QI 3 "register_operand" "Yk"))
> + (match_operand:V4BF 4 "const0_operand")))]
> + "TARGET_AVX512BF16 && TARGET_AVX512VL"
> + "vcvtneps2bf16{x}\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}")
> +
> +(define_mode_iterator VF1_AVX512_256 [V16SF (V8SF "TARGET_AVX512VL")])
> +
> (define_expand "avx512f_cvtneps2bf16_<mode>_maskz"
> [(match_operand:<sf_cvt_bf16> 0 "register_operand")
> - (match_operand:VF1_AVX512VL 1 "register_operand")
> + (match_operand:VF1_AVX512_256 1 "nonimmediate_operand")
> (match_operand:<avx512fmaskmode> 2 "register_operand")]
> "TARGET_AVX512BF16"
> {
> @@ -28963,11 +29021,10 @@
>
> (define_insn "avx512f_cvtneps2bf16_<mode><mask_name>"
> [(set (match_operand:<sf_cvt_bf16> 0 "register_operand" "=v")
> - (unspec:<sf_cvt_bf16>
> - [(match_operand:VF1_AVX512VL 1 "register_operand" "v")]
> - UNSPEC_VCVTNEPS2BF16))]
> + (float_truncate:<sf_cvt_bf16>
> + (match_operand:VF1_AVX512_256 1 "nonimmediate_operand" "vm")))]
> "TARGET_AVX512BF16"
> - "vcvtneps2bf16\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}")
> + "vcvtneps2bf16<qq2phsuff>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}")
>
> (define_expand "avx512f_dpbf16ps_<mode>_maskz"
> [(match_operand:VF1_AVX512VL 0 "register_operand")
> @@ -28987,7 +29044,7 @@
> (unspec:VF1_AVX512VL
> [(match_operand:VF1_AVX512VL 1 "register_operand" "0")
> (match_operand:<sf_bf16> 2 "register_operand" "v")
> - (match_operand:<sf_bf16> 3 "register_operand" "v")]
> + (match_operand:<sf_bf16> 3 "nonimmediate_operand" "vm")]
> UNSPEC_VDPBF16PS))]
> "TARGET_AVX512BF16"
> "vdpbf16ps\t{%3, %2, %0<maskz_half_operand4>|%0<maskz_half_operand4>, %2, %3}")
> @@ -28998,7 +29055,7 @@
> (unspec:VF1_AVX512VL
> [(match_operand:VF1_AVX512VL 1 "register_operand" "0")
> (match_operand:<sf_bf16> 2 "register_operand" "v")
> - (match_operand:<sf_bf16> 3 "register_operand" "v")]
> + (match_operand:<sf_bf16> 3 "nonimmediate_operand" "vm")]
> UNSPEC_VDPBF16PS)
> (match_dup 1)
> (match_operand:<avx512fmaskhalfmode> 4 "register_operand" "Yk")))]
> diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16-cvtsbh2ss-1.c b/gcc/testsuite/gcc.target/i386/avx512bf16-cvtsbh2ss-1.c
> index 831abd37d80..8e929e6f159 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512bf16-cvtsbh2ss-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512bf16-cvtsbh2ss-1.c
> @@ -1,6 +1,6 @@
> /* { dg-do compile } */
> /* { dg-options "-mavx512bf16 -O2" } */
> -/* { dg-additional-options "-fno-PIE" { target ia32 } } */
> +/* { dg-additional-options "-fno-PIE -mfpmath=sse" { target ia32 } } */
> /* { dg-final { scan-assembler-times "sall\[ \\t\]+\[^\{\n\]*16" 1 } } */
> /* { dg-final { scan-assembler-times "movl" 1 } } */
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c b/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c
> index b64ad7b84dd..02ebdd8cf5b 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c
> @@ -1,6 +1,6 @@
> /* { dg-do compile } */
> /* { dg-options "-mavx512bf16 -O2" } */
> -/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
>
> #include <immintrin.h>
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c b/gcc/testsuite/gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c
> index 8f21b1bfdae..b71addd6301 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c
> @@ -1,6 +1,6 @@
> /* { dg-do compile } */
> /* { dg-options "-mavx512bf16 -mavx512vl -O2" } */
> -/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vcvtneps2bf16x\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>
> #include <immintrin.h>
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c b/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c
> index 0969ae1b35e..d3a9bdf8c34 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c
> @@ -1,11 +1,11 @@
> /* { dg-do compile } */
> /* { dg-options "-mavx512bf16 -mavx512vl -O2" } */
> -/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vcvtneps2bf16y\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vcvtneps2bf16y\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vcvtneps2bf16y\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vcvtneps2bf16x\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vcvtneps2bf16x\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vcvtneps2bf16x\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
>
> #include <immintrin.h>
>
> --
> 2.27.0
>
--
BR,
Hongtao
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2022-10-28 6:26 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
[not found] <20221028060808.1637178-1-lingling.kong@intel.com>
2022-10-28 6:20 ` [PATCH] i386: using __bf16 for AVX512BF16 intrinsics Kong, Lingling
2022-10-28 6:29 ` Hongtao Liu
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).