public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] i386: using __bf16 for AVX512BF16 intrinsics
       [not found] <20221028060808.1637178-1-lingling.kong@intel.com>
@ 2022-10-28  6:20 ` Kong, Lingling
  2022-10-28  6:29   ` Hongtao Liu
  0 siblings, 1 reply; 2+ messages in thread
From: Kong, Lingling @ 2022-10-28  6:20 UTC (permalink / raw)
  To: Liu, Hongtao, gcc-patches

Hi,

Previously we used unsigned short to represent bf16. That was not a good representation, and at the time the front end didn't support the bf16 type.
Now we introduced __bf16 to X86 psABI. So we can switch intrinsics to the new type.

Ok for trunk?

Thanks,
Lingling

gcc/ChangeLog:

	* config/i386/avx512bf16intrin.h (__attribute__): Change short to bf16.
	(_mm_cvtsbh_ss): Ditto.
	(_mm512_cvtne2ps_pbh): Ditto.
	(_mm512_mask_cvtne2ps_pbh): Ditto.
	(_mm512_maskz_cvtne2ps_pbh): Ditto.
	* config/i386/avx512bf16vlintrin.h (__attribute__): Ditto.
	(_mm256_cvtne2ps_pbh): Ditto.
	(_mm256_mask_cvtne2ps_pbh): Ditto.
	(_mm256_maskz_cvtne2ps_pbh): Ditto.
	(_mm_cvtne2ps_pbh): Ditto.
	(_mm_mask_cvtne2ps_pbh): Ditto.
	(_mm_maskz_cvtne2ps_pbh): Ditto.
	(_mm_cvtness_sbh): Ditto.
	* config/i386/i386-builtin-types.def (V8BF): Add new
	DEF_VECTOR_TYPE for BFmode.
	(V16BF): Ditto.
	(V32BF): Ditto.
	* config/i386/i386-builtin.def (BDESC): Fix builtins.
	* config/i386/i386-expand.cc (ix86_expand_args_builtin): Change
	avx512bf16 ix86_builtin_func_type included HI to BF.
	* config/i386/immintrin.h: Add SSE2 depend for avx512bf16.
	* config/i386/sse.md (TARGET_AVX512VL): Change HI vector to BF
	vector.
	(avx512f_cvtneps2bf16_v4sf): New define_expand.
	(*avx512f_cvtneps2bf16_v4sf): New define_insn.
	(avx512f_cvtneps2bf16_v4sf_maskz): Ditto.
	(avx512f_cvtneps2bf16_v4sf_mask): Ditto.
	(avx512f_cvtneps2bf16_v4sf_mask_1): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx512bf16-cvtsbh2ss-1.c: Add fpmath option.
	* gcc.target/i386/avx512bf16-vdpbf16ps-2.c: Fix
	scan-assembler.
	* gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c: Add x/y suffix
	for vcvtneps2bf16.
	* gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c: Ditto.
---
 gcc/config/i386/avx512bf16intrin.h            |  12 +--
 gcc/config/i386/avx512bf16vlintrin.h          |  29 ++---
 gcc/config/i386/i386-builtin-types.def        |  51 ++++-----
 gcc/config/i386/i386-builtin.def              |  54 +++++-----
 gcc/config/i386/i386-expand.cc                |  48 ++++-----
 gcc/config/i386/immintrin.h                   |   2 +
 gcc/config/i386/sse.md                        | 101 ++++++++++++++----
 .../gcc.target/i386/avx512bf16-cvtsbh2ss-1.c  |   2 +-
 .../gcc.target/i386/avx512bf16-vdpbf16ps-2.c  |   2 +-
 .../i386/avx512bf16vl-cvtness2sbh-1.c         |   2 +-
 .../i386/avx512bf16vl-vcvtneps2bf16-1.c       |  12 +--
 11 files changed, 189 insertions(+), 126 deletions(-)

diff --git a/gcc/config/i386/avx512bf16intrin.h b/gcc/config/i386/avx512bf16intrin.h
index b6e9ddad157..ea1d0125b3f 100644
--- a/gcc/config/i386/avx512bf16intrin.h
+++ b/gcc/config/i386/avx512bf16intrin.h
@@ -35,16 +35,16 @@
 #endif /* __AVX512BF16__ */
 
 /* Internal data types for implementing the intrinsics.  */
-typedef short __v32bh __attribute__ ((__vector_size__ (64)));
+typedef __bf16 __v32bf __attribute__ ((__vector_size__ (64)));
 
 /* The Intel API is flexible enough that we must allow aliasing with other
    vector types, and their scalar components.  */
-typedef short __m512bh __attribute__ ((__vector_size__ (64), __may_alias__));
+typedef __bf16 __m512bh __attribute__ ((__vector_size__ (64), __may_alias__));
 
 /* Convert One BF16 Data to One Single Float Data.  */
 extern __inline float
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsbh_ss (__bfloat16 __A)
+_mm_cvtsbh_ss (__bf16 __A)
 {
   union{ float a; unsigned int b;} __tmp;
   __tmp.b = ((unsigned int)(__A)) << 16;
@@ -57,21 +57,21 @@ extern __inline __m512bh
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_cvtne2ps_pbh (__m512 __A, __m512 __B)
 {
-  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi(__A, __B);
+  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf(__A, __B);
 }
 
 extern __inline __m512bh
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_cvtne2ps_pbh (__m512bh __A, __mmask32 __B, __m512 __C, __m512 __D)
 {
-  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_mask(__C, __D, __A, __B);
+  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf_mask(__C, __D, __A, __B);
 }
 
 extern __inline __m512bh
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_maskz_cvtne2ps_pbh (__mmask32 __A, __m512 __B, __m512 __C)
 {
-  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_maskz(__B, __C, __A);
+  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf_maskz(__B, __C, __A);
 }
 
 /* vcvtneps2bf16 */
diff --git a/gcc/config/i386/avx512bf16vlintrin.h b/gcc/config/i386/avx512bf16vlintrin.h
index 969335ff358..56c28f14cf6 100644
--- a/gcc/config/i386/avx512bf16vlintrin.h
+++ b/gcc/config/i386/avx512bf16vlintrin.h
@@ -35,57 +35,58 @@
 #endif /* __AVX512BF16__ */
 
 /* Internal data types for implementing the intrinsics.  */
-typedef short __v16bh __attribute__ ((__vector_size__ (32)));
-typedef short __v8bh __attribute__ ((__vector_size__ (16)));
+typedef __bf16 __v16bf __attribute__ ((__vector_size__ (32)));
+typedef __bf16 __v8bf __attribute__ ((__vector_size__ (16)));
 
 /* The Intel API is flexible enough that we must allow aliasing with other
    vector types, and their scalar components.  */
-typedef short __m256bh __attribute__ ((__vector_size__ (32), __may_alias__));
-typedef short __m128bh __attribute__ ((__vector_size__ (16), __may_alias__));
+typedef __bf16 __m256bh __attribute__ ((__vector_size__ (32), __may_alias__));
+typedef __bf16 __m128bh __attribute__ ((__vector_size__ (16), __may_alias__));
+
+typedef __bf16 __bfloat16;
 
-typedef unsigned short __bfloat16;
 /* vcvtne2ps2bf16 */
 
 extern __inline __m256bh
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_cvtne2ps_pbh (__m256 __A, __m256 __B)
 {
-  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi(__A, __B);
+  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf(__A, __B);
 }
 
 extern __inline __m256bh
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_mask_cvtne2ps_pbh (__m256bh __A, __mmask16 __B, __m256 __C, __m256 __D)
 {
-  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_mask(__C, __D, __A, __B);
+  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf_mask(__C, __D, __A, __B);
 }
 
 extern __inline __m256bh
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_maskz_cvtne2ps_pbh (__mmask16 __A, __m256 __B, __m256 __C)
 {
-  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_maskz(__B, __C, __A);
+  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf_maskz(__B, __C, __A);
 }
 
 extern __inline __m128bh
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtne2ps_pbh (__m128 __A, __m128 __B)
 {
-  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi(__A, __B);
+  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf(__A, __B);
 }
 
 extern __inline __m128bh
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mask_cvtne2ps_pbh (__m128bh __A, __mmask8 __B, __m128 __C, __m128 __D)
 {
-  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_mask(__C, __D, __A, __B);
+  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf_mask(__C, __D, __A, __B);
 }
 
 extern __inline __m128bh
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_maskz_cvtne2ps_pbh (__mmask8 __A, __m128 __B, __m128 __C)
 {
-  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_maskz(__B, __C, __A);
+  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf_maskz(__B, __C, __A);
 }
 
 /* vcvtneps2bf16 */
@@ -176,13 +177,13 @@ _mm_maskz_dpbf16_ps (__mmask8 __A, __m128 __B, __m128bh __C, __m128bh __D)
   return (__m128)__builtin_ia32_dpbf16ps_v4sf_maskz(__B, __C, __D, __A);
 }
 
-extern __inline __bfloat16
+extern __inline __bf16
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtness_sbh (float __A)
 {
   __v4sf __V = {__A, 0, 0, 0};
-  __v8hi __R = __builtin_ia32_cvtneps2bf16_v4sf_mask ((__v4sf)__V,
-	       (__v8hi)_mm_undefined_si128 (), (__mmask8)-1);
+  __v8bf __R = __builtin_ia32_cvtneps2bf16_v4sf_mask ((__v4sf)__V,
+	       (__v8bf)_mm_undefined_si128 (), (__mmask8)-1);
   return __R[0];
 }
 
diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
index 63a360b0f8b..aedae2d7750 100644
--- a/gcc/config/i386/i386-builtin-types.def
+++ b/gcc/config/i386/i386-builtin-types.def
@@ -87,6 +87,7 @@ DEF_VECTOR_TYPE (V8QI, QI)
 DEF_VECTOR_TYPE (V2DF, DOUBLE)
 DEF_VECTOR_TYPE (V4SF, FLOAT)
 DEF_VECTOR_TYPE (V8HF, FLOAT16)
+DEF_VECTOR_TYPE (V8BF, BFLOAT16)
 DEF_VECTOR_TYPE (V2DI, DI)
 DEF_VECTOR_TYPE (V4SI, SI)
 DEF_VECTOR_TYPE (V8HI, HI)
@@ -100,6 +101,7 @@ DEF_VECTOR_TYPE (V16UQI, UQI, V16QI)
 DEF_VECTOR_TYPE (V4DF, DOUBLE)
 DEF_VECTOR_TYPE (V8SF, FLOAT)
 DEF_VECTOR_TYPE (V16HF, FLOAT16)
+DEF_VECTOR_TYPE (V16BF, BFLOAT16)
 DEF_VECTOR_TYPE (V4DI, DI)
 DEF_VECTOR_TYPE (V8SI, SI)
 DEF_VECTOR_TYPE (V16HI, HI)
@@ -111,6 +113,7 @@ DEF_VECTOR_TYPE (V16UHI, UHI, V16HI)
 # AVX512F vectors
 DEF_VECTOR_TYPE (V32SF, FLOAT)
 DEF_VECTOR_TYPE (V32HF, FLOAT16)
+DEF_VECTOR_TYPE (V32BF, BFLOAT16)
 DEF_VECTOR_TYPE (V16SF, FLOAT)
 DEF_VECTOR_TYPE (V8DF, DOUBLE)
 DEF_VECTOR_TYPE (V8DI, DI)
@@ -1273,30 +1276,30 @@ DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI, UHI)
 DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI, UHI)
 
 # BF16 builtins
-DEF_FUNCTION_TYPE (V32HI, V16SF, V16SF)
-DEF_FUNCTION_TYPE (V32HI, V16SF, V16SF, V32HI, USI)
-DEF_FUNCTION_TYPE (V32HI, V16SF, V16SF, USI)
-DEF_FUNCTION_TYPE (V16HI, V8SF, V8SF)
-DEF_FUNCTION_TYPE (V16HI, V8SF, V8SF, V16HI, UHI)
-DEF_FUNCTION_TYPE (V16HI, V8SF, V8SF, UHI)
-DEF_FUNCTION_TYPE (V8HI, V4SF, V4SF)
-DEF_FUNCTION_TYPE (V8HI, V4SF, V4SF, V8HI, UQI)
-DEF_FUNCTION_TYPE (V8HI, V4SF, V4SF, UQI)
-DEF_FUNCTION_TYPE (V16HI, V16SF)
-DEF_FUNCTION_TYPE (V16HI, V16SF, V16HI, UHI)
-DEF_FUNCTION_TYPE (V16HI, V16SF, UHI)
-DEF_FUNCTION_TYPE (V8HI, V8SF)
-DEF_FUNCTION_TYPE (V8HI, V8SF, V8HI, UQI)
-DEF_FUNCTION_TYPE (V8HI, V8SF, UQI)
-DEF_FUNCTION_TYPE (V8HI, V4SF)
-DEF_FUNCTION_TYPE (V8HI, V4SF, V8HI, UQI)
-DEF_FUNCTION_TYPE (V8HI, V4SF, UQI)
-DEF_FUNCTION_TYPE (V16SF, V16SF, V32HI, V32HI)
-DEF_FUNCTION_TYPE (V16SF, V16SF, V32HI, V32HI, UHI)
-DEF_FUNCTION_TYPE (V8SF, V8SF, V16HI, V16HI)
-DEF_FUNCTION_TYPE (V8SF, V8SF, V16HI, V16HI, UQI)
-DEF_FUNCTION_TYPE (V4SF, V4SF, V8HI, V8HI)
-DEF_FUNCTION_TYPE (V4SF, V4SF, V8HI, V8HI, UQI)
+DEF_FUNCTION_TYPE (V32BF, V16SF, V16SF)
+DEF_FUNCTION_TYPE (V32BF, V16SF, V16SF, V32BF, USI)
+DEF_FUNCTION_TYPE (V32BF, V16SF, V16SF, USI)
+DEF_FUNCTION_TYPE (V16BF, V8SF, V8SF)
+DEF_FUNCTION_TYPE (V16BF, V8SF, V8SF, V16BF, UHI)
+DEF_FUNCTION_TYPE (V16BF, V8SF, V8SF, UHI)
+DEF_FUNCTION_TYPE (V8BF, V4SF, V4SF)
+DEF_FUNCTION_TYPE (V8BF, V4SF, V4SF, V8BF, UQI)
+DEF_FUNCTION_TYPE (V8BF, V4SF, V4SF, UQI)
+DEF_FUNCTION_TYPE (V16BF, V16SF)
+DEF_FUNCTION_TYPE (V16BF, V16SF, V16BF, UHI)
+DEF_FUNCTION_TYPE (V16BF, V16SF, UHI)
+DEF_FUNCTION_TYPE (V8BF, V8SF)
+DEF_FUNCTION_TYPE (V8BF, V8SF, V8BF, UQI)
+DEF_FUNCTION_TYPE (V8BF, V8SF, UQI)
+DEF_FUNCTION_TYPE (V8BF, V4SF)
+DEF_FUNCTION_TYPE (V8BF, V4SF, V8BF, UQI)
+DEF_FUNCTION_TYPE (V8BF, V4SF, UQI)
+DEF_FUNCTION_TYPE (V16SF, V16SF, V32BF, V32BF)
+DEF_FUNCTION_TYPE (V16SF, V16SF, V32BF, V32BF, UHI)
+DEF_FUNCTION_TYPE (V8SF, V8SF, V16BF, V16BF)
+DEF_FUNCTION_TYPE (V8SF, V8SF, V16BF, V16BF, UQI)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V8BF, V8BF)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V8BF, V8BF, UQI)
 
 # KEYLOCKER builtins
 DEF_FUNCTION_TYPE (UINT, UINT, V2DI, V2DI, PVOID)
diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index e35306e27d0..5802e2049a8 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -2779,33 +2779,33 @@ BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v32qi, "__builtin_ia32_vae
 BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v64qi, "__builtin_ia32_vaesenclast_v64qi", IX86_BUILTIN_VAESENCLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI)
 
 /* BF16 */
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi, "__builtin_ia32_cvtne2ps2bf16_v32hi", IX86_BUILTIN_CVTNE2PS2HI16_V32HI, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi_mask, "__builtin_ia32_cvtne2ps2bf16_v32hi_mask", IX86_BUILTIN_CVTNE2PS2HI16_V32HI_MASK, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF_V32HI_USI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi_maskz, "__builtin_ia32_cvtne2ps2bf16_v32hi_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V32HI_MASKZ, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF_USI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16hi, "__builtin_ia32_cvtne2ps2bf16_v16hi", IX86_BUILTIN_CVTNE2PS2HI16_V16HI, UNKNOWN, (int) V16HI_FTYPE_V8SF_V8SF)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16hi_mask, "__builtin_ia32_cvtne2ps2bf16_v16hi_mask", IX86_BUILTIN_CVTNE2PS2HI16_V16HI_MASK, UNKNOWN, (int) V16HI_FTYPE_V8SF_V8SF_V16HI_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16hi_maskz, "__builtin_ia32_cvtne2ps2bf16_v16hi_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V16HI_MASKZ, UNKNOWN, (int) V16HI_FTYPE_V8SF_V8SF_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8hi, "__builtin_ia32_cvtne2ps2bf16_v8hi", IX86_BUILTIN_CVTNE2PS2HI16_V8HI, UNKNOWN, (int) V8HI_FTYPE_V4SF_V4SF)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8hi_mask, "__builtin_ia32_cvtne2ps2bf16_v8hi_mask", IX86_BUILTIN_CVTNE2PS2HI16_V8HI_MASK, UNKNOWN, (int) V8HI_FTYPE_V4SF_V4SF_V8HI_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8hi_maskz, "__builtin_ia32_cvtne2ps2bf16_v8hi_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V8HI_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V4SF_V4SF_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf, "__builtin_ia32_cvtneps2bf16_v16sf", IX86_BUILTIN_CVTNEPS2HI16_V16SF, UNKNOWN, (int) V16HI_FTYPE_V16SF)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_mask, "__builtin_ia32_cvtneps2bf16_v16sf_mask", IX86_BUILTIN_CVTNEPS2HI16_V16SF_MASK, UNKNOWN, (int) V16HI_FTYPE_V16SF_V16HI_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_maskz, "__builtin_ia32_cvtneps2bf16_v16sf_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V16SF_MASKZ, UNKNOWN, (int) V16HI_FTYPE_V16SF_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf, "__builtin_ia32_cvtneps2bf16_v8sf", IX86_BUILTIN_CVTNEPS2HI16_V8SF, UNKNOWN, (int) V8HI_FTYPE_V8SF)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_mask, "__builtin_ia32_cvtneps2bf16_v8sf_mask", IX86_BUILTIN_CVTNEPS2HI16_V8SF_MASK, UNKNOWN, (int) V8HI_FTYPE_V8SF_V8HI_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_maskz, "__builtin_ia32_cvtneps2bf16_v8sf_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V8SF_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V8SF_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf, "__builtin_ia32_cvtneps2bf16_v4sf", IX86_BUILTIN_CVTNEPS2HI16_V4SF, UNKNOWN, (int) V8HI_FTYPE_V4SF)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_mask, "__builtin_ia32_cvtneps2bf16_v4sf_mask", IX86_BUILTIN_CVTNEPS2HI16_V4SF_MASK, UNKNOWN, (int) V8HI_FTYPE_V4SF_V8HI_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_maskz, "__builtin_ia32_cvtneps2bf16_v4sf_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V4SF_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V4SF_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf, "__builtin_ia32_dpbf16ps_v16sf", IX86_BUILTIN_DPHI16PS_V16SF, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32HI_V32HI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_mask, "__builtin_ia32_dpbf16ps_v16sf_mask", IX86_BUILTIN_DPHI16PS_V16SF_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32HI_V32HI_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_maskz, "__builtin_ia32_dpbf16ps_v16sf_maskz", IX86_BUILTIN_DPHI16PS_V16SF_MASKZ, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32HI_V32HI_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf, "__builtin_ia32_dpbf16ps_v8sf", IX86_BUILTIN_DPHI16PS_V8SF, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16HI_V16HI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_mask, "__builtin_ia32_dpbf16ps_v8sf_mask", IX86_BUILTIN_DPHI16PS_V8SF_MASK, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16HI_V16HI_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_maskz, "__builtin_ia32_dpbf16ps_v8sf_maskz", IX86_BUILTIN_DPHI16PS_V8SF_MASKZ, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16HI_V16HI_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf, "__builtin_ia32_dpbf16ps_v4sf", IX86_BUILTIN_DPHI16PS_V4SF, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_mask, "__builtin_ia32_dpbf16ps_v4sf_mask", IX86_BUILTIN_DPHI16PS_V4SF_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, "__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPHI16PS_V4SF_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32bf, "__builtin_ia32_cvtne2ps2bf16_v32bf", IX86_BUILTIN_CVTNE2PS2BF16_V32BF, UNKNOWN, (int) V32BF_FTYPE_V16SF_V16SF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32bf_mask, "__builtin_ia32_cvtne2ps2bf16_v32bf_mask", IX86_BUILTIN_CVTNE2PS2BF16_V32BF_MASK, UNKNOWN, (int) V32BF_FTYPE_V16SF_V16SF_V32BF_USI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32bf_maskz, "__builtin_ia32_cvtne2ps2bf16_v32bf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V32BF_MASKZ, UNKNOWN, (int) V32BF_FTYPE_V16SF_V16SF_USI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16bf, "__builtin_ia32_cvtne2ps2bf16_v16bf", IX86_BUILTIN_CVTNE2PS2BF16_V16BF, UNKNOWN, (int) V16BF_FTYPE_V8SF_V8SF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16bf_mask, "__builtin_ia32_cvtne2ps2bf16_v16bf_mask", IX86_BUILTIN_CVTNE2PS2BF16_V16BF_MASK, UNKNOWN, (int) V16BF_FTYPE_V8SF_V8SF_V16BF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16bf_maskz, "__builtin_ia32_cvtne2ps2bf16_v16bf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V16BF_MASKZ, UNKNOWN, (int) V16BF_FTYPE_V8SF_V8SF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8bf, "__builtin_ia32_cvtne2ps2bf16_v8bf", IX86_BUILTIN_CVTNE2PS2BF16_V8BF, UNKNOWN, (int) V8BF_FTYPE_V4SF_V4SF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8bf_mask, "__builtin_ia32_cvtne2ps2bf16_v8bf_mask", IX86_BUILTIN_CVTNE2PS2BF16_V8BF_MASK, UNKNOWN, (int) V8BF_FTYPE_V4SF_V4SF_V8BF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8bf_maskz, "__builtin_ia32_cvtne2ps2bf16_v8bf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V8BF_MASKZ, UNKNOWN, (int) V8BF_FTYPE_V4SF_V4SF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf, "__builtin_ia32_cvtneps2bf16_v16sf", IX86_BUILTIN_CVTNEPS2BF16_V16SF, UNKNOWN, (int) V16BF_FTYPE_V16SF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_mask, "__builtin_ia32_cvtneps2bf16_v16sf_mask", IX86_BUILTIN_CVTNEPS2BF16_V16SF_MASK, UNKNOWN, (int) V16BF_FTYPE_V16SF_V16BF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_maskz, "__builtin_ia32_cvtneps2bf16_v16sf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V16SF_MASKZ, UNKNOWN, (int) V16BF_FTYPE_V16SF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf, "__builtin_ia32_cvtneps2bf16_v8sf", IX86_BUILTIN_CVTNEPS2BF16_V8SF, UNKNOWN, (int) V8BF_FTYPE_V8SF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_mask, "__builtin_ia32_cvtneps2bf16_v8sf_mask", IX86_BUILTIN_CVTNEPS2BF16_V8SF_MASK, UNKNOWN, (int) V8BF_FTYPE_V8SF_V8BF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_maskz, "__builtin_ia32_cvtneps2bf16_v8sf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V8SF_MASKZ, UNKNOWN, (int) V8BF_FTYPE_V8SF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf, "__builtin_ia32_cvtneps2bf16_v4sf", IX86_BUILTIN_CVTNEPS2BF16_V4SF, UNKNOWN, (int) V8BF_FTYPE_V4SF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_mask, "__builtin_ia32_cvtneps2bf16_v4sf_mask", IX86_BUILTIN_CVTNEPS2BF16_V4SF_MASK, UNKNOWN, (int) V8BF_FTYPE_V4SF_V8BF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_maskz, "__builtin_ia32_cvtneps2bf16_v4sf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V4SF_MASKZ, UNKNOWN, (int) V8BF_FTYPE_V4SF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf, "__builtin_ia32_dpbf16ps_v16sf", IX86_BUILTIN_DPBF16PS_V16SF, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32BF_V32BF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_mask, "__builtin_ia32_dpbf16ps_v16sf_mask", IX86_BUILTIN_DPBF16PS_V16SF_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32BF_V32BF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_maskz, "__builtin_ia32_dpbf16ps_v16sf_maskz", IX86_BUILTIN_DPBF16PS_V16SF_MASKZ, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32BF_V32BF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf, "__builtin_ia32_dpbf16ps_v8sf", IX86_BUILTIN_DPBF16PS_V8SF, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16BF_V16BF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_mask, "__builtin_ia32_dpbf16ps_v8sf_mask", IX86_BUILTIN_DPBF16PS_V8SF_MASK, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16BF_V16BF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_maskz, "__builtin_ia32_dpbf16ps_v8sf_maskz", IX86_BUILTIN_DPBF16PS_V8SF_MASKZ, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16BF_V16BF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf, "__builtin_ia32_dpbf16ps_v4sf", IX86_BUILTIN_DPBF16PS_V4SF, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8BF_V8BF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_mask, "__builtin_ia32_dpbf16ps_v4sf_mask", IX86_BUILTIN_DPBF16PS_V4SF_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8BF_V8BF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, "__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPBF16PS_V4SF_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8BF_V8BF_UQI)
 
 /* AVX512FP16.  */
 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv8hf3_mask, "__builtin_ia32_addph128_mask", IX86_BUILTIN_ADDPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 5d9e5a12f7e..8e1ef0b4c4a 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -10462,9 +10462,9 @@ ix86_expand_args_builtin (const struct builtin_description *d,
     case V8DF_FTYPE_V2DF:
     case V8DF_FTYPE_V8DF:
     case V4DI_FTYPE_V4DI:
-    case V16HI_FTYPE_V16SF:
-    case V8HI_FTYPE_V8SF:
-    case V8HI_FTYPE_V4SF:
+    case V16BF_FTYPE_V16SF:
+    case V8BF_FTYPE_V8SF:
+    case V8BF_FTYPE_V4SF:
       nargs = 1;
       break;
     case V4SF_FTYPE_V4SF_VEC_MERGE:
@@ -10592,12 +10592,12 @@ ix86_expand_args_builtin (const struct builtin_description *d,
     case USI_FTYPE_USI_USI:
     case UDI_FTYPE_UDI_UDI:
     case V16SI_FTYPE_V8DF_V8DF:
-    case V32HI_FTYPE_V16SF_V16SF:
-    case V16HI_FTYPE_V8SF_V8SF:
-    case V8HI_FTYPE_V4SF_V4SF:
-    case V16HI_FTYPE_V16SF_UHI:
-    case V8HI_FTYPE_V8SF_UQI:
-    case V8HI_FTYPE_V4SF_UQI:
+    case V32BF_FTYPE_V16SF_V16SF:
+    case V16BF_FTYPE_V8SF_V8SF:
+    case V8BF_FTYPE_V4SF_V4SF:
+    case V16BF_FTYPE_V16SF_UHI:
+    case V8BF_FTYPE_V8SF_UQI:
+    case V8BF_FTYPE_V4SF_UQI:
       nargs = 2;
       break;
     case V2DI_FTYPE_V2DI_INT_CONVERT:
@@ -10803,15 +10803,15 @@ ix86_expand_args_builtin (const struct builtin_description *d,
     case V16HI_FTYPE_V16HI_V16HI_V16HI:
     case V8SI_FTYPE_V8SI_V8SI_V8SI:
     case V8HI_FTYPE_V8HI_V8HI_V8HI:
-    case V32HI_FTYPE_V16SF_V16SF_USI:
-    case V16HI_FTYPE_V8SF_V8SF_UHI:
-    case V8HI_FTYPE_V4SF_V4SF_UQI:
-    case V16HI_FTYPE_V16SF_V16HI_UHI:
-    case V8HI_FTYPE_V8SF_V8HI_UQI:
-    case V8HI_FTYPE_V4SF_V8HI_UQI:
-    case V16SF_FTYPE_V16SF_V32HI_V32HI:
-    case V8SF_FTYPE_V8SF_V16HI_V16HI:
-    case V4SF_FTYPE_V4SF_V8HI_V8HI:
+    case V32BF_FTYPE_V16SF_V16SF_USI:
+    case V16BF_FTYPE_V8SF_V8SF_UHI:
+    case V8BF_FTYPE_V4SF_V4SF_UQI:
+    case V16BF_FTYPE_V16SF_V16BF_UHI:
+    case V8BF_FTYPE_V8SF_V8BF_UQI:
+    case V8BF_FTYPE_V4SF_V8BF_UQI:
+    case V16SF_FTYPE_V16SF_V32BF_V32BF:
+    case V8SF_FTYPE_V8SF_V16BF_V16BF:
+    case V4SF_FTYPE_V4SF_V8BF_V8BF:
       nargs = 3;
       break;
     case V32QI_FTYPE_V32QI_V32QI_INT:
@@ -10958,9 +10958,9 @@ ix86_expand_args_builtin (const struct builtin_description *d,
     case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
     case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
     case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
-    case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
-    case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
-    case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
+    case V32BF_FTYPE_V16SF_V16SF_V32BF_USI:
+    case V16BF_FTYPE_V8SF_V8SF_V16BF_UHI:
+    case V8BF_FTYPE_V4SF_V4SF_V8BF_UQI:
       nargs = 4;
       break;
     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
@@ -10998,9 +10998,9 @@ ix86_expand_args_builtin (const struct builtin_description *d,
       break;
     case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
     case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
-    case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
-    case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
-    case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
+    case V16SF_FTYPE_V16SF_V32BF_V32BF_UHI:
+    case V8SF_FTYPE_V8SF_V16BF_V16BF_UQI:
+    case V4SF_FTYPE_V4SF_V8BF_V8BF_UQI:
       nargs = 4;
       break;
     case UQI_FTYPE_V8DI_V8DI_INT_UQI:
diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h
index ddea249d09b..c62d50f1951 100644
--- a/gcc/config/i386/immintrin.h
+++ b/gcc/config/i386/immintrin.h
@@ -118,9 +118,11 @@
 
 #include <vpclmulqdqintrin.h>
 
+#ifdef __SSE2__
 #include <avx512bf16vlintrin.h>
 
 #include <avx512bf16intrin.h>
+#endif
 
 #include <amxtileintrin.h>
 
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index f4b5506703f..fba81a93c1a 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -187,8 +187,6 @@
   UNSPEC_VP2INTERSECT
 
   ;; For AVX512BF16 support
-  UNSPEC_VCVTNE2PS2BF16
-  UNSPEC_VCVTNEPS2BF16
   UNSPEC_VDPBF16PS
 
   ;; For AVX512FP16 suppport
@@ -28918,41 +28916,101 @@
   "vp2intersectd\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr ("prefix") ("evex"))])
 
-(define_mode_iterator BF16 [V32HI (V16HI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL")])
+(define_mode_iterator VF_AVX512BF16VL
+  [V32BF (V16BF "TARGET_AVX512VL") (V8BF "TARGET_AVX512VL")])
 ;; Converting from BF to SF
 (define_mode_attr bf16_cvt_2sf
-  [(V32HI  "V16SF") (V16HI  "V8SF") (V8HI  "V4SF")])
+  [(V32BF  "V16SF") (V16BF  "V8SF") (V8BF  "V4SF")])
 ;; Converting from SF to BF
 (define_mode_attr sf_cvt_bf16
-  [(V4SF  "V8HI") (V8SF  "V8HI") (V16SF  "V16HI")])
+  [(V8SF  "V8BF") (V16SF  "V16BF")])
 ;; Mapping from BF to SF
 (define_mode_attr sf_bf16
-  [(V4SF  "V8HI") (V8SF  "V16HI") (V16SF  "V32HI")])
+  [(V4SF  "V8BF") (V8SF  "V16BF") (V16SF  "V32BF")])
 
 (define_expand "avx512f_cvtne2ps2bf16_<mode>_maskz"
-  [(match_operand:BF16 0 "register_operand")
+  [(match_operand:VF_AVX512BF16VL 0 "register_operand")
    (match_operand:<bf16_cvt_2sf> 1 "register_operand")
-   (match_operand:<bf16_cvt_2sf> 2 "register_operand")
+   (match_operand:<bf16_cvt_2sf> 2 "nonimmediate_operand")
    (match_operand:<avx512fmaskmode> 3 "register_operand")]
   "TARGET_AVX512BF16"
 {
-  emit_insn (gen_avx512f_cvtne2ps2bf16_<mode>_mask(operands[0], operands[1],
-    operands[2], CONST0_RTX(<MODE>mode), operands[3]));
+  emit_insn (gen_avx512f_cvtne2ps2bf16_<mode>_mask(operands[0], operands[2],
+    operands[1], CONST0_RTX(<MODE>mode), operands[3]));
   DONE;
 })
 
 (define_insn "avx512f_cvtne2ps2bf16_<mode><mask_name>"
-  [(set (match_operand:BF16 0 "register_operand" "=v")
-	(unspec:BF16
-	  [(match_operand:<bf16_cvt_2sf> 1 "register_operand" "v")
-	   (match_operand:<bf16_cvt_2sf> 2 "register_operand" "v")]
-        UNSPEC_VCVTNE2PS2BF16))]
+  [(set (match_operand:VF_AVX512BF16VL 0 "register_operand" "=v")
+	(vec_concat:VF_AVX512BF16VL
+	  (float_truncate:<ssehalfvecmode>
+	    (match_operand:<bf16_cvt_2sf> 2 "nonimmediate_operand" "vm"))
+	  (float_truncate:<ssehalfvecmode>
+	    (match_operand:<bf16_cvt_2sf> 1 "register_operand" "v"))))]
   "TARGET_AVX512BF16"
   "vcvtne2ps2bf16\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}")
 
+(define_expand "avx512f_cvtneps2bf16_v4sf"
+  [(set (match_operand:V8BF 0 "register_operand")
+	(vec_concat:V8BF
+	  (float_truncate:V4BF
+	    (match_operand:V4SF 1 "nonimmediate_operand"))
+	  (match_dup 2)))]
+  "TARGET_AVX512BF16 && TARGET_AVX512VL"
+  "operands[2] = CONST0_RTX (V4BFmode);")
+
+(define_insn "*avx512f_cvtneps2bf16_v4sf"
+  [(set (match_operand:V8BF 0 "register_operand" "=v")
+	(vec_concat:V8BF
+	  (float_truncate:V4BF
+	    (match_operand:V4SF 1 "nonimmediate_operand" "vm"))
+	  (match_operand:V4BF 2 "const0_operand")))]
+  "TARGET_AVX512BF16 && TARGET_AVX512VL"
+  "vcvtneps2bf16{x}\t{%1, %0|%0, %1}")
+
+(define_expand "avx512f_cvtneps2bf16_v4sf_maskz"
+  [(match_operand:V8BF 0 "register_operand")
+   (match_operand:V4SF 1 "nonimmediate_operand")
+   (match_operand:QI 2 "register_operand")]
+  "TARGET_AVX512BF16 && TARGET_AVX512VL"
+{
+  emit_insn (gen_avx512f_cvtneps2bf16_v4sf_mask_1(operands[0], operands[1],
+    CONST0_RTX(V8BFmode), operands[2], CONST0_RTX(V4BFmode)));
+  DONE;
+})
+
+(define_expand "avx512f_cvtneps2bf16_v4sf_mask"
+  [(match_operand:V8BF 0 "register_operand")
+   (match_operand:V4SF 1 "nonimmediate_operand")
+   (match_operand:V8BF 2 "nonimm_or_0_operand")
+   (match_operand:QI 3 "register_operand")]
+  "TARGET_AVX512BF16 && TARGET_AVX512VL"
+{
+  emit_insn (gen_avx512f_cvtneps2bf16_v4sf_mask_1(operands[0], operands[1],
+    operands[2], operands[3], CONST0_RTX(V4BFmode)));
+  DONE;
+})
+
+(define_insn "avx512f_cvtneps2bf16_v4sf_mask_1"
+  [(set (match_operand:V8BF 0 "register_operand" "=v")
+	(vec_concat:V8BF
+	  (vec_merge:V4BF
+	    (float_truncate:V4BF
+	      (match_operand:V4SF 1 "nonimmediate_operand" "vm"))
+	    (vec_select:V4BF
+	      (match_operand:V8BF 2 "nonimm_or_0_operand" "0C")
+	      (parallel [(const_int 0) (const_int 1)
+			 (const_int 2) (const_int 3)]))
+	    (match_operand:QI 3 "register_operand" "Yk"))
+	  (match_operand:V4BF 4 "const0_operand")))]
+  "TARGET_AVX512BF16 && TARGET_AVX512VL"
+  "vcvtneps2bf16{x}\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}")
+
+(define_mode_iterator VF1_AVX512_256 [V16SF (V8SF "TARGET_AVX512VL")])
+
 (define_expand "avx512f_cvtneps2bf16_<mode>_maskz"
   [(match_operand:<sf_cvt_bf16> 0 "register_operand")
-   (match_operand:VF1_AVX512VL 1 "register_operand")
+   (match_operand:VF1_AVX512_256 1 "nonimmediate_operand")
    (match_operand:<avx512fmaskmode> 2 "register_operand")]
   "TARGET_AVX512BF16"
 {
@@ -28963,11 +29021,10 @@
 
 (define_insn "avx512f_cvtneps2bf16_<mode><mask_name>"
   [(set (match_operand:<sf_cvt_bf16> 0 "register_operand" "=v")
-	(unspec:<sf_cvt_bf16>
-	  [(match_operand:VF1_AVX512VL 1 "register_operand" "v")]
-        UNSPEC_VCVTNEPS2BF16))]
+	(float_truncate:<sf_cvt_bf16>
+	  (match_operand:VF1_AVX512_256 1 "nonimmediate_operand" "vm")))]
   "TARGET_AVX512BF16"
-  "vcvtneps2bf16\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}")
+  "vcvtneps2bf16<qq2phsuff>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}")
 
 (define_expand "avx512f_dpbf16ps_<mode>_maskz"
   [(match_operand:VF1_AVX512VL 0 "register_operand")
@@ -28987,7 +29044,7 @@
 	(unspec:VF1_AVX512VL
 	  [(match_operand:VF1_AVX512VL 1 "register_operand" "0")
 	   (match_operand:<sf_bf16> 2 "register_operand" "v")
-	   (match_operand:<sf_bf16> 3 "register_operand" "v")]
+	   (match_operand:<sf_bf16> 3 "nonimmediate_operand" "vm")]
         UNSPEC_VDPBF16PS))]
   "TARGET_AVX512BF16"
   "vdpbf16ps\t{%3, %2, %0<maskz_half_operand4>|%0<maskz_half_operand4>, %2, %3}")
@@ -28998,7 +29055,7 @@
 	  (unspec:VF1_AVX512VL
 	    [(match_operand:VF1_AVX512VL 1 "register_operand" "0")
 	     (match_operand:<sf_bf16> 2 "register_operand" "v")
-	     (match_operand:<sf_bf16> 3 "register_operand" "v")]
+	     (match_operand:<sf_bf16> 3 "nonimmediate_operand" "vm")]
              UNSPEC_VDPBF16PS)
           (match_dup 1)
           (match_operand:<avx512fmaskhalfmode> 4 "register_operand" "Yk")))]
diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16-cvtsbh2ss-1.c b/gcc/testsuite/gcc.target/i386/avx512bf16-cvtsbh2ss-1.c
index 831abd37d80..8e929e6f159 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bf16-cvtsbh2ss-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bf16-cvtsbh2ss-1.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512bf16 -O2" } */
-/* { dg-additional-options "-fno-PIE" { target ia32 } } */
+/* { dg-additional-options "-fno-PIE -mfpmath=sse" { target ia32 } } */
 /* { dg-final { scan-assembler-times "sall\[ \\t\]+\[^\{\n\]*16" 1 } } */
 /* { dg-final { scan-assembler-times "movl" 1 } } */
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c b/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c
index b64ad7b84dd..02ebdd8cf5b 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512bf16 -O2" } */
-/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
 
 #include <immintrin.h>
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c b/gcc/testsuite/gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c
index 8f21b1bfdae..b71addd6301 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512bf16 -mavx512vl -O2" } */
-/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16x\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 
 #include <immintrin.h>
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c b/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c
index 0969ae1b35e..d3a9bdf8c34 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c
@@ -1,11 +1,11 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512bf16 -mavx512vl -O2" } */
-/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16y\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16y\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16y\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16x\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16x\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16x\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
 
 #include <immintrin.h>
 
-- 
2.27.0


^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH] i386: using __bf16 for AVX512BF16 intrinsics
  2022-10-28  6:20 ` [PATCH] i386: using __bf16 for AVX512BF16 intrinsics Kong, Lingling
@ 2022-10-28  6:29   ` Hongtao Liu
  0 siblings, 0 replies; 2+ messages in thread
From: Hongtao Liu @ 2022-10-28  6:29 UTC (permalink / raw)
  To: Kong, Lingling; +Cc: Liu, Hongtao, gcc-patches

On Fri, Oct 28, 2022 at 2:20 PM Kong, Lingling via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi,
>
> Previously we used unsigned short to represent bf16. That was not a good representation, and at the time the front end didn't support the bf16 type.
> Now we introduced __bf16 to X86 psABI. So we can switch intrinsics to the new type.
>
> Ok for trunk ?
LGTM, but please don't commit it until next week to leave some time
for others to take a look.
Also, please update the GCC 13 release-notes documentation for it:
https://gcc.gnu.org/gcc-13/changes.html.
>
> Thanks,
> Lingling
>
> gcc/ChangeLog:
>
>         * config/i386/avx512bf16intrin.h (__attribute__): Change short to bf16.
>         (_mm_cvtsbh_ss): Ditto.
>         (_mm512_cvtne2ps_pbh): Ditto.
>         (_mm512_mask_cvtne2ps_pbh): Ditto.
>         (_mm512_maskz_cvtne2ps_pbh): Ditto.
>         * config/i386/avx512bf16vlintrin.h (__attribute__): Ditto.
>         (_mm256_cvtne2ps_pbh): Ditto.
>         (_mm256_mask_cvtne2ps_pbh): Ditto.
>         (_mm256_maskz_cvtne2ps_pbh): Ditto.
>         (_mm_cvtne2ps_pbh): Ditto.
>         (_mm_mask_cvtne2ps_pbh): Ditto.
>         (_mm_maskz_cvtne2ps_pbh): Ditto.
>         (_mm_cvtness_sbh): Ditto.
>         * config/i386/i386-builtin-types.def (V8BF): Add new
>         DEF_VECTOR_TYPE for BFmode.
>         (V16BF): Ditto.
>         (V32BF): Ditto.
>         * config/i386/i386-builtin.def (BDESC): Fixed builtins.
>         * config/i386/i386-expand.cc (ix86_expand_args_builtin): Changed
>         avx512bf16 ix86_builtin_func_type included HI to BF.
>         * config/i386/immintrin.h: Add SSE2 depend for avx512bf16.
>         * config/i386/sse.md (TARGET_AVX512VL): Changed HI vector to BF
>         vector.
>         (avx512f_cvtneps2bf16_v4sf): New define_expand.
>         (*avx512f_cvtneps2bf16_v4sf): New define_insn.
> >         (avx512f_cvtneps2bf16_v4sf_maskz): Ditto.
>         (avx512f_cvtneps2bf16_v4sf_mask): Ditto.
>         (avx512f_cvtneps2bf16_v4sf_mask_1): Ditto.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/avx512bf16-cvtsbh2ss-1.c: Add fpmath option.
>         * gcc.target/i386/avx512bf16-vdpbf16ps-2.c: Fixed
>         scan-assembler.
>         * gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c: Add x/y suffix
>         for vcvtneps2bf16.
>         * gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c: Ditto.
> ---
>  gcc/config/i386/avx512bf16intrin.h            |  12 +--
>  gcc/config/i386/avx512bf16vlintrin.h          |  29 ++---
>  gcc/config/i386/i386-builtin-types.def        |  51 ++++-----
>  gcc/config/i386/i386-builtin.def              |  54 +++++-----
>  gcc/config/i386/i386-expand.cc                |  48 ++++-----
>  gcc/config/i386/immintrin.h                   |   2 +
>  gcc/config/i386/sse.md                        | 101 ++++++++++++++----
>  .../gcc.target/i386/avx512bf16-cvtsbh2ss-1.c  |   2 +-
>  .../gcc.target/i386/avx512bf16-vdpbf16ps-2.c  |   2 +-
>  .../i386/avx512bf16vl-cvtness2sbh-1.c         |   2 +-
>  .../i386/avx512bf16vl-vcvtneps2bf16-1.c       |  12 +--
>  11 files changed, 189 insertions(+), 126 deletions(-)
>
> diff --git a/gcc/config/i386/avx512bf16intrin.h b/gcc/config/i386/avx512bf16intrin.h
> index b6e9ddad157..ea1d0125b3f 100644
> --- a/gcc/config/i386/avx512bf16intrin.h
> +++ b/gcc/config/i386/avx512bf16intrin.h
> @@ -35,16 +35,16 @@
>  #endif /* __AVX512BF16__ */
>
>  /* Internal data types for implementing the intrinsics.  */
> -typedef short __v32bh __attribute__ ((__vector_size__ (64)));
> +typedef __bf16 __v32bf __attribute__ ((__vector_size__ (64)));
>
>  /* The Intel API is flexible enough that we must allow aliasing with other
>     vector types, and their scalar components.  */
> -typedef short __m512bh __attribute__ ((__vector_size__ (64), __may_alias__));
> +typedef __bf16 __m512bh __attribute__ ((__vector_size__ (64), __may_alias__));
>
>  /* Convert One BF16 Data to One Single Float Data.  */
>  extern __inline float
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_cvtsbh_ss (__bfloat16 __A)
> +_mm_cvtsbh_ss (__bf16 __A)
>  {
>    union{ float a; unsigned int b;} __tmp;
>    __tmp.b = ((unsigned int)(__A)) << 16;
> @@ -57,21 +57,21 @@ extern __inline __m512bh
>  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>  _mm512_cvtne2ps_pbh (__m512 __A, __m512 __B)
>  {
> -  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi(__A, __B);
> +  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf(__A, __B);
>  }
>
>  extern __inline __m512bh
>  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>  _mm512_mask_cvtne2ps_pbh (__m512bh __A, __mmask32 __B, __m512 __C, __m512 __D)
>  {
> -  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_mask(__C, __D, __A, __B);
> +  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf_mask(__C, __D, __A, __B);
>  }
>
>  extern __inline __m512bh
>  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>  _mm512_maskz_cvtne2ps_pbh (__mmask32 __A, __m512 __B, __m512 __C)
>  {
> -  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_maskz(__B, __C, __A);
> +  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf_maskz(__B, __C, __A);
>  }
>
>  /* vcvtneps2bf16 */
> diff --git a/gcc/config/i386/avx512bf16vlintrin.h b/gcc/config/i386/avx512bf16vlintrin.h
> index 969335ff358..56c28f14cf6 100644
> --- a/gcc/config/i386/avx512bf16vlintrin.h
> +++ b/gcc/config/i386/avx512bf16vlintrin.h
> @@ -35,57 +35,58 @@
>  #endif /* __AVX512BF16__ */
>
>  /* Internal data types for implementing the intrinsics.  */
> -typedef short __v16bh __attribute__ ((__vector_size__ (32)));
> -typedef short __v8bh __attribute__ ((__vector_size__ (16)));
> +typedef __bf16 __v16bf __attribute__ ((__vector_size__ (32)));
> +typedef __bf16 __v8bf __attribute__ ((__vector_size__ (16)));
>
>  /* The Intel API is flexible enough that we must allow aliasing with other
>     vector types, and their scalar components.  */
> -typedef short __m256bh __attribute__ ((__vector_size__ (32), __may_alias__));
> -typedef short __m128bh __attribute__ ((__vector_size__ (16), __may_alias__));
> +typedef __bf16 __m256bh __attribute__ ((__vector_size__ (32), __may_alias__));
> +typedef __bf16 __m128bh __attribute__ ((__vector_size__ (16), __may_alias__));
> +
> +typedef __bf16 __bfloat16;
>
> -typedef unsigned short __bfloat16;
>  /* vcvtne2ps2bf16 */
>
>  extern __inline __m256bh
>  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>  _mm256_cvtne2ps_pbh (__m256 __A, __m256 __B)
>  {
> -  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi(__A, __B);
> +  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf(__A, __B);
>  }
>
>  extern __inline __m256bh
>  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>  _mm256_mask_cvtne2ps_pbh (__m256bh __A, __mmask16 __B, __m256 __C, __m256 __D)
>  {
> -  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_mask(__C, __D, __A, __B);
> +  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf_mask(__C, __D, __A, __B);
>  }
>
>  extern __inline __m256bh
>  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>  _mm256_maskz_cvtne2ps_pbh (__mmask16 __A, __m256 __B, __m256 __C)
>  {
> -  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_maskz(__B, __C, __A);
> +  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf_maskz(__B, __C, __A);
>  }
>
>  extern __inline __m128bh
>  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_cvtne2ps_pbh (__m128 __A, __m128 __B)
>  {
> -  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi(__A, __B);
> +  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf(__A, __B);
>  }
>
>  extern __inline __m128bh
>  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_mask_cvtne2ps_pbh (__m128bh __A, __mmask8 __B, __m128 __C, __m128 __D)
>  {
> -  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_mask(__C, __D, __A, __B);
> +  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf_mask(__C, __D, __A, __B);
>  }
>
>  extern __inline __m128bh
>  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_maskz_cvtne2ps_pbh (__mmask8 __A, __m128 __B, __m128 __C)
>  {
> -  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_maskz(__B, __C, __A);
> +  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf_maskz(__B, __C, __A);
>  }
>
>  /* vcvtneps2bf16 */
> @@ -176,13 +177,13 @@ _mm_maskz_dpbf16_ps (__mmask8 __A, __m128 __B, __m128bh __C, __m128bh __D)
>    return (__m128)__builtin_ia32_dpbf16ps_v4sf_maskz(__B, __C, __D, __A);
>  }
>
> -extern __inline __bfloat16
> +extern __inline __bf16
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_cvtness_sbh (float __A)
>  {
>    __v4sf __V = {__A, 0, 0, 0};
> -  __v8hi __R = __builtin_ia32_cvtneps2bf16_v4sf_mask ((__v4sf)__V,
> -              (__v8hi)_mm_undefined_si128 (), (__mmask8)-1);
> +  __v8bf __R = __builtin_ia32_cvtneps2bf16_v4sf_mask ((__v4sf)__V,
> +              (__v8bf)_mm_undefined_si128 (), (__mmask8)-1);
>    return __R[0];
>  }
>
> diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
> index 63a360b0f8b..aedae2d7750 100644
> --- a/gcc/config/i386/i386-builtin-types.def
> +++ b/gcc/config/i386/i386-builtin-types.def
> @@ -87,6 +87,7 @@ DEF_VECTOR_TYPE (V8QI, QI)
>  DEF_VECTOR_TYPE (V2DF, DOUBLE)
>  DEF_VECTOR_TYPE (V4SF, FLOAT)
>  DEF_VECTOR_TYPE (V8HF, FLOAT16)
> +DEF_VECTOR_TYPE (V8BF, BFLOAT16)
>  DEF_VECTOR_TYPE (V2DI, DI)
>  DEF_VECTOR_TYPE (V4SI, SI)
>  DEF_VECTOR_TYPE (V8HI, HI)
> @@ -100,6 +101,7 @@ DEF_VECTOR_TYPE (V16UQI, UQI, V16QI)
>  DEF_VECTOR_TYPE (V4DF, DOUBLE)
>  DEF_VECTOR_TYPE (V8SF, FLOAT)
>  DEF_VECTOR_TYPE (V16HF, FLOAT16)
> +DEF_VECTOR_TYPE (V16BF, BFLOAT16)
>  DEF_VECTOR_TYPE (V4DI, DI)
>  DEF_VECTOR_TYPE (V8SI, SI)
>  DEF_VECTOR_TYPE (V16HI, HI)
> @@ -111,6 +113,7 @@ DEF_VECTOR_TYPE (V16UHI, UHI, V16HI)
>  # AVX512F vectors
>  DEF_VECTOR_TYPE (V32SF, FLOAT)
>  DEF_VECTOR_TYPE (V32HF, FLOAT16)
> +DEF_VECTOR_TYPE (V32BF, BFLOAT16)
>  DEF_VECTOR_TYPE (V16SF, FLOAT)
>  DEF_VECTOR_TYPE (V8DF, DOUBLE)
>  DEF_VECTOR_TYPE (V8DI, DI)
> @@ -1273,30 +1276,30 @@ DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI, UHI)
>  DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI, UHI)
>
>  # BF16 builtins
> -DEF_FUNCTION_TYPE (V32HI, V16SF, V16SF)
> -DEF_FUNCTION_TYPE (V32HI, V16SF, V16SF, V32HI, USI)
> -DEF_FUNCTION_TYPE (V32HI, V16SF, V16SF, USI)
> -DEF_FUNCTION_TYPE (V16HI, V8SF, V8SF)
> -DEF_FUNCTION_TYPE (V16HI, V8SF, V8SF, V16HI, UHI)
> -DEF_FUNCTION_TYPE (V16HI, V8SF, V8SF, UHI)
> -DEF_FUNCTION_TYPE (V8HI, V4SF, V4SF)
> -DEF_FUNCTION_TYPE (V8HI, V4SF, V4SF, V8HI, UQI)
> -DEF_FUNCTION_TYPE (V8HI, V4SF, V4SF, UQI)
> -DEF_FUNCTION_TYPE (V16HI, V16SF)
> -DEF_FUNCTION_TYPE (V16HI, V16SF, V16HI, UHI)
> -DEF_FUNCTION_TYPE (V16HI, V16SF, UHI)
> -DEF_FUNCTION_TYPE (V8HI, V8SF)
> -DEF_FUNCTION_TYPE (V8HI, V8SF, V8HI, UQI)
> -DEF_FUNCTION_TYPE (V8HI, V8SF, UQI)
> -DEF_FUNCTION_TYPE (V8HI, V4SF)
> -DEF_FUNCTION_TYPE (V8HI, V4SF, V8HI, UQI)
> -DEF_FUNCTION_TYPE (V8HI, V4SF, UQI)
> -DEF_FUNCTION_TYPE (V16SF, V16SF, V32HI, V32HI)
> -DEF_FUNCTION_TYPE (V16SF, V16SF, V32HI, V32HI, UHI)
> -DEF_FUNCTION_TYPE (V8SF, V8SF, V16HI, V16HI)
> -DEF_FUNCTION_TYPE (V8SF, V8SF, V16HI, V16HI, UQI)
> -DEF_FUNCTION_TYPE (V4SF, V4SF, V8HI, V8HI)
> -DEF_FUNCTION_TYPE (V4SF, V4SF, V8HI, V8HI, UQI)
> +DEF_FUNCTION_TYPE (V32BF, V16SF, V16SF)
> +DEF_FUNCTION_TYPE (V32BF, V16SF, V16SF, V32BF, USI)
> +DEF_FUNCTION_TYPE (V32BF, V16SF, V16SF, USI)
> +DEF_FUNCTION_TYPE (V16BF, V8SF, V8SF)
> +DEF_FUNCTION_TYPE (V16BF, V8SF, V8SF, V16BF, UHI)
> +DEF_FUNCTION_TYPE (V16BF, V8SF, V8SF, UHI)
> +DEF_FUNCTION_TYPE (V8BF, V4SF, V4SF)
> +DEF_FUNCTION_TYPE (V8BF, V4SF, V4SF, V8BF, UQI)
> +DEF_FUNCTION_TYPE (V8BF, V4SF, V4SF, UQI)
> +DEF_FUNCTION_TYPE (V16BF, V16SF)
> +DEF_FUNCTION_TYPE (V16BF, V16SF, V16BF, UHI)
> +DEF_FUNCTION_TYPE (V16BF, V16SF, UHI)
> +DEF_FUNCTION_TYPE (V8BF, V8SF)
> +DEF_FUNCTION_TYPE (V8BF, V8SF, V8BF, UQI)
> +DEF_FUNCTION_TYPE (V8BF, V8SF, UQI)
> +DEF_FUNCTION_TYPE (V8BF, V4SF)
> +DEF_FUNCTION_TYPE (V8BF, V4SF, V8BF, UQI)
> +DEF_FUNCTION_TYPE (V8BF, V4SF, UQI)
> +DEF_FUNCTION_TYPE (V16SF, V16SF, V32BF, V32BF)
> +DEF_FUNCTION_TYPE (V16SF, V16SF, V32BF, V32BF, UHI)
> +DEF_FUNCTION_TYPE (V8SF, V8SF, V16BF, V16BF)
> +DEF_FUNCTION_TYPE (V8SF, V8SF, V16BF, V16BF, UQI)
> +DEF_FUNCTION_TYPE (V4SF, V4SF, V8BF, V8BF)
> +DEF_FUNCTION_TYPE (V4SF, V4SF, V8BF, V8BF, UQI)
>
>  # KEYLOCKER builtins
>  DEF_FUNCTION_TYPE (UINT, UINT, V2DI, V2DI, PVOID)
> diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
> index e35306e27d0..5802e2049a8 100644
> --- a/gcc/config/i386/i386-builtin.def
> +++ b/gcc/config/i386/i386-builtin.def
> @@ -2779,33 +2779,33 @@ BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v32qi, "__builtin_ia32_vae
>  BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v64qi, "__builtin_ia32_vaesenclast_v64qi", IX86_BUILTIN_VAESENCLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI)
>
>  /* BF16 */
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi, "__builtin_ia32_cvtne2ps2bf16_v32hi", IX86_BUILTIN_CVTNE2PS2HI16_V32HI, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi_mask, "__builtin_ia32_cvtne2ps2bf16_v32hi_mask", IX86_BUILTIN_CVTNE2PS2HI16_V32HI_MASK, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF_V32HI_USI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi_maskz, "__builtin_ia32_cvtne2ps2bf16_v32hi_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V32HI_MASKZ, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF_USI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16hi, "__builtin_ia32_cvtne2ps2bf16_v16hi", IX86_BUILTIN_CVTNE2PS2HI16_V16HI, UNKNOWN, (int) V16HI_FTYPE_V8SF_V8SF)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16hi_mask, "__builtin_ia32_cvtne2ps2bf16_v16hi_mask", IX86_BUILTIN_CVTNE2PS2HI16_V16HI_MASK, UNKNOWN, (int) V16HI_FTYPE_V8SF_V8SF_V16HI_UHI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16hi_maskz, "__builtin_ia32_cvtne2ps2bf16_v16hi_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V16HI_MASKZ, UNKNOWN, (int) V16HI_FTYPE_V8SF_V8SF_UHI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8hi, "__builtin_ia32_cvtne2ps2bf16_v8hi", IX86_BUILTIN_CVTNE2PS2HI16_V8HI, UNKNOWN, (int) V8HI_FTYPE_V4SF_V4SF)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8hi_mask, "__builtin_ia32_cvtne2ps2bf16_v8hi_mask", IX86_BUILTIN_CVTNE2PS2HI16_V8HI_MASK, UNKNOWN, (int) V8HI_FTYPE_V4SF_V4SF_V8HI_UQI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8hi_maskz, "__builtin_ia32_cvtne2ps2bf16_v8hi_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V8HI_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V4SF_V4SF_UQI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf, "__builtin_ia32_cvtneps2bf16_v16sf", IX86_BUILTIN_CVTNEPS2HI16_V16SF, UNKNOWN, (int) V16HI_FTYPE_V16SF)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_mask, "__builtin_ia32_cvtneps2bf16_v16sf_mask", IX86_BUILTIN_CVTNEPS2HI16_V16SF_MASK, UNKNOWN, (int) V16HI_FTYPE_V16SF_V16HI_UHI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_maskz, "__builtin_ia32_cvtneps2bf16_v16sf_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V16SF_MASKZ, UNKNOWN, (int) V16HI_FTYPE_V16SF_UHI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf, "__builtin_ia32_cvtneps2bf16_v8sf", IX86_BUILTIN_CVTNEPS2HI16_V8SF, UNKNOWN, (int) V8HI_FTYPE_V8SF)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_mask, "__builtin_ia32_cvtneps2bf16_v8sf_mask", IX86_BUILTIN_CVTNEPS2HI16_V8SF_MASK, UNKNOWN, (int) V8HI_FTYPE_V8SF_V8HI_UQI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_maskz, "__builtin_ia32_cvtneps2bf16_v8sf_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V8SF_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V8SF_UQI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf, "__builtin_ia32_cvtneps2bf16_v4sf", IX86_BUILTIN_CVTNEPS2HI16_V4SF, UNKNOWN, (int) V8HI_FTYPE_V4SF)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_mask, "__builtin_ia32_cvtneps2bf16_v4sf_mask", IX86_BUILTIN_CVTNEPS2HI16_V4SF_MASK, UNKNOWN, (int) V8HI_FTYPE_V4SF_V8HI_UQI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_maskz, "__builtin_ia32_cvtneps2bf16_v4sf_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V4SF_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V4SF_UQI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf, "__builtin_ia32_dpbf16ps_v16sf", IX86_BUILTIN_DPHI16PS_V16SF, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32HI_V32HI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_mask, "__builtin_ia32_dpbf16ps_v16sf_mask", IX86_BUILTIN_DPHI16PS_V16SF_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32HI_V32HI_UHI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_maskz, "__builtin_ia32_dpbf16ps_v16sf_maskz", IX86_BUILTIN_DPHI16PS_V16SF_MASKZ, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32HI_V32HI_UHI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf, "__builtin_ia32_dpbf16ps_v8sf", IX86_BUILTIN_DPHI16PS_V8SF, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16HI_V16HI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_mask, "__builtin_ia32_dpbf16ps_v8sf_mask", IX86_BUILTIN_DPHI16PS_V8SF_MASK, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16HI_V16HI_UQI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_maskz, "__builtin_ia32_dpbf16ps_v8sf_maskz", IX86_BUILTIN_DPHI16PS_V8SF_MASKZ, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16HI_V16HI_UQI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf, "__builtin_ia32_dpbf16ps_v4sf", IX86_BUILTIN_DPHI16PS_V4SF, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_mask, "__builtin_ia32_dpbf16ps_v4sf_mask", IX86_BUILTIN_DPHI16PS_V4SF_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, "__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPHI16PS_V4SF_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32bf, "__builtin_ia32_cvtne2ps2bf16_v32bf", IX86_BUILTIN_CVTNE2PS2BF16_V32BF, UNKNOWN, (int) V32BF_FTYPE_V16SF_V16SF)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32bf_mask, "__builtin_ia32_cvtne2ps2bf16_v32bf_mask", IX86_BUILTIN_CVTNE2PS2BF16_V32BF_MASK, UNKNOWN, (int) V32BF_FTYPE_V16SF_V16SF_V32BF_USI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32bf_maskz, "__builtin_ia32_cvtne2ps2bf16_v32bf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V32BF_MASKZ, UNKNOWN, (int) V32BF_FTYPE_V16SF_V16SF_USI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16bf, "__builtin_ia32_cvtne2ps2bf16_v16bf", IX86_BUILTIN_CVTNE2PS2BF16_V16BF, UNKNOWN, (int) V16BF_FTYPE_V8SF_V8SF)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16bf_mask, "__builtin_ia32_cvtne2ps2bf16_v16bf_mask", IX86_BUILTIN_CVTNE2PS2BF16_V16BF_MASK, UNKNOWN, (int) V16BF_FTYPE_V8SF_V8SF_V16BF_UHI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16bf_maskz, "__builtin_ia32_cvtne2ps2bf16_v16bf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V16BF_MASKZ, UNKNOWN, (int) V16BF_FTYPE_V8SF_V8SF_UHI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8bf, "__builtin_ia32_cvtne2ps2bf16_v8bf", IX86_BUILTIN_CVTNE2PS2BF16_V8BF, UNKNOWN, (int) V8BF_FTYPE_V4SF_V4SF)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8bf_mask, "__builtin_ia32_cvtne2ps2bf16_v8bf_mask", IX86_BUILTIN_CVTNE2PS2BF16_V8BF_MASK, UNKNOWN, (int) V8BF_FTYPE_V4SF_V4SF_V8BF_UQI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8bf_maskz, "__builtin_ia32_cvtne2ps2bf16_v8bf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V8BF_MASKZ, UNKNOWN, (int) V8BF_FTYPE_V4SF_V4SF_UQI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf, "__builtin_ia32_cvtneps2bf16_v16sf", IX86_BUILTIN_CVTNEPS2BF16_V16SF, UNKNOWN, (int) V16BF_FTYPE_V16SF)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_mask, "__builtin_ia32_cvtneps2bf16_v16sf_mask", IX86_BUILTIN_CVTNEPS2BF16_V16SF_MASK, UNKNOWN, (int) V16BF_FTYPE_V16SF_V16BF_UHI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_maskz, "__builtin_ia32_cvtneps2bf16_v16sf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V16SF_MASKZ, UNKNOWN, (int) V16BF_FTYPE_V16SF_UHI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf, "__builtin_ia32_cvtneps2bf16_v8sf", IX86_BUILTIN_CVTNEPS2BF16_V8SF, UNKNOWN, (int) V8BF_FTYPE_V8SF)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_mask, "__builtin_ia32_cvtneps2bf16_v8sf_mask", IX86_BUILTIN_CVTNEPS2BF16_V8SF_MASK, UNKNOWN, (int) V8BF_FTYPE_V8SF_V8BF_UQI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_maskz, "__builtin_ia32_cvtneps2bf16_v8sf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V8SF_MASKZ, UNKNOWN, (int) V8BF_FTYPE_V8SF_UQI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf, "__builtin_ia32_cvtneps2bf16_v4sf", IX86_BUILTIN_CVTNEPS2BF16_V4SF, UNKNOWN, (int) V8BF_FTYPE_V4SF)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_mask, "__builtin_ia32_cvtneps2bf16_v4sf_mask", IX86_BUILTIN_CVTNEPS2BF16_V4SF_MASK, UNKNOWN, (int) V8BF_FTYPE_V4SF_V8BF_UQI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_maskz, "__builtin_ia32_cvtneps2bf16_v4sf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V4SF_MASKZ, UNKNOWN, (int) V8BF_FTYPE_V4SF_UQI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf, "__builtin_ia32_dpbf16ps_v16sf", IX86_BUILTIN_DPBF16PS_V16SF, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32BF_V32BF)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_mask, "__builtin_ia32_dpbf16ps_v16sf_mask", IX86_BUILTIN_DPBF16PS_V16SF_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32BF_V32BF_UHI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_maskz, "__builtin_ia32_dpbf16ps_v16sf_maskz", IX86_BUILTIN_DPBF16PS_V16SF_MASKZ, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32BF_V32BF_UHI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf, "__builtin_ia32_dpbf16ps_v8sf", IX86_BUILTIN_DPBF16PS_V8SF, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16BF_V16BF)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_mask, "__builtin_ia32_dpbf16ps_v8sf_mask", IX86_BUILTIN_DPBF16PS_V8SF_MASK, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16BF_V16BF_UQI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_maskz, "__builtin_ia32_dpbf16ps_v8sf_maskz", IX86_BUILTIN_DPBF16PS_V8SF_MASKZ, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16BF_V16BF_UQI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf, "__builtin_ia32_dpbf16ps_v4sf", IX86_BUILTIN_DPBF16PS_V4SF, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8BF_V8BF)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_mask, "__builtin_ia32_dpbf16ps_v4sf_mask", IX86_BUILTIN_DPBF16PS_V4SF_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8BF_V8BF_UQI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, "__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPBF16PS_V4SF_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8BF_V8BF_UQI)
>
>  /* AVX512FP16.  */
>  BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv8hf3_mask, "__builtin_ia32_addph128_mask", IX86_BUILTIN_ADDPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index 5d9e5a12f7e..8e1ef0b4c4a 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -10462,9 +10462,9 @@ ix86_expand_args_builtin (const struct builtin_description *d,
>      case V8DF_FTYPE_V2DF:
>      case V8DF_FTYPE_V8DF:
>      case V4DI_FTYPE_V4DI:
> -    case V16HI_FTYPE_V16SF:
> -    case V8HI_FTYPE_V8SF:
> -    case V8HI_FTYPE_V4SF:
> +    case V16BF_FTYPE_V16SF:
> +    case V8BF_FTYPE_V8SF:
> +    case V8BF_FTYPE_V4SF:
>        nargs = 1;
>        break;
>      case V4SF_FTYPE_V4SF_VEC_MERGE:
> @@ -10592,12 +10592,12 @@ ix86_expand_args_builtin (const struct builtin_description *d,
>      case USI_FTYPE_USI_USI:
>      case UDI_FTYPE_UDI_UDI:
>      case V16SI_FTYPE_V8DF_V8DF:
> -    case V32HI_FTYPE_V16SF_V16SF:
> -    case V16HI_FTYPE_V8SF_V8SF:
> -    case V8HI_FTYPE_V4SF_V4SF:
> -    case V16HI_FTYPE_V16SF_UHI:
> -    case V8HI_FTYPE_V8SF_UQI:
> -    case V8HI_FTYPE_V4SF_UQI:
> +    case V32BF_FTYPE_V16SF_V16SF:
> +    case V16BF_FTYPE_V8SF_V8SF:
> +    case V8BF_FTYPE_V4SF_V4SF:
> +    case V16BF_FTYPE_V16SF_UHI:
> +    case V8BF_FTYPE_V8SF_UQI:
> +    case V8BF_FTYPE_V4SF_UQI:
>        nargs = 2;
>        break;
>      case V2DI_FTYPE_V2DI_INT_CONVERT:
> @@ -10803,15 +10803,15 @@ ix86_expand_args_builtin (const struct builtin_description *d,
>      case V16HI_FTYPE_V16HI_V16HI_V16HI:
>      case V8SI_FTYPE_V8SI_V8SI_V8SI:
>      case V8HI_FTYPE_V8HI_V8HI_V8HI:
> -    case V32HI_FTYPE_V16SF_V16SF_USI:
> -    case V16HI_FTYPE_V8SF_V8SF_UHI:
> -    case V8HI_FTYPE_V4SF_V4SF_UQI:
> -    case V16HI_FTYPE_V16SF_V16HI_UHI:
> -    case V8HI_FTYPE_V8SF_V8HI_UQI:
> -    case V8HI_FTYPE_V4SF_V8HI_UQI:
> -    case V16SF_FTYPE_V16SF_V32HI_V32HI:
> -    case V8SF_FTYPE_V8SF_V16HI_V16HI:
> -    case V4SF_FTYPE_V4SF_V8HI_V8HI:
> +    case V32BF_FTYPE_V16SF_V16SF_USI:
> +    case V16BF_FTYPE_V8SF_V8SF_UHI:
> +    case V8BF_FTYPE_V4SF_V4SF_UQI:
> +    case V16BF_FTYPE_V16SF_V16BF_UHI:
> +    case V8BF_FTYPE_V8SF_V8BF_UQI:
> +    case V8BF_FTYPE_V4SF_V8BF_UQI:
> +    case V16SF_FTYPE_V16SF_V32BF_V32BF:
> +    case V8SF_FTYPE_V8SF_V16BF_V16BF:
> +    case V4SF_FTYPE_V4SF_V8BF_V8BF:
>        nargs = 3;
>        break;
>      case V32QI_FTYPE_V32QI_V32QI_INT:
> @@ -10958,9 +10958,9 @@ ix86_expand_args_builtin (const struct builtin_description *d,
>      case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
>      case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
>      case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
> -    case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
> -    case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
> -    case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
> +    case V32BF_FTYPE_V16SF_V16SF_V32BF_USI:
> +    case V16BF_FTYPE_V8SF_V8SF_V16BF_UHI:
> +    case V8BF_FTYPE_V4SF_V4SF_V8BF_UQI:
>        nargs = 4;
>        break;
>      case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
> @@ -10998,9 +10998,9 @@ ix86_expand_args_builtin (const struct builtin_description *d,
>        break;
>      case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
>      case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
> -    case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
> -    case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
> -    case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
> +    case V16SF_FTYPE_V16SF_V32BF_V32BF_UHI:
> +    case V8SF_FTYPE_V8SF_V16BF_V16BF_UQI:
> +    case V4SF_FTYPE_V4SF_V8BF_V8BF_UQI:
>        nargs = 4;
>        break;
>      case UQI_FTYPE_V8DI_V8DI_INT_UQI:
> diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h
> index ddea249d09b..c62d50f1951 100644
> --- a/gcc/config/i386/immintrin.h
> +++ b/gcc/config/i386/immintrin.h
> @@ -118,9 +118,11 @@
>
>  #include <vpclmulqdqintrin.h>
>
> +#ifdef __SSE2__
>  #include <avx512bf16vlintrin.h>
>
>  #include <avx512bf16intrin.h>
> +#endif
>
>  #include <amxtileintrin.h>
>
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index f4b5506703f..fba81a93c1a 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -187,8 +187,6 @@
>    UNSPEC_VP2INTERSECT
>
>    ;; For AVX512BF16 support
> -  UNSPEC_VCVTNE2PS2BF16
> -  UNSPEC_VCVTNEPS2BF16
>    UNSPEC_VDPBF16PS
>
>    ;; For AVX512FP16 suppport
> @@ -28918,41 +28916,101 @@
>    "vp2intersectd\t{%2, %1, %0|%0, %1, %2}"
>    [(set_attr ("prefix") ("evex"))])
>
> -(define_mode_iterator BF16 [V32HI (V16HI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL")])
> +(define_mode_iterator VF_AVX512BF16VL
> +  [V32BF (V16BF "TARGET_AVX512VL") (V8BF "TARGET_AVX512VL")])
>  ;; Converting from BF to SF
>  (define_mode_attr bf16_cvt_2sf
> -  [(V32HI  "V16SF") (V16HI  "V8SF") (V8HI  "V4SF")])
> +  [(V32BF  "V16SF") (V16BF  "V8SF") (V8BF  "V4SF")])
>  ;; Converting from SF to BF
>  (define_mode_attr sf_cvt_bf16
> -  [(V4SF  "V8HI") (V8SF  "V8HI") (V16SF  "V16HI")])
> +  [(V8SF  "V8BF") (V16SF  "V16BF")])
>  ;; Mapping from BF to SF
>  (define_mode_attr sf_bf16
> -  [(V4SF  "V8HI") (V8SF  "V16HI") (V16SF  "V32HI")])
> +  [(V4SF  "V8BF") (V8SF  "V16BF") (V16SF  "V32BF")])
>
>  (define_expand "avx512f_cvtne2ps2bf16_<mode>_maskz"
> -  [(match_operand:BF16 0 "register_operand")
> +  [(match_operand:VF_AVX512BF16VL 0 "register_operand")
>     (match_operand:<bf16_cvt_2sf> 1 "register_operand")
> -   (match_operand:<bf16_cvt_2sf> 2 "register_operand")
> +   (match_operand:<bf16_cvt_2sf> 2 "nonimmediate_operand")
>     (match_operand:<avx512fmaskmode> 3 "register_operand")]
>    "TARGET_AVX512BF16"
>  {
> -  emit_insn (gen_avx512f_cvtne2ps2bf16_<mode>_mask(operands[0], operands[1],
> -    operands[2], CONST0_RTX(<MODE>mode), operands[3]));
> +  emit_insn (gen_avx512f_cvtne2ps2bf16_<mode>_mask(operands[0], operands[2],
> +    operands[1], CONST0_RTX(<MODE>mode), operands[3]));
>    DONE;
>  })
>
>  (define_insn "avx512f_cvtne2ps2bf16_<mode><mask_name>"
> -  [(set (match_operand:BF16 0 "register_operand" "=v")
> -       (unspec:BF16
> -         [(match_operand:<bf16_cvt_2sf> 1 "register_operand" "v")
> -          (match_operand:<bf16_cvt_2sf> 2 "register_operand" "v")]
> -        UNSPEC_VCVTNE2PS2BF16))]
> +  [(set (match_operand:VF_AVX512BF16VL 0 "register_operand" "=v")
> +       (vec_concat:VF_AVX512BF16VL
> +         (float_truncate:<ssehalfvecmode>
> +           (match_operand:<bf16_cvt_2sf> 2 "nonimmediate_operand" "vm"))
> +         (float_truncate:<ssehalfvecmode>
> +           (match_operand:<bf16_cvt_2sf> 1 "register_operand" "v"))))]
>    "TARGET_AVX512BF16"
>    "vcvtne2ps2bf16\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}")
>
> +(define_expand "avx512f_cvtneps2bf16_v4sf"
> +  [(set (match_operand:V8BF 0 "register_operand")
> +       (vec_concat:V8BF
> +         (float_truncate:V4BF
> +           (match_operand:V4SF 1 "nonimmediate_operand"))
> +         (match_dup 2)))]
> +  "TARGET_AVX512BF16 && TARGET_AVX512VL"
> +  "operands[2] = CONST0_RTX (V4BFmode);")
> +
> +(define_insn "*avx512f_cvtneps2bf16_v4sf"
> +  [(set (match_operand:V8BF 0 "register_operand" "=v")
> +       (vec_concat:V8BF
> +         (float_truncate:V4BF
> +           (match_operand:V4SF 1 "nonimmediate_operand" "vm"))
> +         (match_operand:V4BF 2 "const0_operand")))]
> +  "TARGET_AVX512BF16 && TARGET_AVX512VL"
> +  "vcvtneps2bf16{x}\t{%1, %0|%0, %1}")
> +
> +(define_expand "avx512f_cvtneps2bf16_v4sf_maskz"
> +  [(match_operand:V8BF 0 "register_operand")
> +   (match_operand:V4SF 1 "nonimmediate_operand")
> +   (match_operand:QI 2 "register_operand")]
> +  "TARGET_AVX512BF16 && TARGET_AVX512VL"
> +{
> +  emit_insn (gen_avx512f_cvtneps2bf16_v4sf_mask_1(operands[0], operands[1],
> +    CONST0_RTX(V8BFmode), operands[2], CONST0_RTX(V4BFmode)));
> +  DONE;
> +})
> +
> +(define_expand "avx512f_cvtneps2bf16_v4sf_mask"
> +  [(match_operand:V8BF 0 "register_operand")
> +   (match_operand:V4SF 1 "nonimmediate_operand")
> +   (match_operand:V8BF 2 "nonimm_or_0_operand")
> +   (match_operand:QI 3 "register_operand")]
> +  "TARGET_AVX512BF16 && TARGET_AVX512VL"
> +{
> +  emit_insn (gen_avx512f_cvtneps2bf16_v4sf_mask_1(operands[0], operands[1],
> +    operands[2], operands[3], CONST0_RTX(V4BFmode)));
> +  DONE;
> +})
> +
> +(define_insn "avx512f_cvtneps2bf16_v4sf_mask_1"
> +  [(set (match_operand:V8BF 0 "register_operand" "=v")
> +       (vec_concat:V8BF
> +         (vec_merge:V4BF
> +           (float_truncate:V4BF
> +             (match_operand:V4SF 1 "nonimmediate_operand" "vm"))
> +           (vec_select:V4BF
> +             (match_operand:V8BF 2 "nonimm_or_0_operand" "0C")
> +             (parallel [(const_int 0) (const_int 1)
> +                        (const_int 2) (const_int 3)]))
> +           (match_operand:QI 3 "register_operand" "Yk"))
> +         (match_operand:V4BF 4 "const0_operand")))]
> +  "TARGET_AVX512BF16 && TARGET_AVX512VL"
> +  "vcvtneps2bf16{x}\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}")
> +
> +(define_mode_iterator VF1_AVX512_256 [V16SF (V8SF "TARGET_AVX512VL")])
> +
>  (define_expand "avx512f_cvtneps2bf16_<mode>_maskz"
>    [(match_operand:<sf_cvt_bf16> 0 "register_operand")
> -   (match_operand:VF1_AVX512VL 1 "register_operand")
> +   (match_operand:VF1_AVX512_256 1 "nonimmediate_operand")
>     (match_operand:<avx512fmaskmode> 2 "register_operand")]
>    "TARGET_AVX512BF16"
>  {
> @@ -28963,11 +29021,10 @@
>
>  (define_insn "avx512f_cvtneps2bf16_<mode><mask_name>"
>    [(set (match_operand:<sf_cvt_bf16> 0 "register_operand" "=v")
> -       (unspec:<sf_cvt_bf16>
> -         [(match_operand:VF1_AVX512VL 1 "register_operand" "v")]
> -        UNSPEC_VCVTNEPS2BF16))]
> +       (float_truncate:<sf_cvt_bf16>
> +         (match_operand:VF1_AVX512_256 1 "nonimmediate_operand" "vm")))]
>    "TARGET_AVX512BF16"
> -  "vcvtneps2bf16\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}")
> +  "vcvtneps2bf16<qq2phsuff>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}")
>
>  (define_expand "avx512f_dpbf16ps_<mode>_maskz"
>    [(match_operand:VF1_AVX512VL 0 "register_operand")
> @@ -28987,7 +29044,7 @@
>         (unspec:VF1_AVX512VL
>           [(match_operand:VF1_AVX512VL 1 "register_operand" "0")
>            (match_operand:<sf_bf16> 2 "register_operand" "v")
> -          (match_operand:<sf_bf16> 3 "register_operand" "v")]
> +          (match_operand:<sf_bf16> 3 "nonimmediate_operand" "vm")]
>          UNSPEC_VDPBF16PS))]
>    "TARGET_AVX512BF16"
>    "vdpbf16ps\t{%3, %2, %0<maskz_half_operand4>|%0<maskz_half_operand4>, %2, %3}")
> @@ -28998,7 +29055,7 @@
>           (unspec:VF1_AVX512VL
>             [(match_operand:VF1_AVX512VL 1 "register_operand" "0")
>              (match_operand:<sf_bf16> 2 "register_operand" "v")
> -            (match_operand:<sf_bf16> 3 "register_operand" "v")]
> +            (match_operand:<sf_bf16> 3 "nonimmediate_operand" "vm")]
>               UNSPEC_VDPBF16PS)
>            (match_dup 1)
>            (match_operand:<avx512fmaskhalfmode> 4 "register_operand" "Yk")))]
> diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16-cvtsbh2ss-1.c b/gcc/testsuite/gcc.target/i386/avx512bf16-cvtsbh2ss-1.c
> index 831abd37d80..8e929e6f159 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512bf16-cvtsbh2ss-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512bf16-cvtsbh2ss-1.c
> @@ -1,6 +1,6 @@
>  /* { dg-do compile } */
>  /* { dg-options "-mavx512bf16 -O2" } */
> -/* { dg-additional-options "-fno-PIE" { target ia32 } } */
> +/* { dg-additional-options "-fno-PIE -mfpmath=sse" { target ia32 } } */
>  /* { dg-final { scan-assembler-times "sall\[ \\t\]+\[^\{\n\]*16" 1 } } */
>  /* { dg-final { scan-assembler-times "movl" 1 } } */
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c b/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c
> index b64ad7b84dd..02ebdd8cf5b 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c
> @@ -1,6 +1,6 @@
>  /* { dg-do compile } */
>  /* { dg-options "-mavx512bf16 -O2" } */
> -/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
>
>  #include <immintrin.h>
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c b/gcc/testsuite/gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c
> index 8f21b1bfdae..b71addd6301 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c
> @@ -1,6 +1,6 @@
>  /* { dg-do compile } */
>  /* { dg-options "-mavx512bf16 -mavx512vl -O2" } */
> -/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vcvtneps2bf16x\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>
>  #include <immintrin.h>
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c b/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c
> index 0969ae1b35e..d3a9bdf8c34 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c
> @@ -1,11 +1,11 @@
>  /* { dg-do compile } */
>  /* { dg-options "-mavx512bf16 -mavx512vl -O2" } */
> -/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vcvtneps2bf16y\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vcvtneps2bf16y\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vcvtneps2bf16y\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vcvtneps2bf16x\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vcvtneps2bf16x\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vcvtneps2bf16x\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
>
>  #include <immintrin.h>
>
> --
> 2.27.0
>


-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2022-10-28  6:26 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20221028060808.1637178-1-lingling.kong@intel.com>
2022-10-28  6:20 ` [PATCH] i386: using __bf16 for AVX512BF16 intrinsics Kong, Lingling
2022-10-28  6:29   ` Hongtao Liu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).