public inbox for gcc-cvs@sourceware.org
* [gcc r13-3566] i386: Use __bf16 for AVX512BF16 intrinsics
@ 2022-10-31  6:07 Kong Lingling
From: Kong Lingling @ 2022-10-31  6:07 UTC
  To: gcc-cvs

https://gcc.gnu.org/g:87235f1e5c740de9c6f72a5dd7d7eb9cb7df2e1d

commit r13-3566-g87235f1e5c740de9c6f72a5dd7d7eb9cb7df2e1d
Author: konglin1 <lingling.kong@intel.com>
Date:   Mon Oct 31 14:04:08 2022 +0800

    i386: Use __bf16 for AVX512BF16 intrinsics
    
    gcc/ChangeLog:
    
            * config/i386/avx512bf16intrin.h (__attribute__): Change short to __bf16.
            (_mm_cvtsbh_ss): Ditto.
            (_mm512_cvtne2ps_pbh): Ditto.
            (_mm512_mask_cvtne2ps_pbh): Ditto.
            (_mm512_maskz_cvtne2ps_pbh): Ditto.
            * config/i386/avx512bf16vlintrin.h (__attribute__): Ditto.
            (_mm256_cvtne2ps_pbh): Ditto.
            (_mm256_mask_cvtne2ps_pbh): Ditto.
            (_mm256_maskz_cvtne2ps_pbh): Ditto.
            (_mm_cvtne2ps_pbh): Ditto.
            (_mm_mask_cvtne2ps_pbh): Ditto.
            (_mm_maskz_cvtne2ps_pbh): Ditto.
            (_mm_cvtness_sbh): Ditto.
            * config/i386/i386-builtin-types.def (V8BF): Add new
            DEF_VECTOR_TYPE for BFmode.
            (V16BF): Ditto.
            (V32BF): Ditto.
            * config/i386/i386-builtin.def (BDESC): Fix BF16 builtins to
            use the new BF vector types.
            * config/i386/i386-expand.cc (ix86_expand_args_builtin): Change
            the avx512bf16 ix86_builtin_func_type cases from HI to BF
            vectors.
            * config/i386/immintrin.h: Add an SSE2 dependency for avx512bf16.
            * config/i386/sse.md (TARGET_AVX512VL): Change HI vector modes
            to BF vector modes.
            (avx512f_cvtneps2bf16_v4sf): New define_expand.
            (*avx512f_cvtneps2bf16_v4sf): New define_insn.
            (avx512f_cvtneps2bf16_v4sf_maskz): Ditto.
            (avx512f_cvtneps2bf16_v4sf_mask): Ditto.
            (avx512f_cvtneps2bf16_v4sf_mask_1): Ditto.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/i386/avx512bf16-cvtsbh2ss-1.c: Add fpmath option.
            * gcc.target/i386/avx512bf16-vdpbf16ps-2.c: Fix the
            scan-assembler pattern.
            * gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c: Add x/y suffix
            for vcvtneps2bf16.
            * gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c: Ditto.
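
    A minimal usage sketch (illustration only, not part of the patch;
    compile flags taken from the testsuite: -mavx512bf16 -mavx512vl -O2)
    showing the intrinsics now taking and returning __bf16 instead of
    short:

        #include <immintrin.h>

        /* Scalar conversions: __bf16 -> float and float -> __bf16.  */
        float
        widen (__bf16 x)
        {
          return _mm_cvtsbh_ss (x);
        }

        __bf16
        narrow (float f)
        {
          return _mm_cvtness_sbh (f);
        }

        /* Packed: two 16-element float vectors converted into one
           32-element bf16 vector (vcvtne2ps2bf16).  */
        __m512bh
        pack_two (__m512 a, __m512 b)
        {
          return _mm512_cvtne2ps_pbh (a, b);
        }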

Diff:
---
 gcc/config/i386/avx512bf16intrin.h                 |  12 +--
 gcc/config/i386/avx512bf16vlintrin.h               |  29 +++---
 gcc/config/i386/i386-builtin-types.def             |  51 ++++++-----
 gcc/config/i386/i386-builtin.def                   |  54 +++++------
 gcc/config/i386/i386-expand.cc                     |  48 +++++-----
 gcc/config/i386/immintrin.h                        |   2 +
 gcc/config/i386/sse.md                             | 101 ++++++++++++++++-----
 .../gcc.target/i386/avx512bf16-cvtsbh2ss-1.c       |   2 +-
 .../gcc.target/i386/avx512bf16-vdpbf16ps-2.c       |   2 +-
 .../gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c   |   2 +-
 .../gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c |  12 +--
 11 files changed, 189 insertions(+), 126 deletions(-)

diff --git a/gcc/config/i386/avx512bf16intrin.h b/gcc/config/i386/avx512bf16intrin.h
index b6e9ddad157..ea1d0125b3f 100644
--- a/gcc/config/i386/avx512bf16intrin.h
+++ b/gcc/config/i386/avx512bf16intrin.h
@@ -35,16 +35,16 @@
 #endif /* __AVX512BF16__ */
 
 /* Internal data types for implementing the intrinsics.  */
-typedef short __v32bh __attribute__ ((__vector_size__ (64)));
+typedef __bf16 __v32bf __attribute__ ((__vector_size__ (64)));
 
 /* The Intel API is flexible enough that we must allow aliasing with other
    vector types, and their scalar components.  */
-typedef short __m512bh __attribute__ ((__vector_size__ (64), __may_alias__));
+typedef __bf16 __m512bh __attribute__ ((__vector_size__ (64), __may_alias__));
 
 /* Convert One BF16 Data to One Single Float Data.  */
 extern __inline float
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsbh_ss (__bfloat16 __A)
+_mm_cvtsbh_ss (__bf16 __A)
 {
   union{ float a; unsigned int b;} __tmp;
   __tmp.b = ((unsigned int)(__A)) << 16;
@@ -57,21 +57,21 @@ extern __inline __m512bh
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_cvtne2ps_pbh (__m512 __A, __m512 __B)
 {
-  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi(__A, __B);
+  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf(__A, __B);
 }
 
 extern __inline __m512bh
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_cvtne2ps_pbh (__m512bh __A, __mmask32 __B, __m512 __C, __m512 __D)
 {
-  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_mask(__C, __D, __A, __B);
+  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf_mask(__C, __D, __A, __B);
 }
 
 extern __inline __m512bh
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_maskz_cvtne2ps_pbh (__mmask32 __A, __m512 __B, __m512 __C)
 {
-  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_maskz(__B, __C, __A);
+  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf_maskz(__B, __C, __A);
 }
 
 /* vcvtneps2bf16 */
diff --git a/gcc/config/i386/avx512bf16vlintrin.h b/gcc/config/i386/avx512bf16vlintrin.h
index 969335ff358..56c28f14cf6 100644
--- a/gcc/config/i386/avx512bf16vlintrin.h
+++ b/gcc/config/i386/avx512bf16vlintrin.h
@@ -35,57 +35,58 @@
 #endif /* __AVX512BF16__ */
 
 /* Internal data types for implementing the intrinsics.  */
-typedef short __v16bh __attribute__ ((__vector_size__ (32)));
-typedef short __v8bh __attribute__ ((__vector_size__ (16)));
+typedef __bf16 __v16bf __attribute__ ((__vector_size__ (32)));
+typedef __bf16 __v8bf __attribute__ ((__vector_size__ (16)));
 
 /* The Intel API is flexible enough that we must allow aliasing with other
    vector types, and their scalar components.  */
-typedef short __m256bh __attribute__ ((__vector_size__ (32), __may_alias__));
-typedef short __m128bh __attribute__ ((__vector_size__ (16), __may_alias__));
+typedef __bf16 __m256bh __attribute__ ((__vector_size__ (32), __may_alias__));
+typedef __bf16 __m128bh __attribute__ ((__vector_size__ (16), __may_alias__));
+
+typedef __bf16 __bfloat16;
 
-typedef unsigned short __bfloat16;
 /* vcvtne2ps2bf16 */
 
 extern __inline __m256bh
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_cvtne2ps_pbh (__m256 __A, __m256 __B)
 {
-  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi(__A, __B);
+  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf(__A, __B);
 }
 
 extern __inline __m256bh
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_mask_cvtne2ps_pbh (__m256bh __A, __mmask16 __B, __m256 __C, __m256 __D)
 {
-  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_mask(__C, __D, __A, __B);
+  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf_mask(__C, __D, __A, __B);
 }
 
 extern __inline __m256bh
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_maskz_cvtne2ps_pbh (__mmask16 __A, __m256 __B, __m256 __C)
 {
-  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_maskz(__B, __C, __A);
+  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf_maskz(__B, __C, __A);
 }
 
 extern __inline __m128bh
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtne2ps_pbh (__m128 __A, __m128 __B)
 {
-  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi(__A, __B);
+  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf(__A, __B);
 }
 
 extern __inline __m128bh
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mask_cvtne2ps_pbh (__m128bh __A, __mmask8 __B, __m128 __C, __m128 __D)
 {
-  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_mask(__C, __D, __A, __B);
+  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf_mask(__C, __D, __A, __B);
 }
 
 extern __inline __m128bh
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_maskz_cvtne2ps_pbh (__mmask8 __A, __m128 __B, __m128 __C)
 {
-  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_maskz(__B, __C, __A);
+  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf_maskz(__B, __C, __A);
 }
 
 /* vcvtneps2bf16 */
@@ -176,13 +177,13 @@ _mm_maskz_dpbf16_ps (__mmask8 __A, __m128 __B, __m128bh __C, __m128bh __D)
   return (__m128)__builtin_ia32_dpbf16ps_v4sf_maskz(__B, __C, __D, __A);
 }
 
-extern __inline __bfloat16
+extern __inline __bf16
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtness_sbh (float __A)
 {
   __v4sf __V = {__A, 0, 0, 0};
-  __v8hi __R = __builtin_ia32_cvtneps2bf16_v4sf_mask ((__v4sf)__V,
-	       (__v8hi)_mm_undefined_si128 (), (__mmask8)-1);
+  __v8bf __R = __builtin_ia32_cvtneps2bf16_v4sf_mask ((__v4sf)__V,
+	       (__v8bf)_mm_undefined_si128 (), (__mmask8)-1);
   return __R[0];
 }
 
diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
index 63a360b0f8b..aedae2d7750 100644
--- a/gcc/config/i386/i386-builtin-types.def
+++ b/gcc/config/i386/i386-builtin-types.def
@@ -87,6 +87,7 @@ DEF_VECTOR_TYPE (V8QI, QI)
 DEF_VECTOR_TYPE (V2DF, DOUBLE)
 DEF_VECTOR_TYPE (V4SF, FLOAT)
 DEF_VECTOR_TYPE (V8HF, FLOAT16)
+DEF_VECTOR_TYPE (V8BF, BFLOAT16)
 DEF_VECTOR_TYPE (V2DI, DI)
 DEF_VECTOR_TYPE (V4SI, SI)
 DEF_VECTOR_TYPE (V8HI, HI)
@@ -100,6 +101,7 @@ DEF_VECTOR_TYPE (V16UQI, UQI, V16QI)
 DEF_VECTOR_TYPE (V4DF, DOUBLE)
 DEF_VECTOR_TYPE (V8SF, FLOAT)
 DEF_VECTOR_TYPE (V16HF, FLOAT16)
+DEF_VECTOR_TYPE (V16BF, BFLOAT16)
 DEF_VECTOR_TYPE (V4DI, DI)
 DEF_VECTOR_TYPE (V8SI, SI)
 DEF_VECTOR_TYPE (V16HI, HI)
@@ -111,6 +113,7 @@ DEF_VECTOR_TYPE (V16UHI, UHI, V16HI)
 # AVX512F vectors
 DEF_VECTOR_TYPE (V32SF, FLOAT)
 DEF_VECTOR_TYPE (V32HF, FLOAT16)
+DEF_VECTOR_TYPE (V32BF, BFLOAT16)
 DEF_VECTOR_TYPE (V16SF, FLOAT)
 DEF_VECTOR_TYPE (V8DF, DOUBLE)
 DEF_VECTOR_TYPE (V8DI, DI)
@@ -1273,30 +1276,30 @@ DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI, UHI)
 DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI, UHI)
 
 # BF16 builtins
-DEF_FUNCTION_TYPE (V32HI, V16SF, V16SF)
-DEF_FUNCTION_TYPE (V32HI, V16SF, V16SF, V32HI, USI)
-DEF_FUNCTION_TYPE (V32HI, V16SF, V16SF, USI)
-DEF_FUNCTION_TYPE (V16HI, V8SF, V8SF)
-DEF_FUNCTION_TYPE (V16HI, V8SF, V8SF, V16HI, UHI)
-DEF_FUNCTION_TYPE (V16HI, V8SF, V8SF, UHI)
-DEF_FUNCTION_TYPE (V8HI, V4SF, V4SF)
-DEF_FUNCTION_TYPE (V8HI, V4SF, V4SF, V8HI, UQI)
-DEF_FUNCTION_TYPE (V8HI, V4SF, V4SF, UQI)
-DEF_FUNCTION_TYPE (V16HI, V16SF)
-DEF_FUNCTION_TYPE (V16HI, V16SF, V16HI, UHI)
-DEF_FUNCTION_TYPE (V16HI, V16SF, UHI)
-DEF_FUNCTION_TYPE (V8HI, V8SF)
-DEF_FUNCTION_TYPE (V8HI, V8SF, V8HI, UQI)
-DEF_FUNCTION_TYPE (V8HI, V8SF, UQI)
-DEF_FUNCTION_TYPE (V8HI, V4SF)
-DEF_FUNCTION_TYPE (V8HI, V4SF, V8HI, UQI)
-DEF_FUNCTION_TYPE (V8HI, V4SF, UQI)
-DEF_FUNCTION_TYPE (V16SF, V16SF, V32HI, V32HI)
-DEF_FUNCTION_TYPE (V16SF, V16SF, V32HI, V32HI, UHI)
-DEF_FUNCTION_TYPE (V8SF, V8SF, V16HI, V16HI)
-DEF_FUNCTION_TYPE (V8SF, V8SF, V16HI, V16HI, UQI)
-DEF_FUNCTION_TYPE (V4SF, V4SF, V8HI, V8HI)
-DEF_FUNCTION_TYPE (V4SF, V4SF, V8HI, V8HI, UQI)
+DEF_FUNCTION_TYPE (V32BF, V16SF, V16SF)
+DEF_FUNCTION_TYPE (V32BF, V16SF, V16SF, V32BF, USI)
+DEF_FUNCTION_TYPE (V32BF, V16SF, V16SF, USI)
+DEF_FUNCTION_TYPE (V16BF, V8SF, V8SF)
+DEF_FUNCTION_TYPE (V16BF, V8SF, V8SF, V16BF, UHI)
+DEF_FUNCTION_TYPE (V16BF, V8SF, V8SF, UHI)
+DEF_FUNCTION_TYPE (V8BF, V4SF, V4SF)
+DEF_FUNCTION_TYPE (V8BF, V4SF, V4SF, V8BF, UQI)
+DEF_FUNCTION_TYPE (V8BF, V4SF, V4SF, UQI)
+DEF_FUNCTION_TYPE (V16BF, V16SF)
+DEF_FUNCTION_TYPE (V16BF, V16SF, V16BF, UHI)
+DEF_FUNCTION_TYPE (V16BF, V16SF, UHI)
+DEF_FUNCTION_TYPE (V8BF, V8SF)
+DEF_FUNCTION_TYPE (V8BF, V8SF, V8BF, UQI)
+DEF_FUNCTION_TYPE (V8BF, V8SF, UQI)
+DEF_FUNCTION_TYPE (V8BF, V4SF)
+DEF_FUNCTION_TYPE (V8BF, V4SF, V8BF, UQI)
+DEF_FUNCTION_TYPE (V8BF, V4SF, UQI)
+DEF_FUNCTION_TYPE (V16SF, V16SF, V32BF, V32BF)
+DEF_FUNCTION_TYPE (V16SF, V16SF, V32BF, V32BF, UHI)
+DEF_FUNCTION_TYPE (V8SF, V8SF, V16BF, V16BF)
+DEF_FUNCTION_TYPE (V8SF, V8SF, V16BF, V16BF, UQI)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V8BF, V8BF)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V8BF, V8BF, UQI)
 
 # KEYLOCKER builtins
 DEF_FUNCTION_TYPE (UINT, UINT, V2DI, V2DI, PVOID)
diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index e35306e27d0..5802e2049a8 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -2779,33 +2779,33 @@ BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v32qi, "__builtin_ia32_vae
 BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v64qi, "__builtin_ia32_vaesenclast_v64qi", IX86_BUILTIN_VAESENCLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI)
 
 /* BF16 */
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi, "__builtin_ia32_cvtne2ps2bf16_v32hi", IX86_BUILTIN_CVTNE2PS2HI16_V32HI, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi_mask, "__builtin_ia32_cvtne2ps2bf16_v32hi_mask", IX86_BUILTIN_CVTNE2PS2HI16_V32HI_MASK, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF_V32HI_USI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi_maskz, "__builtin_ia32_cvtne2ps2bf16_v32hi_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V32HI_MASKZ, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF_USI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16hi, "__builtin_ia32_cvtne2ps2bf16_v16hi", IX86_BUILTIN_CVTNE2PS2HI16_V16HI, UNKNOWN, (int) V16HI_FTYPE_V8SF_V8SF)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16hi_mask, "__builtin_ia32_cvtne2ps2bf16_v16hi_mask", IX86_BUILTIN_CVTNE2PS2HI16_V16HI_MASK, UNKNOWN, (int) V16HI_FTYPE_V8SF_V8SF_V16HI_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16hi_maskz, "__builtin_ia32_cvtne2ps2bf16_v16hi_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V16HI_MASKZ, UNKNOWN, (int) V16HI_FTYPE_V8SF_V8SF_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8hi, "__builtin_ia32_cvtne2ps2bf16_v8hi", IX86_BUILTIN_CVTNE2PS2HI16_V8HI, UNKNOWN, (int) V8HI_FTYPE_V4SF_V4SF)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8hi_mask, "__builtin_ia32_cvtne2ps2bf16_v8hi_mask", IX86_BUILTIN_CVTNE2PS2HI16_V8HI_MASK, UNKNOWN, (int) V8HI_FTYPE_V4SF_V4SF_V8HI_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8hi_maskz, "__builtin_ia32_cvtne2ps2bf16_v8hi_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V8HI_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V4SF_V4SF_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf, "__builtin_ia32_cvtneps2bf16_v16sf", IX86_BUILTIN_CVTNEPS2HI16_V16SF, UNKNOWN, (int) V16HI_FTYPE_V16SF)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_mask, "__builtin_ia32_cvtneps2bf16_v16sf_mask", IX86_BUILTIN_CVTNEPS2HI16_V16SF_MASK, UNKNOWN, (int) V16HI_FTYPE_V16SF_V16HI_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_maskz, "__builtin_ia32_cvtneps2bf16_v16sf_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V16SF_MASKZ, UNKNOWN, (int) V16HI_FTYPE_V16SF_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf, "__builtin_ia32_cvtneps2bf16_v8sf", IX86_BUILTIN_CVTNEPS2HI16_V8SF, UNKNOWN, (int) V8HI_FTYPE_V8SF)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_mask, "__builtin_ia32_cvtneps2bf16_v8sf_mask", IX86_BUILTIN_CVTNEPS2HI16_V8SF_MASK, UNKNOWN, (int) V8HI_FTYPE_V8SF_V8HI_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_maskz, "__builtin_ia32_cvtneps2bf16_v8sf_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V8SF_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V8SF_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf, "__builtin_ia32_cvtneps2bf16_v4sf", IX86_BUILTIN_CVTNEPS2HI16_V4SF, UNKNOWN, (int) V8HI_FTYPE_V4SF)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_mask, "__builtin_ia32_cvtneps2bf16_v4sf_mask", IX86_BUILTIN_CVTNEPS2HI16_V4SF_MASK, UNKNOWN, (int) V8HI_FTYPE_V4SF_V8HI_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_maskz, "__builtin_ia32_cvtneps2bf16_v4sf_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V4SF_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V4SF_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf, "__builtin_ia32_dpbf16ps_v16sf", IX86_BUILTIN_DPHI16PS_V16SF, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32HI_V32HI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_mask, "__builtin_ia32_dpbf16ps_v16sf_mask", IX86_BUILTIN_DPHI16PS_V16SF_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32HI_V32HI_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_maskz, "__builtin_ia32_dpbf16ps_v16sf_maskz", IX86_BUILTIN_DPHI16PS_V16SF_MASKZ, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32HI_V32HI_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf, "__builtin_ia32_dpbf16ps_v8sf", IX86_BUILTIN_DPHI16PS_V8SF, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16HI_V16HI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_mask, "__builtin_ia32_dpbf16ps_v8sf_mask", IX86_BUILTIN_DPHI16PS_V8SF_MASK, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16HI_V16HI_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_maskz, "__builtin_ia32_dpbf16ps_v8sf_maskz", IX86_BUILTIN_DPHI16PS_V8SF_MASKZ, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16HI_V16HI_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf, "__builtin_ia32_dpbf16ps_v4sf", IX86_BUILTIN_DPHI16PS_V4SF, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_mask, "__builtin_ia32_dpbf16ps_v4sf_mask", IX86_BUILTIN_DPHI16PS_V4SF_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, "__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPHI16PS_V4SF_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32bf, "__builtin_ia32_cvtne2ps2bf16_v32bf", IX86_BUILTIN_CVTNE2PS2BF16_V32BF, UNKNOWN, (int) V32BF_FTYPE_V16SF_V16SF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32bf_mask, "__builtin_ia32_cvtne2ps2bf16_v32bf_mask", IX86_BUILTIN_CVTNE2PS2BF16_V32BF_MASK, UNKNOWN, (int) V32BF_FTYPE_V16SF_V16SF_V32BF_USI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32bf_maskz, "__builtin_ia32_cvtne2ps2bf16_v32bf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V32BF_MASKZ, UNKNOWN, (int) V32BF_FTYPE_V16SF_V16SF_USI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16bf, "__builtin_ia32_cvtne2ps2bf16_v16bf", IX86_BUILTIN_CVTNE2PS2BF16_V16BF, UNKNOWN, (int) V16BF_FTYPE_V8SF_V8SF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16bf_mask, "__builtin_ia32_cvtne2ps2bf16_v16bf_mask", IX86_BUILTIN_CVTNE2PS2BF16_V16BF_MASK, UNKNOWN, (int) V16BF_FTYPE_V8SF_V8SF_V16BF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16bf_maskz, "__builtin_ia32_cvtne2ps2bf16_v16bf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V16BF_MASKZ, UNKNOWN, (int) V16BF_FTYPE_V8SF_V8SF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8bf, "__builtin_ia32_cvtne2ps2bf16_v8bf", IX86_BUILTIN_CVTNE2PS2BF16_V8BF, UNKNOWN, (int) V8BF_FTYPE_V4SF_V4SF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8bf_mask, "__builtin_ia32_cvtne2ps2bf16_v8bf_mask", IX86_BUILTIN_CVTNE2PS2BF16_V8BF_MASK, UNKNOWN, (int) V8BF_FTYPE_V4SF_V4SF_V8BF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8bf_maskz, "__builtin_ia32_cvtne2ps2bf16_v8bf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V8BF_MASKZ, UNKNOWN, (int) V8BF_FTYPE_V4SF_V4SF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf, "__builtin_ia32_cvtneps2bf16_v16sf", IX86_BUILTIN_CVTNEPS2BF16_V16SF, UNKNOWN, (int) V16BF_FTYPE_V16SF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_mask, "__builtin_ia32_cvtneps2bf16_v16sf_mask", IX86_BUILTIN_CVTNEPS2BF16_V16SF_MASK, UNKNOWN, (int) V16BF_FTYPE_V16SF_V16BF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_maskz, "__builtin_ia32_cvtneps2bf16_v16sf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V16SF_MASKZ, UNKNOWN, (int) V16BF_FTYPE_V16SF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf, "__builtin_ia32_cvtneps2bf16_v8sf", IX86_BUILTIN_CVTNEPS2BF16_V8SF, UNKNOWN, (int) V8BF_FTYPE_V8SF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_mask, "__builtin_ia32_cvtneps2bf16_v8sf_mask", IX86_BUILTIN_CVTNEPS2BF16_V8SF_MASK, UNKNOWN, (int) V8BF_FTYPE_V8SF_V8BF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_maskz, "__builtin_ia32_cvtneps2bf16_v8sf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V8SF_MASKZ, UNKNOWN, (int) V8BF_FTYPE_V8SF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf, "__builtin_ia32_cvtneps2bf16_v4sf", IX86_BUILTIN_CVTNEPS2BF16_V4SF, UNKNOWN, (int) V8BF_FTYPE_V4SF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_mask, "__builtin_ia32_cvtneps2bf16_v4sf_mask", IX86_BUILTIN_CVTNEPS2BF16_V4SF_MASK, UNKNOWN, (int) V8BF_FTYPE_V4SF_V8BF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_maskz, "__builtin_ia32_cvtneps2bf16_v4sf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V4SF_MASKZ, UNKNOWN, (int) V8BF_FTYPE_V4SF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf, "__builtin_ia32_dpbf16ps_v16sf", IX86_BUILTIN_DPBF16PS_V16SF, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32BF_V32BF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_mask, "__builtin_ia32_dpbf16ps_v16sf_mask", IX86_BUILTIN_DPBF16PS_V16SF_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32BF_V32BF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_maskz, "__builtin_ia32_dpbf16ps_v16sf_maskz", IX86_BUILTIN_DPBF16PS_V16SF_MASKZ, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32BF_V32BF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf, "__builtin_ia32_dpbf16ps_v8sf", IX86_BUILTIN_DPBF16PS_V8SF, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16BF_V16BF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_mask, "__builtin_ia32_dpbf16ps_v8sf_mask", IX86_BUILTIN_DPBF16PS_V8SF_MASK, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16BF_V16BF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_maskz, "__builtin_ia32_dpbf16ps_v8sf_maskz", IX86_BUILTIN_DPBF16PS_V8SF_MASKZ, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16BF_V16BF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf, "__builtin_ia32_dpbf16ps_v4sf", IX86_BUILTIN_DPBF16PS_V4SF, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8BF_V8BF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_mask, "__builtin_ia32_dpbf16ps_v4sf_mask", IX86_BUILTIN_DPBF16PS_V4SF_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8BF_V8BF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, "__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPBF16PS_V4SF_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8BF_V8BF_UQI)
 
 /* AVX512FP16.  */
 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv8hf3_mask, "__builtin_ia32_addph128_mask", IX86_BUILTIN_ADDPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 5d9e5a12f7e..8e1ef0b4c4a 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -10462,9 +10462,9 @@ ix86_expand_args_builtin (const struct builtin_description *d,
     case V8DF_FTYPE_V2DF:
     case V8DF_FTYPE_V8DF:
     case V4DI_FTYPE_V4DI:
-    case V16HI_FTYPE_V16SF:
-    case V8HI_FTYPE_V8SF:
-    case V8HI_FTYPE_V4SF:
+    case V16BF_FTYPE_V16SF:
+    case V8BF_FTYPE_V8SF:
+    case V8BF_FTYPE_V4SF:
       nargs = 1;
       break;
     case V4SF_FTYPE_V4SF_VEC_MERGE:
@@ -10592,12 +10592,12 @@ ix86_expand_args_builtin (const struct builtin_description *d,
     case USI_FTYPE_USI_USI:
     case UDI_FTYPE_UDI_UDI:
     case V16SI_FTYPE_V8DF_V8DF:
-    case V32HI_FTYPE_V16SF_V16SF:
-    case V16HI_FTYPE_V8SF_V8SF:
-    case V8HI_FTYPE_V4SF_V4SF:
-    case V16HI_FTYPE_V16SF_UHI:
-    case V8HI_FTYPE_V8SF_UQI:
-    case V8HI_FTYPE_V4SF_UQI:
+    case V32BF_FTYPE_V16SF_V16SF:
+    case V16BF_FTYPE_V8SF_V8SF:
+    case V8BF_FTYPE_V4SF_V4SF:
+    case V16BF_FTYPE_V16SF_UHI:
+    case V8BF_FTYPE_V8SF_UQI:
+    case V8BF_FTYPE_V4SF_UQI:
       nargs = 2;
       break;
     case V2DI_FTYPE_V2DI_INT_CONVERT:
@@ -10803,15 +10803,15 @@ ix86_expand_args_builtin (const struct builtin_description *d,
     case V16HI_FTYPE_V16HI_V16HI_V16HI:
     case V8SI_FTYPE_V8SI_V8SI_V8SI:
     case V8HI_FTYPE_V8HI_V8HI_V8HI:
-    case V32HI_FTYPE_V16SF_V16SF_USI:
-    case V16HI_FTYPE_V8SF_V8SF_UHI:
-    case V8HI_FTYPE_V4SF_V4SF_UQI:
-    case V16HI_FTYPE_V16SF_V16HI_UHI:
-    case V8HI_FTYPE_V8SF_V8HI_UQI:
-    case V8HI_FTYPE_V4SF_V8HI_UQI:
-    case V16SF_FTYPE_V16SF_V32HI_V32HI:
-    case V8SF_FTYPE_V8SF_V16HI_V16HI:
-    case V4SF_FTYPE_V4SF_V8HI_V8HI:
+    case V32BF_FTYPE_V16SF_V16SF_USI:
+    case V16BF_FTYPE_V8SF_V8SF_UHI:
+    case V8BF_FTYPE_V4SF_V4SF_UQI:
+    case V16BF_FTYPE_V16SF_V16BF_UHI:
+    case V8BF_FTYPE_V8SF_V8BF_UQI:
+    case V8BF_FTYPE_V4SF_V8BF_UQI:
+    case V16SF_FTYPE_V16SF_V32BF_V32BF:
+    case V8SF_FTYPE_V8SF_V16BF_V16BF:
+    case V4SF_FTYPE_V4SF_V8BF_V8BF:
       nargs = 3;
       break;
     case V32QI_FTYPE_V32QI_V32QI_INT:
@@ -10958,9 +10958,9 @@ ix86_expand_args_builtin (const struct builtin_description *d,
     case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
     case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
     case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
-    case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
-    case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
-    case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
+    case V32BF_FTYPE_V16SF_V16SF_V32BF_USI:
+    case V16BF_FTYPE_V8SF_V8SF_V16BF_UHI:
+    case V8BF_FTYPE_V4SF_V4SF_V8BF_UQI:
       nargs = 4;
       break;
     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
@@ -10998,9 +10998,9 @@ ix86_expand_args_builtin (const struct builtin_description *d,
       break;
     case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
     case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
-    case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
-    case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
-    case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
+    case V16SF_FTYPE_V16SF_V32BF_V32BF_UHI:
+    case V8SF_FTYPE_V8SF_V16BF_V16BF_UQI:
+    case V4SF_FTYPE_V4SF_V8BF_V8BF_UQI:
       nargs = 4;
       break;
     case UQI_FTYPE_V8DI_V8DI_INT_UQI:
diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h
index ddea249d09b..c62d50f1951 100644
--- a/gcc/config/i386/immintrin.h
+++ b/gcc/config/i386/immintrin.h
@@ -118,9 +118,11 @@
 
 #include <vpclmulqdqintrin.h>
 
+#ifdef __SSE2__
 #include <avx512bf16vlintrin.h>
 
 #include <avx512bf16intrin.h>
+#endif
 
 #include <amxtileintrin.h>
 
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index f4b5506703f..fba81a93c1a 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -187,8 +187,6 @@
   UNSPEC_VP2INTERSECT
 
   ;; For AVX512BF16 support
-  UNSPEC_VCVTNE2PS2BF16
-  UNSPEC_VCVTNEPS2BF16
   UNSPEC_VDPBF16PS
 
   ;; For AVX512FP16 suppport
@@ -28918,41 +28916,101 @@
   "vp2intersectd\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr ("prefix") ("evex"))])
 
-(define_mode_iterator BF16 [V32HI (V16HI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL")])
+(define_mode_iterator VF_AVX512BF16VL
+  [V32BF (V16BF "TARGET_AVX512VL") (V8BF "TARGET_AVX512VL")])
 ;; Converting from BF to SF
 (define_mode_attr bf16_cvt_2sf
-  [(V32HI  "V16SF") (V16HI  "V8SF") (V8HI  "V4SF")])
+  [(V32BF  "V16SF") (V16BF  "V8SF") (V8BF  "V4SF")])
 ;; Converting from SF to BF
 (define_mode_attr sf_cvt_bf16
-  [(V4SF  "V8HI") (V8SF  "V8HI") (V16SF  "V16HI")])
+  [(V8SF  "V8BF") (V16SF  "V16BF")])
 ;; Mapping from BF to SF
 (define_mode_attr sf_bf16
-  [(V4SF  "V8HI") (V8SF  "V16HI") (V16SF  "V32HI")])
+  [(V4SF  "V8BF") (V8SF  "V16BF") (V16SF  "V32BF")])
 
 (define_expand "avx512f_cvtne2ps2bf16_<mode>_maskz"
-  [(match_operand:BF16 0 "register_operand")
+  [(match_operand:VF_AVX512BF16VL 0 "register_operand")
    (match_operand:<bf16_cvt_2sf> 1 "register_operand")
-   (match_operand:<bf16_cvt_2sf> 2 "register_operand")
+   (match_operand:<bf16_cvt_2sf> 2 "nonimmediate_operand")
    (match_operand:<avx512fmaskmode> 3 "register_operand")]
   "TARGET_AVX512BF16"
 {
-  emit_insn (gen_avx512f_cvtne2ps2bf16_<mode>_mask(operands[0], operands[1],
-    operands[2], CONST0_RTX(<MODE>mode), operands[3]));
+  emit_insn (gen_avx512f_cvtne2ps2bf16_<mode>_mask(operands[0], operands[2],
+    operands[1], CONST0_RTX(<MODE>mode), operands[3]));
   DONE;
 })
 
 (define_insn "avx512f_cvtne2ps2bf16_<mode><mask_name>"
-  [(set (match_operand:BF16 0 "register_operand" "=v")
-	(unspec:BF16
-	  [(match_operand:<bf16_cvt_2sf> 1 "register_operand" "v")
-	   (match_operand:<bf16_cvt_2sf> 2 "register_operand" "v")]
-        UNSPEC_VCVTNE2PS2BF16))]
+  [(set (match_operand:VF_AVX512BF16VL 0 "register_operand" "=v")
+	(vec_concat:VF_AVX512BF16VL
+	  (float_truncate:<ssehalfvecmode>
+	    (match_operand:<bf16_cvt_2sf> 2 "nonimmediate_operand" "vm"))
+	  (float_truncate:<ssehalfvecmode>
+	    (match_operand:<bf16_cvt_2sf> 1 "register_operand" "v"))))]
   "TARGET_AVX512BF16"
   "vcvtne2ps2bf16\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}")
 
+(define_expand "avx512f_cvtneps2bf16_v4sf"
+  [(set (match_operand:V8BF 0 "register_operand")
+	(vec_concat:V8BF
+	  (float_truncate:V4BF
+	    (match_operand:V4SF 1 "nonimmediate_operand"))
+	  (match_dup 2)))]
+  "TARGET_AVX512BF16 && TARGET_AVX512VL"
+  "operands[2] = CONST0_RTX (V4BFmode);")
+
+(define_insn "*avx512f_cvtneps2bf16_v4sf"
+  [(set (match_operand:V8BF 0 "register_operand" "=v")
+	(vec_concat:V8BF
+	  (float_truncate:V4BF
+	    (match_operand:V4SF 1 "nonimmediate_operand" "vm"))
+	  (match_operand:V4BF 2 "const0_operand")))]
+  "TARGET_AVX512BF16 && TARGET_AVX512VL"
+  "vcvtneps2bf16{x}\t{%1, %0|%0, %1}")
+
+(define_expand "avx512f_cvtneps2bf16_v4sf_maskz"
+  [(match_operand:V8BF 0 "register_operand")
+   (match_operand:V4SF 1 "nonimmediate_operand")
+   (match_operand:QI 2 "register_operand")]
+  "TARGET_AVX512BF16 && TARGET_AVX512VL"
+{
+  emit_insn (gen_avx512f_cvtneps2bf16_v4sf_mask_1(operands[0], operands[1],
+    CONST0_RTX(V8BFmode), operands[2], CONST0_RTX(V4BFmode)));
+  DONE;
+})
+
+(define_expand "avx512f_cvtneps2bf16_v4sf_mask"
+  [(match_operand:V8BF 0 "register_operand")
+   (match_operand:V4SF 1 "nonimmediate_operand")
+   (match_operand:V8BF 2 "nonimm_or_0_operand")
+   (match_operand:QI 3 "register_operand")]
+  "TARGET_AVX512BF16 && TARGET_AVX512VL"
+{
+  emit_insn (gen_avx512f_cvtneps2bf16_v4sf_mask_1(operands[0], operands[1],
+    operands[2], operands[3], CONST0_RTX(V4BFmode)));
+  DONE;
+})
+
+(define_insn "avx512f_cvtneps2bf16_v4sf_mask_1"
+  [(set (match_operand:V8BF 0 "register_operand" "=v")
+	(vec_concat:V8BF
+	  (vec_merge:V4BF
+	    (float_truncate:V4BF
+	      (match_operand:V4SF 1 "nonimmediate_operand" "vm"))
+	    (vec_select:V4BF
+	      (match_operand:V8BF 2 "nonimm_or_0_operand" "0C")
+	      (parallel [(const_int 0) (const_int 1)
+			 (const_int 2) (const_int 3)]))
+	    (match_operand:QI 3 "register_operand" "Yk"))
+	  (match_operand:V4BF 4 "const0_operand")))]
+  "TARGET_AVX512BF16 && TARGET_AVX512VL"
+  "vcvtneps2bf16{x}\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}")
+
+(define_mode_iterator VF1_AVX512_256 [V16SF (V8SF "TARGET_AVX512VL")])
+
 (define_expand "avx512f_cvtneps2bf16_<mode>_maskz"
   [(match_operand:<sf_cvt_bf16> 0 "register_operand")
-   (match_operand:VF1_AVX512VL 1 "register_operand")
+   (match_operand:VF1_AVX512_256 1 "nonimmediate_operand")
    (match_operand:<avx512fmaskmode> 2 "register_operand")]
   "TARGET_AVX512BF16"
 {
@@ -28963,11 +29021,10 @@
 
 (define_insn "avx512f_cvtneps2bf16_<mode><mask_name>"
   [(set (match_operand:<sf_cvt_bf16> 0 "register_operand" "=v")
-	(unspec:<sf_cvt_bf16>
-	  [(match_operand:VF1_AVX512VL 1 "register_operand" "v")]
-        UNSPEC_VCVTNEPS2BF16))]
+	(float_truncate:<sf_cvt_bf16>
+	  (match_operand:VF1_AVX512_256 1 "nonimmediate_operand" "vm")))]
   "TARGET_AVX512BF16"
-  "vcvtneps2bf16\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}")
+  "vcvtneps2bf16<qq2phsuff>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}")
 
 (define_expand "avx512f_dpbf16ps_<mode>_maskz"
   [(match_operand:VF1_AVX512VL 0 "register_operand")
@@ -28987,7 +29044,7 @@
 	(unspec:VF1_AVX512VL
 	  [(match_operand:VF1_AVX512VL 1 "register_operand" "0")
 	   (match_operand:<sf_bf16> 2 "register_operand" "v")
-	   (match_operand:<sf_bf16> 3 "register_operand" "v")]
+	   (match_operand:<sf_bf16> 3 "nonimmediate_operand" "vm")]
         UNSPEC_VDPBF16PS))]
   "TARGET_AVX512BF16"
   "vdpbf16ps\t{%3, %2, %0<maskz_half_operand4>|%0<maskz_half_operand4>, %2, %3}")
@@ -28998,7 +29055,7 @@
 	  (unspec:VF1_AVX512VL
 	    [(match_operand:VF1_AVX512VL 1 "register_operand" "0")
 	     (match_operand:<sf_bf16> 2 "register_operand" "v")
-	     (match_operand:<sf_bf16> 3 "register_operand" "v")]
+	     (match_operand:<sf_bf16> 3 "nonimmediate_operand" "vm")]
              UNSPEC_VDPBF16PS)
           (match_dup 1)
           (match_operand:<avx512fmaskhalfmode> 4 "register_operand" "Yk")))]
diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16-cvtsbh2ss-1.c b/gcc/testsuite/gcc.target/i386/avx512bf16-cvtsbh2ss-1.c
index 831abd37d80..8e929e6f159 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bf16-cvtsbh2ss-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bf16-cvtsbh2ss-1.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512bf16 -O2" } */
-/* { dg-additional-options "-fno-PIE" { target ia32 } } */
+/* { dg-additional-options "-fno-PIE -mfpmath=sse" { target ia32 } } */
 /* { dg-final { scan-assembler-times "sall\[ \\t\]+\[^\{\n\]*16" 1 } } */
 /* { dg-final { scan-assembler-times "movl" 1 } } */
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c b/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c
index b64ad7b84dd..02ebdd8cf5b 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512bf16 -O2" } */
-/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
 
 #include <immintrin.h>
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c b/gcc/testsuite/gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c
index 8f21b1bfdae..b71addd6301 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512bf16 -mavx512vl -O2" } */
-/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16x\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 
 #include <immintrin.h>
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c b/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c
index 0969ae1b35e..d3a9bdf8c34 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c
@@ -1,11 +1,11 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512bf16 -mavx512vl -O2" } */
-/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16y\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16y\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16y\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16x\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16x\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16x\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
 
 #include <immintrin.h>
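
A further sketch (illustration only; the intrinsic names _mm_cvtneps_pbh
and _mm256_cvtneps_pbh are assumed from the existing AVX512BF16VL API and
do not appear in the hunks above) of the x/y suffix the updated
scan-assembler patterns expect, again compiled with -mavx512bf16
-mavx512vl -O2:

    #include <immintrin.h>

    /* 128-bit float source: expected to assemble as vcvtneps2bf16x.  */
    __m128bh
    from_xmm (__m128 x)
    {
      return _mm_cvtneps_pbh (x);
    }

    /* 256-bit float source: expected to assemble as vcvtneps2bf16y.  */
    __m128bh
    from_ymm (__m256 y)
    {
      return _mm256_cvtneps_pbh (y);
    }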
