public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] Fix incorrect option mask and avx512cd target push
@ 2023-10-30  7:44 Haochen Jiang
  2023-10-30  7:56 ` Hongtao Liu
  0 siblings, 1 reply; 2+ messages in thread
From: Haochen Jiang @ 2023-10-30  7:44 UTC (permalink / raw)
  To: gcc-patches; +Cc: ubizjak, hongtao.liu

Hi all,

This patch fixes two obvious bugs in the current evex512 implementation.

Also, I moved the AVX512CD+AVX512VL part out of the AVX512VL section to
avoid accidentally mishandling avx512cd in the future.

Ok for trunk?

BRs,
Haochen

gcc/ChangeLog:

	* config/i386/avx512cdintrin.h (target): Push evex512 for
	avx512cd.
	* config/i386/avx512vlintrin.h (target): Split avx512cdvl part
	out from avx512vl.
	* config/i386/i386-builtin.def (BDESC): Do not check evex512
	for builtins that do not need it.
---
 gcc/config/i386/avx512cdintrin.h |    2 +-
 gcc/config/i386/avx512vlintrin.h | 1792 +++++++++++++++---------------
 gcc/config/i386/i386-builtin.def |    4 +-
 3 files changed, 899 insertions(+), 899 deletions(-)

diff --git a/gcc/config/i386/avx512cdintrin.h b/gcc/config/i386/avx512cdintrin.h
index a5f5eabb68d..56a786aa9a3 100644
--- a/gcc/config/i386/avx512cdintrin.h
+++ b/gcc/config/i386/avx512cdintrin.h
@@ -30,7 +30,7 @@
 
 #ifndef __AVX512CD__
 #pragma GCC push_options
-#pragma GCC target("avx512cd")
+#pragma GCC target("avx512cd,evex512")
 #define __DISABLE_AVX512CD__
 #endif /* __AVX512CD__ */
 
diff --git a/gcc/config/i386/avx512vlintrin.h b/gcc/config/i386/avx512vlintrin.h
index 08e49e8d8ab..a40aa91b948 100644
--- a/gcc/config/i386/avx512vlintrin.h
+++ b/gcc/config/i386/avx512vlintrin.h
@@ -8396,1281 +8396,1003 @@ _mm_mask_min_epu32 (__m128i __W, __mmask8 __M, __m128i __A,
 						  (__v4si) __W, __M);
 }
 
-#ifndef __AVX512CD__
-#pragma GCC push_options
-#pragma GCC target("avx512vl,avx512cd")
-#define __DISABLE_AVX512VLCD__
-#endif
-
-extern __inline __m128i
+extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_broadcastmb_epi64 (__mmask8 __A)
+_mm256_mask_unpacklo_pd (__m256d __W, __mmask8 __U, __m256d __A,
+			 __m256d __B)
 {
-  return (__m128i) __builtin_ia32_broadcastmb128 (__A);
+  return (__m256d) __builtin_ia32_unpcklpd256_mask ((__v4df) __A,
+						    (__v4df) __B,
+						    (__v4df) __W,
+						    (__mmask8) __U);
 }
 
-extern __inline __m256i
+extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_broadcastmb_epi64 (__mmask8 __A)
+_mm256_maskz_unpacklo_pd (__mmask8 __U, __m256d __A, __m256d __B)
 {
-  return (__m256i) __builtin_ia32_broadcastmb256 (__A);
+  return (__m256d) __builtin_ia32_unpcklpd256_mask ((__v4df) __A,
+						    (__v4df) __B,
+						    (__v4df)
+						    _mm256_setzero_pd (),
+						    (__mmask8) __U);
 }
 
-extern __inline __m128i
+extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_broadcastmw_epi32 (__mmask16 __A)
+_mm_mask_unpacklo_pd (__m128d __W, __mmask8 __U, __m128d __A,
+		      __m128d __B)
 {
-  return (__m128i) __builtin_ia32_broadcastmw128 (__A);
+  return (__m128d) __builtin_ia32_unpcklpd128_mask ((__v2df) __A,
+						    (__v2df) __B,
+						    (__v2df) __W,
+						    (__mmask8) __U);
 }
 
-extern __inline __m256i
+extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_broadcastmw_epi32 (__mmask16 __A)
+_mm_maskz_unpacklo_pd (__mmask8 __U, __m128d __A, __m128d __B)
 {
-  return (__m256i) __builtin_ia32_broadcastmw256 (__A);
+  return (__m128d) __builtin_ia32_unpcklpd128_mask ((__v2df) __A,
+						    (__v2df) __B,
+						    (__v2df)
+						    _mm_setzero_pd (),
+						    (__mmask8) __U);
 }
 
-extern __inline __m256i
+extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_lzcnt_epi32 (__m256i __A)
+_mm256_mask_unpacklo_ps (__m256 __W, __mmask8 __U, __m256 __A,
+			 __m256 __B)
 {
-  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
-						     (__v8si)
-						     _mm256_setzero_si256 (),
-						     (__mmask8) -1);
+  return (__m256) __builtin_ia32_unpcklps256_mask ((__v8sf) __A,
+						   (__v8sf) __B,
+						   (__v8sf) __W,
+						   (__mmask8) __U);
 }
 
-extern __inline __m256i
+extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+_mm256_mask_unpackhi_pd (__m256d __W, __mmask8 __U, __m256d __A,
+			 __m256d __B)
 {
-  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
-						     (__v8si) __W,
-						     (__mmask8) __U);
+  return (__m256d) __builtin_ia32_unpckhpd256_mask ((__v4df) __A,
+						    (__v4df) __B,
+						    (__v4df) __W,
+						    (__mmask8) __U);
 }
 
-extern __inline __m256i
+extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A)
+_mm256_maskz_unpackhi_pd (__mmask8 __U, __m256d __A, __m256d __B)
 {
-  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
-						     (__v8si)
-						     _mm256_setzero_si256 (),
-						     (__mmask8) __U);
+  return (__m256d) __builtin_ia32_unpckhpd256_mask ((__v4df) __A,
+						    (__v4df) __B,
+						    (__v4df)
+						    _mm256_setzero_pd (),
+						    (__mmask8) __U);
 }
 
-extern __inline __m256i
+extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_lzcnt_epi64 (__m256i __A)
+_mm_mask_unpackhi_pd (__m128d __W, __mmask8 __U, __m128d __A,
+		      __m128d __B)
 {
-  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
-						     (__v4di)
-						     _mm256_setzero_si256 (),
-						     (__mmask8) -1);
+  return (__m128d) __builtin_ia32_unpckhpd128_mask ((__v2df) __A,
+						    (__v2df) __B,
+						    (__v2df) __W,
+						    (__mmask8) __U);
 }
 
-extern __inline __m256i
+extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+_mm_maskz_unpackhi_pd (__mmask8 __U, __m128d __A, __m128d __B)
 {
-  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
-						     (__v4di) __W,
-						     (__mmask8) __U);
+  return (__m128d) __builtin_ia32_unpckhpd128_mask ((__v2df) __A,
+						    (__v2df) __B,
+						    (__v2df)
+						    _mm_setzero_pd (),
+						    (__mmask8) __U);
 }
 
-extern __inline __m256i
+extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A)
+_mm256_mask_unpackhi_ps (__m256 __W, __mmask8 __U, __m256 __A,
+			 __m256 __B)
 {
-  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
-						     (__v4di)
-						     _mm256_setzero_si256 (),
-						     (__mmask8) __U);
+  return (__m256) __builtin_ia32_unpckhps256_mask ((__v8sf) __A,
+						   (__v8sf) __B,
+						   (__v8sf) __W,
+						   (__mmask8) __U);
 }
 
-extern __inline __m256i
+extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_conflict_epi64 (__m256i __A)
+_mm256_maskz_unpackhi_ps (__mmask8 __U, __m256 __A, __m256 __B)
 {
-  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
-							 (__v4di)
-							 _mm256_setzero_si256 (),
-							 (__mmask8) -1);
+  return (__m256) __builtin_ia32_unpckhps256_mask ((__v8sf) __A,
+						   (__v8sf) __B,
+						   (__v8sf)
+						   _mm256_setzero_ps (),
+						   (__mmask8) __U);
 }
 
-extern __inline __m256i
+extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+_mm_mask_unpackhi_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
 {
-  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
-							 (__v4di) __W,
-							 (__mmask8)
-							 __U);
+  return (__m128) __builtin_ia32_unpckhps128_mask ((__v4sf) __A,
+						   (__v4sf) __B,
+						   (__v4sf) __W,
+						   (__mmask8) __U);
 }
 
-extern __inline __m256i
+extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
+_mm_maskz_unpackhi_ps (__mmask8 __U, __m128 __A, __m128 __B)
 {
-  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
-							 (__v4di)
-							 _mm256_setzero_si256 (),
-							 (__mmask8)
-							 __U);
+  return (__m128) __builtin_ia32_unpckhps128_mask ((__v4sf) __A,
+						   (__v4sf) __B,
+						   (__v4sf)
+						   _mm_setzero_ps (),
+						   (__mmask8) __U);
 }
 
-extern __inline __m256i
+extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_conflict_epi32 (__m256i __A)
+_mm_mask_cvtph_ps (__m128 __W, __mmask8 __U, __m128i __A)
 {
-  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
-							 (__v8si)
-							 _mm256_setzero_si256 (),
-							 (__mmask8) -1);
+  return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A,
+						 (__v4sf) __W,
+						 (__mmask8) __U);
 }
 
-extern __inline __m256i
+extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+_mm_maskz_cvtph_ps (__mmask8 __U, __m128i __A)
 {
-  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
-							 (__v8si) __W,
-							 (__mmask8)
-							 __U);
+  return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A,
+						 (__v4sf)
+						 _mm_setzero_ps (),
+						 (__mmask8) __U);
 }
 
-extern __inline __m256i
+extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
+_mm256_maskz_unpacklo_ps (__mmask8 __U, __m256 __A, __m256 __B)
 {
-  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
-							 (__v8si)
-							 _mm256_setzero_si256 (),
-							 (__mmask8)
-							 __U);
+  return (__m256) __builtin_ia32_unpcklps256_mask ((__v8sf) __A,
+						   (__v8sf) __B,
+						   (__v8sf)
+						   _mm256_setzero_ps (),
+						   (__mmask8) __U);
 }
 
-extern __inline __m128i
+extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_lzcnt_epi32 (__m128i __A)
+_mm256_mask_cvtph_ps (__m256 __W, __mmask8 __U, __m128i __A)
 {
-  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
-						     (__v4si)
-						     _mm_setzero_si128 (),
-						     (__mmask8) -1);
+  return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A,
+						    (__v8sf) __W,
+						    (__mmask8) __U);
 }
 
-extern __inline __m128i
+extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+_mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A)
 {
-  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
-						     (__v4si) __W,
-						     (__mmask8) __U);
+  return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A,
+						    (__v8sf)
+						    _mm256_setzero_ps (),
+						    (__mmask8) __U);
 }
 
-extern __inline __m128i
+extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A)
+_mm_mask_unpacklo_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
 {
-  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
-						     (__v4si)
-						     _mm_setzero_si128 (),
-						     (__mmask8) __U);
+  return (__m128) __builtin_ia32_unpcklps128_mask ((__v4sf) __A,
+						   (__v4sf) __B,
+						   (__v4sf) __W,
+						   (__mmask8) __U);
 }
 
-extern __inline __m128i
+extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_lzcnt_epi64 (__m128i __A)
+_mm_maskz_unpacklo_ps (__mmask8 __U, __m128 __A, __m128 __B)
 {
-  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
-						     (__v2di)
-						     _mm_setzero_si128 (),
-						     (__mmask8) -1);
+  return (__m128) __builtin_ia32_unpcklps128_mask ((__v4sf) __A,
+						   (__v4sf) __B,
+						   (__v4sf)
+						   _mm_setzero_ps (),
+						   (__mmask8) __U);
 }
 
-extern __inline __m128i
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+_mm256_mask_sra_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+		       __m128i __B)
 {
-  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
-						     (__v2di) __W,
-						     (__mmask8) __U);
+  return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A,
+						 (__v4si) __B,
+						 (__v8si) __W,
+						 (__mmask8) __U);
 }
 
-extern __inline __m128i
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A)
+_mm256_maskz_sra_epi32 (__mmask8 __U, __m256i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
-						     (__v2di)
-						     _mm_setzero_si128 (),
-						     (__mmask8) __U);
+  return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A,
+						 (__v4si) __B,
+						 (__v8si)
+						 _mm256_setzero_si256 (),
+						 (__mmask8) __U);
 }
 
 extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_conflict_epi64 (__m128i __A)
+_mm_mask_sra_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+		    __m128i __B)
 {
-  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
-							 (__v2di)
-							 _mm_setzero_si128 (),
-							 (__mmask8) -1);
+  return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A,
+						 (__v4si) __B,
+						 (__v4si) __W,
+						 (__mmask8) __U);
 }
 
 extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+_mm_maskz_sra_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
-							 (__v2di) __W,
-							 (__mmask8)
-							 __U);
+  return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A,
+						 (__v4si) __B,
+						 (__v4si)
+						 _mm_setzero_si128 (),
+						 (__mmask8) __U);
 }
 
-extern __inline __m128i
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A)
+_mm256_sra_epi64 (__m256i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
-							 (__v2di)
-							 _mm_setzero_si128 (),
-							 (__mmask8)
-							 __U);
+  return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
+						 (__v2di) __B,
+						 (__v4di)
+						 _mm256_setzero_si256 (),
+						 (__mmask8) -1);
 }
 
-extern __inline __m128i
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_conflict_epi32 (__m128i __A)
+_mm256_mask_sra_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+		       __m128i __B)
 {
-  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
-							 (__v4si)
-							 _mm_setzero_si128 (),
-							 (__mmask8) -1);
+  return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
+						 (__v2di) __B,
+						 (__v4di) __W,
+						 (__mmask8) __U);
 }
 
-extern __inline __m128i
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+_mm256_maskz_sra_epi64 (__mmask8 __U, __m256i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
-							 (__v4si) __W,
-							 (__mmask8)
-							 __U);
+  return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
+						 (__v2di) __B,
+						 (__v4di)
+						 _mm256_setzero_si256 (),
+						 (__mmask8) __U);
 }
 
 extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
+_mm_sra_epi64 (__m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
-							 (__v4si)
-							 _mm_setzero_si128 (),
-							 (__mmask8)
-							 __U);
+  return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
+						 (__v2di) __B,
+						 (__v2di)
+						 _mm_setzero_si128 (),
+						 (__mmask8) -1);
 }
 
-#ifdef __DISABLE_AVX512VLCD__
-#pragma GCC pop_options
-#endif
-
-extern __inline __m256d
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_unpacklo_pd (__m256d __W, __mmask8 __U, __m256d __A,
-			 __m256d __B)
+_mm_mask_sra_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+		    __m128i __B)
 {
-  return (__m256d) __builtin_ia32_unpcklpd256_mask ((__v4df) __A,
-						    (__v4df) __B,
-						    (__v4df) __W,
-						    (__mmask8) __U);
+  return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
+						 (__v2di) __B,
+						 (__v2di) __W,
+						 (__mmask8) __U);
 }
 
-extern __inline __m256d
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_unpacklo_pd (__mmask8 __U, __m256d __A, __m256d __B)
+_mm_maskz_sra_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m256d) __builtin_ia32_unpcklpd256_mask ((__v4df) __A,
-						    (__v4df) __B,
-						    (__v4df)
-						    _mm256_setzero_pd (),
-						    (__mmask8) __U);
+  return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
+						 (__v2di) __B,
+						 (__v2di)
+						 _mm_setzero_si128 (),
+						 (__mmask8) __U);
 }
 
-extern __inline __m128d
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_unpacklo_pd (__m128d __W, __mmask8 __U, __m128d __A,
-		      __m128d __B)
+_mm_mask_sll_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+		    __m128i __B)
 {
-  return (__m128d) __builtin_ia32_unpcklpd128_mask ((__v2df) __A,
-						    (__v2df) __B,
-						    (__v2df) __W,
-						    (__mmask8) __U);
+  return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A,
+						 (__v4si) __B,
+						 (__v4si) __W,
+						 (__mmask8) __U);
 }
 
-extern __inline __m128d
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_unpacklo_pd (__mmask8 __U, __m128d __A, __m128d __B)
+_mm_maskz_sll_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128d) __builtin_ia32_unpcklpd128_mask ((__v2df) __A,
-						    (__v2df) __B,
-						    (__v2df)
-						    _mm_setzero_pd (),
-						    (__mmask8) __U);
+  return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A,
+						 (__v4si) __B,
+						 (__v4si)
+						 _mm_setzero_si128 (),
+						 (__mmask8) __U);
 }
 
-extern __inline __m256
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_unpacklo_ps (__m256 __W, __mmask8 __U, __m256 __A,
-			 __m256 __B)
+_mm_mask_sll_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+		    __m128i __B)
 {
-  return (__m256) __builtin_ia32_unpcklps256_mask ((__v8sf) __A,
-						   (__v8sf) __B,
-						   (__v8sf) __W,
-						   (__mmask8) __U);
+  return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A,
+						 (__v2di) __B,
+						 (__v2di) __W,
+						 (__mmask8) __U);
 }
 
-extern __inline __m256d
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_unpackhi_pd (__m256d __W, __mmask8 __U, __m256d __A,
-			 __m256d __B)
+_mm_maskz_sll_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m256d) __builtin_ia32_unpckhpd256_mask ((__v4df) __A,
-						    (__v4df) __B,
-						    (__v4df) __W,
-						    (__mmask8) __U);
+  return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A,
+						 (__v2di) __B,
+						 (__v2di)
+						 _mm_setzero_si128 (),
+						 (__mmask8) __U);
 }
 
-extern __inline __m256d
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_unpackhi_pd (__mmask8 __U, __m256d __A, __m256d __B)
+_mm256_mask_sll_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+		       __m128i __B)
 {
-  return (__m256d) __builtin_ia32_unpckhpd256_mask ((__v4df) __A,
-						    (__v4df) __B,
-						    (__v4df)
-						    _mm256_setzero_pd (),
-						    (__mmask8) __U);
+  return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A,
+						 (__v4si) __B,
+						 (__v8si) __W,
+						 (__mmask8) __U);
 }
 
-extern __inline __m128d
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_unpackhi_pd (__m128d __W, __mmask8 __U, __m128d __A,
-		      __m128d __B)
+_mm256_maskz_sll_epi32 (__mmask8 __U, __m256i __A, __m128i __B)
 {
-  return (__m128d) __builtin_ia32_unpckhpd128_mask ((__v2df) __A,
-						    (__v2df) __B,
-						    (__v2df) __W,
-						    (__mmask8) __U);
+  return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A,
+						 (__v4si) __B,
+						 (__v8si)
+						 _mm256_setzero_si256 (),
+						 (__mmask8) __U);
 }
 
-extern __inline __m128d
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_unpackhi_pd (__mmask8 __U, __m128d __A, __m128d __B)
+_mm256_mask_sll_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+		       __m128i __B)
 {
-  return (__m128d) __builtin_ia32_unpckhpd128_mask ((__v2df) __A,
-						    (__v2df) __B,
-						    (__v2df)
-						    _mm_setzero_pd (),
-						    (__mmask8) __U);
+  return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A,
+						 (__v2di) __B,
+						 (__v4di) __W,
+						 (__mmask8) __U);
 }
 
-extern __inline __m256
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_unpackhi_ps (__m256 __W, __mmask8 __U, __m256 __A,
-			 __m256 __B)
+_mm256_maskz_sll_epi64 (__mmask8 __U, __m256i __A, __m128i __B)
 {
-  return (__m256) __builtin_ia32_unpckhps256_mask ((__v8sf) __A,
-						   (__v8sf) __B,
-						   (__v8sf) __W,
-						   (__mmask8) __U);
+  return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A,
+						 (__v2di) __B,
+						 (__v4di)
+						 _mm256_setzero_si256 (),
+						 (__mmask8) __U);
 }
 
 extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_unpackhi_ps (__mmask8 __U, __m256 __A, __m256 __B)
+_mm256_mask_permutexvar_ps (__m256 __W, __mmask8 __U, __m256i __X,
+			    __m256 __Y)
 {
-  return (__m256) __builtin_ia32_unpckhps256_mask ((__v8sf) __A,
-						   (__v8sf) __B,
-						   (__v8sf)
-						   _mm256_setzero_ps (),
-						   (__mmask8) __U);
+  return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y,
+						    (__v8si) __X,
+						    (__v8sf) __W,
+						    (__mmask8) __U);
 }
 
-extern __inline __m128
+extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_unpackhi_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+_mm256_maskz_permutexvar_ps (__mmask8 __U, __m256i __X, __m256 __Y)
 {
-  return (__m128) __builtin_ia32_unpckhps128_mask ((__v4sf) __A,
-						   (__v4sf) __B,
-						   (__v4sf) __W,
-						   (__mmask8) __U);
+  return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y,
+						    (__v8si) __X,
+						    (__v8sf)
+						    _mm256_setzero_ps (),
+						    (__mmask8) __U);
 }
 
-extern __inline __m128
+extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_unpackhi_ps (__mmask8 __U, __m128 __A, __m128 __B)
+_mm256_permutexvar_pd (__m256i __X, __m256d __Y)
 {
-  return (__m128) __builtin_ia32_unpckhps128_mask ((__v4sf) __A,
-						   (__v4sf) __B,
-						   (__v4sf)
-						   _mm_setzero_ps (),
-						   (__mmask8) __U);
+  return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
+						     (__v4di) __X,
+						     (__v4df)
+						     _mm256_setzero_pd (),
+						     (__mmask8) -1);
 }
 
-extern __inline __m128
+extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_cvtph_ps (__m128 __W, __mmask8 __U, __m128i __A)
+_mm256_mask_permutexvar_pd (__m256d __W, __mmask8 __U, __m256i __X,
+			    __m256d __Y)
 {
-  return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A,
-						 (__v4sf) __W,
-						 (__mmask8) __U);
+  return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
+						     (__v4di) __X,
+						     (__v4df) __W,
+						     (__mmask8) __U);
 }
 
-extern __inline __m128
+extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_cvtph_ps (__mmask8 __U, __m128i __A)
+_mm256_maskz_permutexvar_pd (__mmask8 __U, __m256i __X, __m256d __Y)
 {
-  return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A,
-						 (__v4sf)
-						 _mm_setzero_ps (),
-						 (__mmask8) __U);
+  return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
+						     (__v4di) __X,
+						     (__v4df)
+						     _mm256_setzero_pd (),
+						     (__mmask8) __U);
 }
 
-extern __inline __m256
+extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_unpacklo_ps (__mmask8 __U, __m256 __A, __m256 __B)
+_mm256_mask_permutevar_pd (__m256d __W, __mmask8 __U, __m256d __A,
+			   __m256i __C)
 {
-  return (__m256) __builtin_ia32_unpcklps256_mask ((__v8sf) __A,
-						   (__v8sf) __B,
-						   (__v8sf)
-						   _mm256_setzero_ps (),
-						   (__mmask8) __U);
+  return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A,
+							(__v4di) __C,
+							(__v4df) __W,
+							(__mmask8)
+							__U);
 }
 
-extern __inline __m256
+extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cvtph_ps (__m256 __W, __mmask8 __U, __m128i __A)
+_mm256_maskz_permutevar_pd (__mmask8 __U, __m256d __A, __m256i __C)
 {
-  return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A,
-						    (__v8sf) __W,
-						    (__mmask8) __U);
+  return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A,
+							(__v4di) __C,
+							(__v4df)
+							_mm256_setzero_pd (),
+							(__mmask8)
+							__U);
 }
 
 extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A)
-{
-  return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A,
-						    (__v8sf)
-						    _mm256_setzero_ps (),
-						    (__mmask8) __U);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_unpacklo_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+_mm256_mask_permutevar_ps (__m256 __W, __mmask8 __U, __m256 __A,
+			   __m256i __C)
 {
-  return (__m128) __builtin_ia32_unpcklps128_mask ((__v4sf) __A,
-						   (__v4sf) __B,
-						   (__v4sf) __W,
-						   (__mmask8) __U);
+  return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A,
+						       (__v8si) __C,
+						       (__v8sf) __W,
+						       (__mmask8) __U);
 }
 
-extern __inline __m128
+extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_unpacklo_ps (__mmask8 __U, __m128 __A, __m128 __B)
+_mm256_maskz_permutevar_ps (__mmask8 __U, __m256 __A, __m256i __C)
 {
-  return (__m128) __builtin_ia32_unpcklps128_mask ((__v4sf) __A,
-						   (__v4sf) __B,
-						   (__v4sf)
-						   _mm_setzero_ps (),
-						   (__mmask8) __U);
+  return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A,
+						       (__v8si) __C,
+						       (__v8sf)
+						       _mm256_setzero_ps (),
+						       (__mmask8) __U);
 }
 
-extern __inline __m256i
+extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_sra_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
-		       __m128i __B)
+_mm_mask_permutevar_pd (__m128d __W, __mmask8 __U, __m128d __A,
+			__m128i __C)
 {
-  return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A,
-						 (__v4si) __B,
-						 (__v8si) __W,
-						 (__mmask8) __U);
+  return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A,
+						     (__v2di) __C,
+						     (__v2df) __W,
+						     (__mmask8) __U);
 }
 
-extern __inline __m256i
+extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_sra_epi32 (__mmask8 __U, __m256i __A, __m128i __B)
+_mm_maskz_permutevar_pd (__mmask8 __U, __m128d __A, __m128i __C)
 {
-  return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A,
-						 (__v4si) __B,
-						 (__v8si)
-						 _mm256_setzero_si256 (),
-						 (__mmask8) __U);
+  return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A,
+						     (__v2di) __C,
+						     (__v2df)
+						     _mm_setzero_pd (),
+						     (__mmask8) __U);
 }
 
-extern __inline __m128i
+extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_sra_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
-		    __m128i __B)
+_mm_mask_permutevar_ps (__m128 __W, __mmask8 __U, __m128 __A,
+			__m128i __C)
 {
-  return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A,
-						 (__v4si) __B,
-						 (__v4si) __W,
-						 (__mmask8) __U);
+  return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A,
+						    (__v4si) __C,
+						    (__v4sf) __W,
+						    (__mmask8) __U);
 }
 
-extern __inline __m128i
+extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_sra_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_permutevar_ps (__mmask8 __U, __m128 __A, __m128i __C)
 {
-  return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A,
-						 (__v4si) __B,
-						 (__v4si)
-						 _mm_setzero_si128 (),
-						 (__mmask8) __U);
+  return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A,
+						    (__v4si) __C,
+						    (__v4sf)
+						    _mm_setzero_ps (),
+						    (__mmask8) __U);
 }
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_sra_epi64 (__m256i __A, __m128i __B)
+_mm256_maskz_mullo_epi32 (__mmask8 __M, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
-						 (__v2di) __B,
-						 (__v4di)
-						 _mm256_setzero_si256 (),
-						 (__mmask8) -1);
+  return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A,
+						  (__v8si) __B,
+						  (__v8si)
+						  _mm256_setzero_si256 (),
+						  __M);
 }
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_sra_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
-		       __m128i __B)
+_mm256_maskz_permutexvar_epi64 (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
-						 (__v2di) __B,
-						 (__v4di) __W,
-						 (__mmask8) __U);
+  return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
+						     (__v4di) __X,
+						     (__v4di)
+						     _mm256_setzero_si256 (),
+						     __M);
 }
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_sra_epi64 (__mmask8 __U, __m256i __A, __m128i __B)
+_mm256_mask_mullo_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
+			 __m256i __B)
 {
-  return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
-						 (__v2di) __B,
-						 (__v4di)
-						 _mm256_setzero_si256 (),
-						 (__mmask8) __U);
+  return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A,
+						  (__v8si) __B,
+						  (__v8si) __W, __M);
 }
 
 extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sra_epi64 (__m128i __A, __m128i __B)
+_mm_maskz_mullo_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
-						 (__v2di) __B,
-						 (__v2di)
-						 _mm_setzero_si128 (),
-						 (__mmask8) -1);
+  return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A,
+						  (__v4si) __B,
+						  (__v4si)
+						  _mm_setzero_si128 (),
+						  __M);
 }
 
 extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_sra_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
-		    __m128i __B)
+_mm_mask_mullo_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
+		      __m128i __B)
 {
-  return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
-						 (__v2di) __B,
-						 (__v2di) __W,
-						 (__mmask8) __U);
+  return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A,
+						  (__v4si) __B,
+						  (__v4si) __W, __M);
 }
 
-extern __inline __m128i
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_sra_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm256_mask_mul_epi32 (__m256i __W, __mmask8 __M, __m256i __X,
+		       __m256i __Y)
 {
-  return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
-						 (__v2di) __B,
-						 (__v2di)
-						 _mm_setzero_si128 (),
-						 (__mmask8) __U);
+  return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X,
+						  (__v8si) __Y,
+						  (__v4di) __W, __M);
 }
 
-extern __inline __m128i
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_sll_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
-		    __m128i __B)
+_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A,
-						 (__v4si) __B,
-						 (__v4si) __W,
-						 (__mmask8) __U);
+  return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X,
+						  (__v8si) __Y,
+						  (__v4di)
+						  _mm256_setzero_si256 (),
+						  __M);
 }
 
 extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_sll_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X,
+		    __m128i __Y)
 {
-  return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A,
-						 (__v4si) __B,
-						 (__v4si)
-						 _mm_setzero_si128 (),
-						 (__mmask8) __U);
+  return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X,
+						  (__v4si) __Y,
+						  (__v2di) __W, __M);
 }
 
 extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_sll_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
-		    __m128i __B)
+_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, __m128i __Y)
 {
-  return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A,
-						 (__v2di) __B,
-						 (__v2di) __W,
-						 (__mmask8) __U);
+  return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X,
+						  (__v4si) __Y,
+						  (__v2di)
+						  _mm_setzero_si128 (),
+						  __M);
 }
 
-extern __inline __m128i
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_sll_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm256_permutexvar_epi64 (__m256i __X, __m256i __Y)
 {
-  return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A,
-						 (__v2di) __B,
-						 (__v2di)
-						 _mm_setzero_si128 (),
-						 (__mmask8) __U);
+  return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
+						     (__v4di) __X,
+						     (__v4di)
+						     _mm256_setzero_si256 (),
+						     (__mmask8) -1);
 }
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_sll_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
-		       __m128i __B)
+_mm256_mask_permutexvar_epi64 (__m256i __W, __mmask8 __M, __m256i __X,
+			       __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A,
-						 (__v4si) __B,
-						 (__v8si) __W,
-						 (__mmask8) __U);
+  return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
+						     (__v4di) __X,
+						     (__v4di) __W,
+						     __M);
 }
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_sll_epi32 (__mmask8 __U, __m256i __A, __m128i __B)
+_mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X,
+		       __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A,
-						 (__v4si) __B,
-						 (__v8si)
-						 _mm256_setzero_si256 (),
-						 (__mmask8) __U);
+  return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X,
+						   (__v8si) __Y,
+						   (__v4di) __W, __M);
 }
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_sll_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
-		       __m128i __B)
+_mm256_maskz_permutexvar_epi32 (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A,
-						 (__v2di) __B,
-						 (__v4di) __W,
-						 (__mmask8) __U);
+  return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
+						     (__v8si) __X,
+						     (__v8si)
+						     _mm256_setzero_si256 (),
+						     __M);
 }
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_sll_epi64 (__mmask8 __U, __m256i __A, __m128i __B)
+_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A,
-						 (__v2di) __B,
-						 (__v4di)
-						 _mm256_setzero_si256 (),
-						 (__mmask8) __U);
+  return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X,
+						   (__v8si) __Y,
+						   (__v4di)
+						   _mm256_setzero_si256 (),
+						   __M);
 }
 
-extern __inline __m256
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_permutexvar_ps (__m256 __W, __mmask8 __U, __m256i __X,
-			    __m256 __Y)
+_mm_mask_mul_epu32 (__m128i __W, __mmask8 __M, __m128i __X,
+		    __m128i __Y)
 {
-  return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y,
-						    (__v8si) __X,
-						    (__v8sf) __W,
-						    (__mmask8) __U);
+  return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X,
+						   (__v4si) __Y,
+						   (__v2di) __W, __M);
 }
 
-extern __inline __m256
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_permutexvar_ps (__mmask8 __U, __m256i __X, __m256 __Y)
+_mm_maskz_mul_epu32 (__mmask8 __M, __m128i __X, __m128i __Y)
 {
-  return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y,
-						    (__v8si) __X,
-						    (__v8sf)
-						    _mm256_setzero_ps (),
-						    (__mmask8) __U);
+  return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X,
+						   (__v4si) __Y,
+						   (__v2di)
+						   _mm_setzero_si128 (),
+						   __M);
 }
 
-extern __inline __m256d
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_permutexvar_pd (__m256i __X, __m256d __Y)
+_mm256_permutexvar_epi32 (__m256i __X, __m256i __Y)
 {
-  return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
-						     (__v4di) __X,
-						     (__v4df)
-						     _mm256_setzero_pd (),
+  return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
+						     (__v8si) __X,
+						     (__v8si)
+						     _mm256_setzero_si256 (),
 						     (__mmask8) -1);
 }
 
-extern __inline __m256d
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_permutexvar_pd (__m256d __W, __mmask8 __U, __m256i __X,
-			    __m256d __Y)
+_mm256_mask_permutexvar_epi32 (__m256i __W, __mmask8 __M, __m256i __X,
+			       __m256i __Y)
 {
-  return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
-						     (__v4di) __X,
-						     (__v4df) __W,
-						     (__mmask8) __U);
+  return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
+						     (__v8si) __X,
+						     (__v8si) __W,
+						     __M);
 }
 
-extern __inline __m256d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_permutexvar_pd (__mmask8 __U, __m256i __X, __m256d __Y)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpneq_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
-						     (__v4di) __X,
-						     (__v4df)
-						     _mm256_setzero_pd (),
-						     (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+						  (__v8si) __Y, 4,
+						  (__mmask8) __M);
 }
 
-extern __inline __m256d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_permutevar_pd (__m256d __W, __mmask8 __U, __m256d __A,
-			   __m256i __C)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpneq_epu32_mask (__m256i __X, __m256i __Y)
 {
-  return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A,
-							(__v4di) __C,
-							(__v4df) __W,
-							(__mmask8)
-							__U);
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+						  (__v8si) __Y, 4,
+						  (__mmask8) -1);
 }
 
-extern __inline __m256d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_permutevar_pd (__mmask8 __U, __m256d __A, __m256i __C)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmplt_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A,
-							(__v4di) __C,
-							(__v4df)
-							_mm256_setzero_pd (),
-							(__mmask8)
-							__U);
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+						  (__v8si) __Y, 1,
+						  (__mmask8) __M);
 }
 
-extern __inline __m256
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_permutevar_ps (__m256 __W, __mmask8 __U, __m256 __A,
-			   __m256i __C)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmplt_epu32_mask (__m256i __X, __m256i __Y)
 {
-  return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A,
-						       (__v8si) __C,
-						       (__v8sf) __W,
-						       (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+						  (__v8si) __Y, 1,
+						  (__mmask8) -1);
 }
 
-extern __inline __m256
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_permutevar_ps (__mmask8 __U, __m256 __A, __m256i __C)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpge_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A,
-						       (__v8si) __C,
-						       (__v8sf)
-						       _mm256_setzero_ps (),
-						       (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+						  (__v8si) __Y, 5,
+						  (__mmask8) __M);
 }
 
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_permutevar_pd (__m128d __W, __mmask8 __U, __m128d __A,
-			__m128i __C)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpge_epu32_mask (__m256i __X, __m256i __Y)
 {
-  return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A,
-						     (__v2di) __C,
-						     (__v2df) __W,
-						     (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+						  (__v8si) __Y, 5,
+						  (__mmask8) -1);
 }
 
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_permutevar_pd (__mmask8 __U, __m128d __A, __m128i __C)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmple_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A,
-						     (__v2di) __C,
-						     (__v2df)
-						     _mm_setzero_pd (),
-						     (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+						  (__v8si) __Y, 2,
+						  (__mmask8) __M);
 }
 
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_permutevar_ps (__m128 __W, __mmask8 __U, __m128 __A,
-			__m128i __C)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmple_epu32_mask (__m256i __X, __m256i __Y)
 {
-  return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A,
-						    (__v4si) __C,
-						    (__v4sf) __W,
-						    (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+						  (__v8si) __Y, 2,
+						  (__mmask8) -1);
 }
 
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_permutevar_ps (__mmask8 __U, __m128 __A, __m128i __C)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpneq_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A,
-						    (__v4si) __C,
-						    (__v4sf)
-						    _mm_setzero_ps (),
-						    (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+						  (__v4di) __Y, 4,
+						  (__mmask8) __M);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_mullo_epi32 (__mmask8 __M, __m256i __A, __m256i __B)
-{
-  return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A,
-						  (__v8si) __B,
-						  (__v8si)
-						  _mm256_setzero_si256 (),
-						  __M);
-}
-
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_permutexvar_epi64 (__mmask8 __M, __m256i __X, __m256i __Y)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpneq_epu64_mask (__m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
-						     (__v4di) __X,
-						     (__v4di)
-						     _mm256_setzero_si256 (),
-						     __M);
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+						  (__v4di) __Y, 4,
+						  (__mmask8) -1);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_mullo_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
-			 __m256i __B)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmplt_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A,
-						  (__v8si) __B,
-						  (__v8si) __W, __M);
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+						  (__v4di) __Y, 1,
+						  (__mmask8) __M);
 }
 
-extern __inline __m128i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_mullo_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmplt_epu64_mask (__m256i __X, __m256i __Y)
 {
-  return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A,
-						  (__v4si) __B,
-						  (__v4si)
-						  _mm_setzero_si128 (),
-						  __M);
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+						  (__v4di) __Y, 1,
+						  (__mmask8) -1);
 }
 
-extern __inline __m128i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_mullo_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
-		      __m128i __B)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpge_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A,
-						  (__v4si) __B,
-						  (__v4si) __W, __M);
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+						  (__v4di) __Y, 5,
+						  (__mmask8) __M);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_mul_epi32 (__m256i __W, __mmask8 __M, __m256i __X,
-		       __m256i __Y)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpge_epu64_mask (__m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X,
-						  (__v8si) __Y,
-						  (__v4di) __W, __M);
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+						  (__v4di) __Y, 5,
+						  (__mmask8) -1);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmple_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X,
-						  (__v8si) __Y,
-						  (__v4di)
-						  _mm256_setzero_si256 (),
-						  __M);
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+						  (__v4di) __Y, 2,
+						  (__mmask8) __M);
 }
 
-extern __inline __m128i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X,
-		    __m128i __Y)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmple_epu64_mask (__m256i __X, __m256i __Y)
 {
-  return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X,
-						  (__v4si) __Y,
-						  (__v2di) __W, __M);
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+						  (__v4di) __Y, 2,
+						  (__mmask8) -1);
 }
 
-extern __inline __m128i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, __m128i __Y)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpneq_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X,
-						  (__v4si) __Y,
-						  (__v2di)
-						  _mm_setzero_si128 (),
-						  __M);
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+						 (__v8si) __Y, 4,
+						 (__mmask8) __M);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_permutexvar_epi64 (__m256i __X, __m256i __Y)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpneq_epi32_mask (__m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
-						     (__v4di) __X,
-						     (__v4di)
-						     _mm256_setzero_si256 (),
-						     (__mmask8) -1);
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+						 (__v8si) __Y, 4,
+						 (__mmask8) -1);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_permutexvar_epi64 (__m256i __W, __mmask8 __M, __m256i __X,
-			       __m256i __Y)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmplt_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
-						     (__v4di) __X,
-						     (__v4di) __W,
-						     __M);
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+						 (__v8si) __Y, 1,
+						 (__mmask8) __M);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X,
-		       __m256i __Y)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmplt_epi32_mask (__m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X,
-						   (__v8si) __Y,
-						   (__v4di) __W, __M);
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+						 (__v8si) __Y, 1,
+						 (__mmask8) -1);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_permutexvar_epi32 (__mmask8 __M, __m256i __X, __m256i __Y)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpge_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
-						     (__v8si) __X,
-						     (__v8si)
-						     _mm256_setzero_si256 (),
-						     __M);
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+						 (__v8si) __Y, 5,
+						 (__mmask8) __M);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpge_epi32_mask (__m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X,
-						   (__v8si) __Y,
-						   (__v4di)
-						   _mm256_setzero_si256 (),
-						   __M);
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+						 (__v8si) __Y, 5,
+						 (__mmask8) -1);
 }
 
-extern __inline __m128i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_mul_epu32 (__m128i __W, __mmask8 __M, __m128i __X,
-		    __m128i __Y)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmple_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X,
-						   (__v4si) __Y,
-						   (__v2di) __W, __M);
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+						 (__v8si) __Y, 2,
+						 (__mmask8) __M);
 }
 
-extern __inline __m128i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_mul_epu32 (__mmask8 __M, __m128i __X, __m128i __Y)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmple_epi32_mask (__m256i __X, __m256i __Y)
 {
-  return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X,
-						   (__v4si) __Y,
-						   (__v2di)
-						   _mm_setzero_si128 (),
-						   __M);
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+						 (__v8si) __Y, 2,
+						 (__mmask8) -1);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_permutexvar_epi32 (__m256i __X, __m256i __Y)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpneq_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
-						     (__v8si) __X,
-						     (__v8si)
-						     _mm256_setzero_si256 (),
-						     (__mmask8) -1);
+  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+						 (__v4di) __Y, 4,
+						 (__mmask8) __M);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_permutexvar_epi32 (__m256i __W, __mmask8 __M, __m256i __X,
-			       __m256i __Y)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpneq_epi64_mask (__m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
-						     (__v8si) __X,
-						     (__v8si) __W,
-						     __M);
+  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+						 (__v4di) __Y, 4,
+						 (__mmask8) -1);
 }
 
 extern __inline __mmask8
   __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmpneq_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+_mm256_mask_cmplt_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
-						  (__v8si) __Y, 4,
-						  (__mmask8) __M);
+  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+						 (__v4di) __Y, 1,
+						 (__mmask8) __M);
 }
 
 extern __inline __mmask8
   __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmpneq_epu32_mask (__m256i __X, __m256i __Y)
+_mm256_cmplt_epi64_mask (__m256i __X, __m256i __Y)
 {
-  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
-						  (__v8si) __Y, 4,
-						  (__mmask8) -1);
-}
-
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmplt_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
-						  (__v8si) __Y, 1,
-						  (__mmask8) __M);
-}
-
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmplt_epu32_mask (__m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
-						  (__v8si) __Y, 1,
-						  (__mmask8) -1);
-}
-
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmpge_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
-						  (__v8si) __Y, 5,
-						  (__mmask8) __M);
-}
-
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmpge_epu32_mask (__m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
-						  (__v8si) __Y, 5,
-						  (__mmask8) -1);
-}
-
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmple_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
-						  (__v8si) __Y, 2,
-						  (__mmask8) __M);
-}
-
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmple_epu32_mask (__m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
-						  (__v8si) __Y, 2,
-						  (__mmask8) -1);
-}
-
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmpneq_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
-						  (__v4di) __Y, 4,
-						  (__mmask8) __M);
-}
-
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmpneq_epu64_mask (__m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
-						  (__v4di) __Y, 4,
-						  (__mmask8) -1);
-}
-
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmplt_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
-						  (__v4di) __Y, 1,
-						  (__mmask8) __M);
-}
-
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmplt_epu64_mask (__m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
-						  (__v4di) __Y, 1,
-						  (__mmask8) -1);
-}
-
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmpge_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
-						  (__v4di) __Y, 5,
-						  (__mmask8) __M);
-}
-
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmpge_epu64_mask (__m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
-						  (__v4di) __Y, 5,
-						  (__mmask8) -1);
-}
-
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmple_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
-						  (__v4di) __Y, 2,
-						  (__mmask8) __M);
-}
-
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmple_epu64_mask (__m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
-						  (__v4di) __Y, 2,
-						  (__mmask8) -1);
-}
-
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmpneq_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
-						 (__v8si) __Y, 4,
-						 (__mmask8) __M);
-}
-
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmpneq_epi32_mask (__m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
-						 (__v8si) __Y, 4,
-						 (__mmask8) -1);
-}
-
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmplt_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
-						 (__v8si) __Y, 1,
-						 (__mmask8) __M);
-}
-
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmplt_epi32_mask (__m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
-						 (__v8si) __Y, 1,
-						 (__mmask8) -1);
-}
-
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmpge_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
-						 (__v8si) __Y, 5,
-						 (__mmask8) __M);
-}
-
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmpge_epi32_mask (__m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
-						 (__v8si) __Y, 5,
-						 (__mmask8) -1);
-}
-
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmple_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
-						 (__v8si) __Y, 2,
-						 (__mmask8) __M);
-}
-
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmple_epi32_mask (__m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
-						 (__v8si) __Y, 2,
-						 (__mmask8) -1);
-}
-
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmpneq_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
-						 (__v4di) __Y, 4,
-						 (__mmask8) __M);
-}
-
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmpneq_epi64_mask (__m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
-						 (__v4di) __Y, 4,
-						 (__mmask8) -1);
-}
-
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmplt_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
-						 (__v4di) __Y, 1,
-						 (__mmask8) __M);
-}
-
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmplt_epi64_mask (__m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
-						 (__v4di) __Y, 1,
-						 (__mmask8) -1);
+  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+						 (__v4di) __Y, 1,
+						 (__mmask8) -1);
 }
 
 extern __inline __mmask8
@@ -13861,4 +13583,282 @@ _mm256_permutex_pd (__m256d __X, const int __M)
 #pragma GCC pop_options
 #endif /* __DISABLE_AVX512VL__ */
 
+#if !defined (__AVX512CD__) || !defined (__AVX512VL__)
+#pragma GCC push_options
+#pragma GCC target("avx512vl,avx512cd")
+#define __DISABLE_AVX512VLCD__
+#endif
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastmb_epi64 (__mmask8 __A)
+{
+  return (__m128i) __builtin_ia32_broadcastmb128 (__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastmb_epi64 (__mmask8 __A)
+{
+  return (__m256i) __builtin_ia32_broadcastmb256 (__A);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastmw_epi32 (__mmask16 __A)
+{
+  return (__m128i) __builtin_ia32_broadcastmw128 (__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastmw_epi32 (__mmask16 __A)
+{
+  return (__m256i) __builtin_ia32_broadcastmw256 (__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_lzcnt_epi32 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
+						     (__v8si)
+						     _mm256_setzero_si256 (),
+						     (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
+						     (__v8si) __W,
+						     (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
+						     (__v8si)
+						     _mm256_setzero_si256 (),
+						     (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_lzcnt_epi64 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
+						     (__v4di)
+						     _mm256_setzero_si256 (),
+						     (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
+						     (__v4di) __W,
+						     (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
+						     (__v4di)
+						     _mm256_setzero_si256 (),
+						     (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_conflict_epi64 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
+							 (__v4di)
+							 _mm256_setzero_si256 (),
+							 (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
+							 (__v4di) __W,
+							 (__mmask8)
+							 __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
+							 (__v4di)
+							 _mm256_setzero_si256 (),
+							 (__mmask8)
+							 __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_conflict_epi32 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
+							 (__v8si)
+							 _mm256_setzero_si256 (),
+							 (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
+							 (__v8si) __W,
+							 (__mmask8)
+							 __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
+							 (__v8si)
+							 _mm256_setzero_si256 (),
+							 (__mmask8)
+							 __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_lzcnt_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
+						     (__v4si)
+						     _mm_setzero_si128 (),
+						     (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
+						     (__v4si) __W,
+						     (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
+						     (__v4si)
+						     _mm_setzero_si128 (),
+						     (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_lzcnt_epi64 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
+						     (__v2di)
+						     _mm_setzero_si128 (),
+						     (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
+						     (__v2di) __W,
+						     (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
+						     (__v2di)
+						     _mm_setzero_si128 (),
+						     (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_conflict_epi64 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
+							 (__v2di)
+							 _mm_setzero_si128 (),
+							 (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
+							 (__v2di) __W,
+							 (__mmask8)
+							 __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
+							 (__v2di)
+							 _mm_setzero_si128 (),
+							 (__mmask8)
+							 __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_conflict_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
+							 (__v4si)
+							 _mm_setzero_si128 (),
+							 (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
+							 (__v4si) __W,
+							 (__mmask8)
+							 __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
+							 (__v4si)
+							 _mm_setzero_si128 (),
+							 (__mmask8)
+							 __U);
+}
+
+#ifdef __DISABLE_AVX512VLCD__
+#pragma GCC pop_options
+#endif
+
 #endif /* _AVX512VLINTRIN_H_INCLUDED */
diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index b90d5ccc969..19fa5c107c7 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -1615,8 +1615,8 @@ BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_ktestqi, "__builtin_ia32_ktestcqi",
 BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_ktestqi, "__builtin_ia32_ktestzqi", IX86_BUILTIN_KTESTZ8, UNKNOWN, (int) UQI_FTYPE_UQI_UQI)
 BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_ktesthi, "__builtin_ia32_ktestchi", IX86_BUILTIN_KTESTC16, UNKNOWN, (int) UHI_FTYPE_UHI_UHI)
 BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_ktesthi, "__builtin_ia32_ktestzhi", IX86_BUILTIN_KTESTZ16, UNKNOWN, (int) UHI_FTYPE_UHI_UHI)
-BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, CODE_FOR_ktestsi, "__builtin_ia32_ktestcsi", IX86_BUILTIN_KTESTC32, UNKNOWN, (int) USI_FTYPE_USI_USI)
-BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, CODE_FOR_ktestsi, "__builtin_ia32_ktestzsi", IX86_BUILTIN_KTESTZ32, UNKNOWN, (int) USI_FTYPE_USI_USI)
+BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_ktestsi, "__builtin_ia32_ktestcsi", IX86_BUILTIN_KTESTC32, UNKNOWN, (int) USI_FTYPE_USI_USI)
+BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_ktestsi, "__builtin_ia32_ktestzsi", IX86_BUILTIN_KTESTZ32, UNKNOWN, (int) USI_FTYPE_USI_USI)
 BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, CODE_FOR_ktestdi, "__builtin_ia32_ktestcdi", IX86_BUILTIN_KTESTC64, UNKNOWN, (int) UDI_FTYPE_UDI_UDI)
 BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, CODE_FOR_ktestdi, "__builtin_ia32_ktestzdi", IX86_BUILTIN_KTESTZ64, UNKNOWN, (int) UDI_FTYPE_UDI_UDI)
 BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_kortestqi, "__builtin_ia32_kortestcqi", IX86_BUILTIN_KORTESTC8, UNKNOWN, (int) UQI_FTYPE_UQI_UQI)
-- 
2.31.1


^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH] Fix incorrect option mask and avx512cd target push
  2023-10-30  7:44 [PATCH] Fix incorrect option mask and avx512cd target push Haochen Jiang
@ 2023-10-30  7:56 ` Hongtao Liu
  0 siblings, 0 replies; 2+ messages in thread
From: Hongtao Liu @ 2023-10-30  7:56 UTC (permalink / raw)
  To: Haochen Jiang; +Cc: gcc-patches, ubizjak, hongtao.liu

On Mon, Oct 30, 2023 at 3:47 PM Haochen Jiang <haochen.jiang@intel.com> wrote:
>
> Hi all,
>
> This patch fixed two obvious bugs in the current evex512 implementation.
>
> Also, I moved the AVX512CD+AVX512VL part out of the AVX512VL section to
> avoid accidentally missing its handling in avx512cd in the future.
>
> Ok for trunk?
Ok.
>
> BRs,
> Haochen
>
> gcc/ChangeLog:
>
>         * config/i386/avx512cdintrin.h (target): Push evex512 for
>         avx512cd.
>         * config/i386/avx512vlintrin.h (target): Split avx512cdvl part
>         out from avx512vl.
>         * config/i386/i386-builtin.def (BDESC): Do not check evex512
>         for builtins not needed.
> ---
>  gcc/config/i386/avx512cdintrin.h |    2 +-
>  gcc/config/i386/avx512vlintrin.h | 1792 +++++++++++++++---------------
>  gcc/config/i386/i386-builtin.def |    4 +-
>  3 files changed, 899 insertions(+), 899 deletions(-)
>
> diff --git a/gcc/config/i386/avx512cdintrin.h b/gcc/config/i386/avx512cdintrin.h
> index a5f5eabb68d..56a786aa9a3 100644
> --- a/gcc/config/i386/avx512cdintrin.h
> +++ b/gcc/config/i386/avx512cdintrin.h
> @@ -30,7 +30,7 @@
>
>  #ifndef __AVX512CD__
>  #pragma GCC push_options
> -#pragma GCC target("avx512cd")
> +#pragma GCC target("avx512cd,evex512")
>  #define __DISABLE_AVX512CD__
>  #endif /* __AVX512CD__ */
>
> diff --git a/gcc/config/i386/avx512vlintrin.h b/gcc/config/i386/avx512vlintrin.h
> index 08e49e8d8ab..a40aa91b948 100644
> --- a/gcc/config/i386/avx512vlintrin.h
> +++ b/gcc/config/i386/avx512vlintrin.h
> @@ -8396,1281 +8396,1003 @@ _mm_mask_min_epu32 (__m128i __W, __mmask8 __M, __m128i __A,
>                                                   (__v4si) __W, __M);
>  }
>
> -#ifndef __AVX512CD__
> -#pragma GCC push_options
> -#pragma GCC target("avx512vl,avx512cd")
> -#define __DISABLE_AVX512VLCD__
> -#endif
> -
> -extern __inline __m128i
> +extern __inline __m256d
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_broadcastmb_epi64 (__mmask8 __A)
> +_mm256_mask_unpacklo_pd (__m256d __W, __mmask8 __U, __m256d __A,
> +                        __m256d __B)
>  {
> -  return (__m128i) __builtin_ia32_broadcastmb128 (__A);
> +  return (__m256d) __builtin_ia32_unpcklpd256_mask ((__v4df) __A,
> +                                                   (__v4df) __B,
> +                                                   (__v4df) __W,
> +                                                   (__mmask8) __U);
>  }
>
> -extern __inline __m256i
> +extern __inline __m256d
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_broadcastmb_epi64 (__mmask8 __A)
> +_mm256_maskz_unpacklo_pd (__mmask8 __U, __m256d __A, __m256d __B)
>  {
> -  return (__m256i) __builtin_ia32_broadcastmb256 (__A);
> +  return (__m256d) __builtin_ia32_unpcklpd256_mask ((__v4df) __A,
> +                                                   (__v4df) __B,
> +                                                   (__v4df)
> +                                                   _mm256_setzero_pd (),
> +                                                   (__mmask8) __U);
>  }
>
> -extern __inline __m128i
> +extern __inline __m128d
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_broadcastmw_epi32 (__mmask16 __A)
> +_mm_mask_unpacklo_pd (__m128d __W, __mmask8 __U, __m128d __A,
> +                     __m128d __B)
>  {
> -  return (__m128i) __builtin_ia32_broadcastmw128 (__A);
> +  return (__m128d) __builtin_ia32_unpcklpd128_mask ((__v2df) __A,
> +                                                   (__v2df) __B,
> +                                                   (__v2df) __W,
> +                                                   (__mmask8) __U);
>  }
>
> -extern __inline __m256i
> +extern __inline __m128d
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_broadcastmw_epi32 (__mmask16 __A)
> +_mm_maskz_unpacklo_pd (__mmask8 __U, __m128d __A, __m128d __B)
>  {
> -  return (__m256i) __builtin_ia32_broadcastmw256 (__A);
> +  return (__m128d) __builtin_ia32_unpcklpd128_mask ((__v2df) __A,
> +                                                   (__v2df) __B,
> +                                                   (__v2df)
> +                                                   _mm_setzero_pd (),
> +                                                   (__mmask8) __U);
>  }
>
> -extern __inline __m256i
> +extern __inline __m256
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_lzcnt_epi32 (__m256i __A)
> +_mm256_mask_unpacklo_ps (__m256 __W, __mmask8 __U, __m256 __A,
> +                        __m256 __B)
>  {
> -  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
> -                                                    (__v8si)
> -                                                    _mm256_setzero_si256 (),
> -                                                    (__mmask8) -1);
> +  return (__m256) __builtin_ia32_unpcklps256_mask ((__v8sf) __A,
> +                                                  (__v8sf) __B,
> +                                                  (__v8sf) __W,
> +                                                  (__mmask8) __U);
>  }
>
> -extern __inline __m256i
> +extern __inline __m256d
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
> +_mm256_mask_unpackhi_pd (__m256d __W, __mmask8 __U, __m256d __A,
> +                        __m256d __B)
>  {
> -  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
> -                                                    (__v8si) __W,
> -                                                    (__mmask8) __U);
> +  return (__m256d) __builtin_ia32_unpckhpd256_mask ((__v4df) __A,
> +                                                   (__v4df) __B,
> +                                                   (__v4df) __W,
> +                                                   (__mmask8) __U);
>  }
>
> -extern __inline __m256i
> +extern __inline __m256d
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A)
> +_mm256_maskz_unpackhi_pd (__mmask8 __U, __m256d __A, __m256d __B)
>  {
> -  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
> -                                                    (__v8si)
> -                                                    _mm256_setzero_si256 (),
> -                                                    (__mmask8) __U);
> +  return (__m256d) __builtin_ia32_unpckhpd256_mask ((__v4df) __A,
> +                                                   (__v4df) __B,
> +                                                   (__v4df)
> +                                                   _mm256_setzero_pd (),
> +                                                   (__mmask8) __U);
>  }
>
> -extern __inline __m256i
> +extern __inline __m128d
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_lzcnt_epi64 (__m256i __A)
> +_mm_mask_unpackhi_pd (__m128d __W, __mmask8 __U, __m128d __A,
> +                     __m128d __B)
>  {
> -  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
> -                                                    (__v4di)
> -                                                    _mm256_setzero_si256 (),
> -                                                    (__mmask8) -1);
> +  return (__m128d) __builtin_ia32_unpckhpd128_mask ((__v2df) __A,
> +                                                   (__v2df) __B,
> +                                                   (__v2df) __W,
> +                                                   (__mmask8) __U);
>  }
>
> -extern __inline __m256i
> +extern __inline __m128d
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
> +_mm_maskz_unpackhi_pd (__mmask8 __U, __m128d __A, __m128d __B)
>  {
> -  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
> -                                                    (__v4di) __W,
> -                                                    (__mmask8) __U);
> +  return (__m128d) __builtin_ia32_unpckhpd128_mask ((__v2df) __A,
> +                                                   (__v2df) __B,
> +                                                   (__v2df)
> +                                                   _mm_setzero_pd (),
> +                                                   (__mmask8) __U);
>  }
>
> -extern __inline __m256i
> +extern __inline __m256
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A)
> +_mm256_mask_unpackhi_ps (__m256 __W, __mmask8 __U, __m256 __A,
> +                        __m256 __B)
>  {
> -  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
> -                                                    (__v4di)
> -                                                    _mm256_setzero_si256 (),
> -                                                    (__mmask8) __U);
> +  return (__m256) __builtin_ia32_unpckhps256_mask ((__v8sf) __A,
> +                                                  (__v8sf) __B,
> +                                                  (__v8sf) __W,
> +                                                  (__mmask8) __U);
>  }
>
> -extern __inline __m256i
> +extern __inline __m256
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_conflict_epi64 (__m256i __A)
> +_mm256_maskz_unpackhi_ps (__mmask8 __U, __m256 __A, __m256 __B)
>  {
> -  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
> -                                                        (__v4di)
> -                                                        _mm256_setzero_si256 (),
> -                                                        (__mmask8) -1);
> +  return (__m256) __builtin_ia32_unpckhps256_mask ((__v8sf) __A,
> +                                                  (__v8sf) __B,
> +                                                  (__v8sf)
> +                                                  _mm256_setzero_ps (),
> +                                                  (__mmask8) __U);
>  }
>
> -extern __inline __m256i
> +extern __inline __m128
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
> +_mm_mask_unpackhi_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
>  {
> -  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
> -                                                        (__v4di) __W,
> -                                                        (__mmask8)
> -                                                        __U);
> +  return (__m128) __builtin_ia32_unpckhps128_mask ((__v4sf) __A,
> +                                                  (__v4sf) __B,
> +                                                  (__v4sf) __W,
> +                                                  (__mmask8) __U);
>  }
>
> -extern __inline __m256i
> +extern __inline __m128
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
> +_mm_maskz_unpackhi_ps (__mmask8 __U, __m128 __A, __m128 __B)
>  {
> -  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
> -                                                        (__v4di)
> -                                                        _mm256_setzero_si256 (),
> -                                                        (__mmask8)
> -                                                        __U);
> +  return (__m128) __builtin_ia32_unpckhps128_mask ((__v4sf) __A,
> +                                                  (__v4sf) __B,
> +                                                  (__v4sf)
> +                                                  _mm_setzero_ps (),
> +                                                  (__mmask8) __U);
>  }
>
> -extern __inline __m256i
> +extern __inline __m128
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_conflict_epi32 (__m256i __A)
> +_mm_mask_cvtph_ps (__m128 __W, __mmask8 __U, __m128i __A)
>  {
> -  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
> -                                                        (__v8si)
> -                                                        _mm256_setzero_si256 (),
> -                                                        (__mmask8) -1);
> +  return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A,
> +                                                (__v4sf) __W,
> +                                                (__mmask8) __U);
>  }
>
> -extern __inline __m256i
> +extern __inline __m128
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
> +_mm_maskz_cvtph_ps (__mmask8 __U, __m128i __A)
>  {
> -  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
> -                                                        (__v8si) __W,
> -                                                        (__mmask8)
> -                                                        __U);
> +  return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A,
> +                                                (__v4sf)
> +                                                _mm_setzero_ps (),
> +                                                (__mmask8) __U);
>  }
>
> -extern __inline __m256i
> +extern __inline __m256
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
> +_mm256_maskz_unpacklo_ps (__mmask8 __U, __m256 __A, __m256 __B)
>  {
> -  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
> -                                                        (__v8si)
> -                                                        _mm256_setzero_si256 (),
> -                                                        (__mmask8)
> -                                                        __U);
> +  return (__m256) __builtin_ia32_unpcklps256_mask ((__v8sf) __A,
> +                                                  (__v8sf) __B,
> +                                                  (__v8sf)
> +                                                  _mm256_setzero_ps (),
> +                                                  (__mmask8) __U);
>  }
>
> -extern __inline __m128i
> +extern __inline __m256
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_lzcnt_epi32 (__m128i __A)
> +_mm256_mask_cvtph_ps (__m256 __W, __mmask8 __U, __m128i __A)
>  {
> -  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
> -                                                    (__v4si)
> -                                                    _mm_setzero_si128 (),
> -                                                    (__mmask8) -1);
> +  return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A,
> +                                                   (__v8sf) __W,
> +                                                   (__mmask8) __U);
>  }
>
> -extern __inline __m128i
> +extern __inline __m256
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
> +_mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A)
>  {
> -  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
> -                                                    (__v4si) __W,
> -                                                    (__mmask8) __U);
> +  return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A,
> +                                                   (__v8sf)
> +                                                   _mm256_setzero_ps (),
> +                                                   (__mmask8) __U);
>  }
>
> -extern __inline __m128i
> +extern __inline __m128
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A)
> +_mm_mask_unpacklo_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
>  {
> -  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
> -                                                    (__v4si)
> -                                                    _mm_setzero_si128 (),
> -                                                    (__mmask8) __U);
> +  return (__m128) __builtin_ia32_unpcklps128_mask ((__v4sf) __A,
> +                                                  (__v4sf) __B,
> +                                                  (__v4sf) __W,
> +                                                  (__mmask8) __U);
>  }
>
> -extern __inline __m128i
> +extern __inline __m128
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_lzcnt_epi64 (__m128i __A)
> +_mm_maskz_unpacklo_ps (__mmask8 __U, __m128 __A, __m128 __B)
>  {
> -  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
> -                                                    (__v2di)
> -                                                    _mm_setzero_si128 (),
> -                                                    (__mmask8) -1);
> +  return (__m128) __builtin_ia32_unpcklps128_mask ((__v4sf) __A,
> +                                                  (__v4sf) __B,
> +                                                  (__v4sf)
> +                                                  _mm_setzero_ps (),
> +                                                  (__mmask8) __U);
>  }
>
> -extern __inline __m128i
> +extern __inline __m256i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
> +_mm256_mask_sra_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
> +                      __m128i __B)
>  {
> -  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
> -                                                    (__v2di) __W,
> -                                                    (__mmask8) __U);
> +  return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A,
> +                                                (__v4si) __B,
> +                                                (__v8si) __W,
> +                                                (__mmask8) __U);
>  }
>
> -extern __inline __m128i
> +extern __inline __m256i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A)
> +_mm256_maskz_sra_epi32 (__mmask8 __U, __m256i __A, __m128i __B)
>  {
> -  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
> -                                                    (__v2di)
> -                                                    _mm_setzero_si128 (),
> -                                                    (__mmask8) __U);
> +  return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A,
> +                                                (__v4si) __B,
> +                                                (__v8si)
> +                                                _mm256_setzero_si256 (),
> +                                                (__mmask8) __U);
>  }
>
>  extern __inline __m128i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_conflict_epi64 (__m128i __A)
> +_mm_mask_sra_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
> +                   __m128i __B)
>  {
> -  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
> -                                                        (__v2di)
> -                                                        _mm_setzero_si128 (),
> -                                                        (__mmask8) -1);
> +  return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A,
> +                                                (__v4si) __B,
> +                                                (__v4si) __W,
> +                                                (__mmask8) __U);
>  }
>
>  extern __inline __m128i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
> +_mm_maskz_sra_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
>  {
> -  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
> -                                                        (__v2di) __W,
> -                                                        (__mmask8)
> -                                                        __U);
> +  return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A,
> +                                                (__v4si) __B,
> +                                                (__v4si)
> +                                                _mm_setzero_si128 (),
> +                                                (__mmask8) __U);
>  }
>
> -extern __inline __m128i
> +extern __inline __m256i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A)
> +_mm256_sra_epi64 (__m256i __A, __m128i __B)
>  {
> -  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
> -                                                        (__v2di)
> -                                                        _mm_setzero_si128 (),
> -                                                        (__mmask8)
> -                                                        __U);
> +  return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
> +                                                (__v2di) __B,
> +                                                (__v4di)
> +                                                _mm256_setzero_si256 (),
> +                                                (__mmask8) -1);
>  }
>
> -extern __inline __m128i
> +extern __inline __m256i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_conflict_epi32 (__m128i __A)
> +_mm256_mask_sra_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
> +                      __m128i __B)
>  {
> -  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
> -                                                        (__v4si)
> -                                                        _mm_setzero_si128 (),
> -                                                        (__mmask8) -1);
> +  return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
> +                                                (__v2di) __B,
> +                                                (__v4di) __W,
> +                                                (__mmask8) __U);
>  }
>
> -extern __inline __m128i
> +extern __inline __m256i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
> +_mm256_maskz_sra_epi64 (__mmask8 __U, __m256i __A, __m128i __B)
>  {
> -  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
> -                                                        (__v4si) __W,
> -                                                        (__mmask8)
> -                                                        __U);
> +  return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
> +                                                (__v2di) __B,
> +                                                (__v4di)
> +                                                _mm256_setzero_si256 (),
> +                                                (__mmask8) __U);
>  }
>
>  extern __inline __m128i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
> +_mm_sra_epi64 (__m128i __A, __m128i __B)
>  {
> -  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
> -                                                        (__v4si)
> -                                                        _mm_setzero_si128 (),
> -                                                        (__mmask8)
> -                                                        __U);
> +  return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
> +                                                (__v2di) __B,
> +                                                (__v2di)
> +                                                _mm_setzero_si128 (),
> +                                                (__mmask8) -1);
>  }
>
> -#ifdef __DISABLE_AVX512VLCD__
> -#pragma GCC pop_options
> -#endif
> -
> -extern __inline __m256d
> +extern __inline __m128i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_unpacklo_pd (__m256d __W, __mmask8 __U, __m256d __A,
> -                        __m256d __B)
> +_mm_mask_sra_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
> +                   __m128i __B)
>  {
> -  return (__m256d) __builtin_ia32_unpcklpd256_mask ((__v4df) __A,
> -                                                   (__v4df) __B,
> -                                                   (__v4df) __W,
> -                                                   (__mmask8) __U);
> +  return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
> +                                                (__v2di) __B,
> +                                                (__v2di) __W,
> +                                                (__mmask8) __U);
>  }
>
> -extern __inline __m256d
> +extern __inline __m128i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_maskz_unpacklo_pd (__mmask8 __U, __m256d __A, __m256d __B)
> +_mm_maskz_sra_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
>  {
> -  return (__m256d) __builtin_ia32_unpcklpd256_mask ((__v4df) __A,
> -                                                   (__v4df) __B,
> -                                                   (__v4df)
> -                                                   _mm256_setzero_pd (),
> -                                                   (__mmask8) __U);
> +  return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
> +                                                (__v2di) __B,
> +                                                (__v2di)
> +                                                _mm_setzero_si128 (),
> +                                                (__mmask8) __U);
>  }
>
> -extern __inline __m128d
> +extern __inline __m128i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_mask_unpacklo_pd (__m128d __W, __mmask8 __U, __m128d __A,
> -                     __m128d __B)
> +_mm_mask_sll_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
> +                   __m128i __B)
>  {
> -  return (__m128d) __builtin_ia32_unpcklpd128_mask ((__v2df) __A,
> -                                                   (__v2df) __B,
> -                                                   (__v2df) __W,
> -                                                   (__mmask8) __U);
> +  return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A,
> +                                                (__v4si) __B,
> +                                                (__v4si) __W,
> +                                                (__mmask8) __U);
>  }
>
> -extern __inline __m128d
> +extern __inline __m128i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_maskz_unpacklo_pd (__mmask8 __U, __m128d __A, __m128d __B)
> +_mm_maskz_sll_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
>  {
> -  return (__m128d) __builtin_ia32_unpcklpd128_mask ((__v2df) __A,
> -                                                   (__v2df) __B,
> -                                                   (__v2df)
> -                                                   _mm_setzero_pd (),
> -                                                   (__mmask8) __U);
> +  return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A,
> +                                                (__v4si) __B,
> +                                                (__v4si)
> +                                                _mm_setzero_si128 (),
> +                                                (__mmask8) __U);
>  }
>
> -extern __inline __m256
> +extern __inline __m128i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_unpacklo_ps (__m256 __W, __mmask8 __U, __m256 __A,
> -                        __m256 __B)
> +_mm_mask_sll_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
> +                   __m128i __B)
>  {
> -  return (__m256) __builtin_ia32_unpcklps256_mask ((__v8sf) __A,
> -                                                  (__v8sf) __B,
> -                                                  (__v8sf) __W,
> -                                                  (__mmask8) __U);
> +  return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A,
> +                                                (__v2di) __B,
> +                                                (__v2di) __W,
> +                                                (__mmask8) __U);
>  }
>
> -extern __inline __m256d
> +extern __inline __m128i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_unpackhi_pd (__m256d __W, __mmask8 __U, __m256d __A,
> -                        __m256d __B)
> +_mm_maskz_sll_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
>  {
> -  return (__m256d) __builtin_ia32_unpckhpd256_mask ((__v4df) __A,
> -                                                   (__v4df) __B,
> -                                                   (__v4df) __W,
> -                                                   (__mmask8) __U);
> +  return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A,
> +                                                (__v2di) __B,
> +                                                (__v2di)
> +                                                _mm_setzero_si128 (),
> +                                                (__mmask8) __U);
>  }
>
> -extern __inline __m256d
> +extern __inline __m256i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_maskz_unpackhi_pd (__mmask8 __U, __m256d __A, __m256d __B)
> +_mm256_mask_sll_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
> +                      __m128i __B)
>  {
> -  return (__m256d) __builtin_ia32_unpckhpd256_mask ((__v4df) __A,
> -                                                   (__v4df) __B,
> -                                                   (__v4df)
> -                                                   _mm256_setzero_pd (),
> -                                                   (__mmask8) __U);
> +  return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A,
> +                                                (__v4si) __B,
> +                                                (__v8si) __W,
> +                                                (__mmask8) __U);
>  }
>
> -extern __inline __m128d
> +extern __inline __m256i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_mask_unpackhi_pd (__m128d __W, __mmask8 __U, __m128d __A,
> -                     __m128d __B)
> +_mm256_maskz_sll_epi32 (__mmask8 __U, __m256i __A, __m128i __B)
>  {
> -  return (__m128d) __builtin_ia32_unpckhpd128_mask ((__v2df) __A,
> -                                                   (__v2df) __B,
> -                                                   (__v2df) __W,
> -                                                   (__mmask8) __U);
> +  return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A,
> +                                                (__v4si) __B,
> +                                                (__v8si)
> +                                                _mm256_setzero_si256 (),
> +                                                (__mmask8) __U);
>  }
>
> -extern __inline __m128d
> +extern __inline __m256i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_maskz_unpackhi_pd (__mmask8 __U, __m128d __A, __m128d __B)
> +_mm256_mask_sll_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
> +                      __m128i __B)
>  {
> -  return (__m128d) __builtin_ia32_unpckhpd128_mask ((__v2df) __A,
> -                                                   (__v2df) __B,
> -                                                   (__v2df)
> -                                                   _mm_setzero_pd (),
> -                                                   (__mmask8) __U);
> +  return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A,
> +                                                (__v2di) __B,
> +                                                (__v4di) __W,
> +                                                (__mmask8) __U);
>  }
>
> -extern __inline __m256
> +extern __inline __m256i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_unpackhi_ps (__m256 __W, __mmask8 __U, __m256 __A,
> -                        __m256 __B)
> +_mm256_maskz_sll_epi64 (__mmask8 __U, __m256i __A, __m128i __B)
>  {
> -  return (__m256) __builtin_ia32_unpckhps256_mask ((__v8sf) __A,
> -                                                  (__v8sf) __B,
> -                                                  (__v8sf) __W,
> -                                                  (__mmask8) __U);
> +  return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A,
> +                                                (__v2di) __B,
> +                                                (__v4di)
> +                                                _mm256_setzero_si256 (),
> +                                                (__mmask8) __U);
>  }
>
>  extern __inline __m256
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_maskz_unpackhi_ps (__mmask8 __U, __m256 __A, __m256 __B)
> +_mm256_mask_permutexvar_ps (__m256 __W, __mmask8 __U, __m256i __X,
> +                           __m256 __Y)
>  {
> -  return (__m256) __builtin_ia32_unpckhps256_mask ((__v8sf) __A,
> -                                                  (__v8sf) __B,
> -                                                  (__v8sf)
> -                                                  _mm256_setzero_ps (),
> -                                                  (__mmask8) __U);
> +  return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y,
> +                                                   (__v8si) __X,
> +                                                   (__v8sf) __W,
> +                                                   (__mmask8) __U);
>  }
>
> -extern __inline __m128
> +extern __inline __m256
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_mask_unpackhi_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
> +_mm256_maskz_permutexvar_ps (__mmask8 __U, __m256i __X, __m256 __Y)
>  {
> -  return (__m128) __builtin_ia32_unpckhps128_mask ((__v4sf) __A,
> -                                                  (__v4sf) __B,
> -                                                  (__v4sf) __W,
> -                                                  (__mmask8) __U);
> +  return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y,
> +                                                   (__v8si) __X,
> +                                                   (__v8sf)
> +                                                   _mm256_setzero_ps (),
> +                                                   (__mmask8) __U);
>  }
>
> -extern __inline __m128
> +extern __inline __m256d
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_maskz_unpackhi_ps (__mmask8 __U, __m128 __A, __m128 __B)
> +_mm256_permutexvar_pd (__m256i __X, __m256d __Y)
>  {
> -  return (__m128) __builtin_ia32_unpckhps128_mask ((__v4sf) __A,
> -                                                  (__v4sf) __B,
> -                                                  (__v4sf)
> -                                                  _mm_setzero_ps (),
> -                                                  (__mmask8) __U);
> +  return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
> +                                                    (__v4di) __X,
> +                                                    (__v4df)
> +                                                    _mm256_setzero_pd (),
> +                                                    (__mmask8) -1);
>  }
>
> -extern __inline __m128
> +extern __inline __m256d
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_mask_cvtph_ps (__m128 __W, __mmask8 __U, __m128i __A)
> +_mm256_mask_permutexvar_pd (__m256d __W, __mmask8 __U, __m256i __X,
> +                           __m256d __Y)
>  {
> -  return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A,
> -                                                (__v4sf) __W,
> -                                                (__mmask8) __U);
> +  return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
> +                                                    (__v4di) __X,
> +                                                    (__v4df) __W,
> +                                                    (__mmask8) __U);
>  }
>
> -extern __inline __m128
> +extern __inline __m256d
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_maskz_cvtph_ps (__mmask8 __U, __m128i __A)
> +_mm256_maskz_permutexvar_pd (__mmask8 __U, __m256i __X, __m256d __Y)
>  {
> -  return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A,
> -                                                (__v4sf)
> -                                                _mm_setzero_ps (),
> -                                                (__mmask8) __U);
> +  return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
> +                                                    (__v4di) __X,
> +                                                    (__v4df)
> +                                                    _mm256_setzero_pd (),
> +                                                    (__mmask8) __U);
>  }
>
> -extern __inline __m256
> +extern __inline __m256d
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_maskz_unpacklo_ps (__mmask8 __U, __m256 __A, __m256 __B)
> +_mm256_mask_permutevar_pd (__m256d __W, __mmask8 __U, __m256d __A,
> +                          __m256i __C)
>  {
> -  return (__m256) __builtin_ia32_unpcklps256_mask ((__v8sf) __A,
> -                                                  (__v8sf) __B,
> -                                                  (__v8sf)
> -                                                  _mm256_setzero_ps (),
> -                                                  (__mmask8) __U);
> +  return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A,
> +                                                       (__v4di) __C,
> +                                                       (__v4df) __W,
> +                                                       (__mmask8)
> +                                                       __U);
>  }
>
> -extern __inline __m256
> +extern __inline __m256d
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_cvtph_ps (__m256 __W, __mmask8 __U, __m128i __A)
> +_mm256_maskz_permutevar_pd (__mmask8 __U, __m256d __A, __m256i __C)
>  {
> -  return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A,
> -                                                   (__v8sf) __W,
> -                                                   (__mmask8) __U);
> +  return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A,
> +                                                       (__v4di) __C,
> +                                                       (__v4df)
> +                                                       _mm256_setzero_pd (),
> +                                                       (__mmask8)
> +                                                       __U);
>  }
>
>  extern __inline __m256
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A)
> -{
> -  return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A,
> -                                                   (__v8sf)
> -                                                   _mm256_setzero_ps (),
> -                                                   (__mmask8) __U);
> -}
> -
> -extern __inline __m128
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_mask_unpacklo_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
> +_mm256_mask_permutevar_ps (__m256 __W, __mmask8 __U, __m256 __A,
> +                          __m256i __C)
>  {
> -  return (__m128) __builtin_ia32_unpcklps128_mask ((__v4sf) __A,
> -                                                  (__v4sf) __B,
> -                                                  (__v4sf) __W,
> -                                                  (__mmask8) __U);
> +  return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A,
> +                                                      (__v8si) __C,
> +                                                      (__v8sf) __W,
> +                                                      (__mmask8) __U);
>  }
>
> -extern __inline __m128
> +extern __inline __m256
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_maskz_unpacklo_ps (__mmask8 __U, __m128 __A, __m128 __B)
> +_mm256_maskz_permutevar_ps (__mmask8 __U, __m256 __A, __m256i __C)
>  {
> -  return (__m128) __builtin_ia32_unpcklps128_mask ((__v4sf) __A,
> -                                                  (__v4sf) __B,
> -                                                  (__v4sf)
> -                                                  _mm_setzero_ps (),
> -                                                  (__mmask8) __U);
> +  return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A,
> +                                                      (__v8si) __C,
> +                                                      (__v8sf)
> +                                                      _mm256_setzero_ps (),
> +                                                      (__mmask8) __U);
>  }
>
> -extern __inline __m256i
> +extern __inline __m128d
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_sra_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
> -                      __m128i __B)
> +_mm_mask_permutevar_pd (__m128d __W, __mmask8 __U, __m128d __A,
> +                       __m128i __C)
>  {
> -  return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A,
> -                                                (__v4si) __B,
> -                                                (__v8si) __W,
> -                                                (__mmask8) __U);
> +  return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A,
> +                                                    (__v2di) __C,
> +                                                    (__v2df) __W,
> +                                                    (__mmask8) __U);
>  }
>
> -extern __inline __m256i
> +extern __inline __m128d
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_maskz_sra_epi32 (__mmask8 __U, __m256i __A, __m128i __B)
> +_mm_maskz_permutevar_pd (__mmask8 __U, __m128d __A, __m128i __C)
>  {
> -  return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A,
> -                                                (__v4si) __B,
> -                                                (__v8si)
> -                                                _mm256_setzero_si256 (),
> -                                                (__mmask8) __U);
> +  return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A,
> +                                                    (__v2di) __C,
> +                                                    (__v2df)
> +                                                    _mm_setzero_pd (),
> +                                                    (__mmask8) __U);
>  }
>
> -extern __inline __m128i
> +extern __inline __m128
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_mask_sra_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
> -                   __m128i __B)
> +_mm_mask_permutevar_ps (__m128 __W, __mmask8 __U, __m128 __A,
> +                       __m128i __C)
>  {
> -  return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A,
> -                                                (__v4si) __B,
> -                                                (__v4si) __W,
> -                                                (__mmask8) __U);
> +  return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A,
> +                                                   (__v4si) __C,
> +                                                   (__v4sf) __W,
> +                                                   (__mmask8) __U);
>  }
>
> -extern __inline __m128i
> +extern __inline __m128
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_maskz_sra_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
> +_mm_maskz_permutevar_ps (__mmask8 __U, __m128 __A, __m128i __C)
>  {
> -  return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A,
> -                                                (__v4si) __B,
> -                                                (__v4si)
> -                                                _mm_setzero_si128 (),
> -                                                (__mmask8) __U);
> +  return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A,
> +                                                   (__v4si) __C,
> +                                                   (__v4sf)
> +                                                   _mm_setzero_ps (),
> +                                                   (__mmask8) __U);
>  }
>
>  extern __inline __m256i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_sra_epi64 (__m256i __A, __m128i __B)
> +_mm256_maskz_mullo_epi32 (__mmask8 __M, __m256i __A, __m256i __B)
>  {
> -  return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
> -                                                (__v2di) __B,
> -                                                (__v4di)
> -                                                _mm256_setzero_si256 (),
> -                                                (__mmask8) -1);
> +  return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A,
> +                                                 (__v8si) __B,
> +                                                 (__v8si)
> +                                                 _mm256_setzero_si256 (),
> +                                                 __M);
>  }
>
>  extern __inline __m256i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_sra_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
> -                      __m128i __B)
> +_mm256_maskz_permutexvar_epi64 (__mmask8 __M, __m256i __X, __m256i __Y)
>  {
> -  return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
> -                                                (__v2di) __B,
> -                                                (__v4di) __W,
> -                                                (__mmask8) __U);
> +  return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
> +                                                    (__v4di) __X,
> +                                                    (__v4di)
> +                                                    _mm256_setzero_si256 (),
> +                                                    __M);
>  }
>
>  extern __inline __m256i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_maskz_sra_epi64 (__mmask8 __U, __m256i __A, __m128i __B)
> +_mm256_mask_mullo_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
> +                        __m256i __B)
>  {
> -  return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
> -                                                (__v2di) __B,
> -                                                (__v4di)
> -                                                _mm256_setzero_si256 (),
> -                                                (__mmask8) __U);
> +  return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A,
> +                                                 (__v8si) __B,
> +                                                 (__v8si) __W, __M);
>  }
>
>  extern __inline __m128i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_sra_epi64 (__m128i __A, __m128i __B)
> +_mm_maskz_mullo_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
>  {
> -  return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
> -                                                (__v2di) __B,
> -                                                (__v2di)
> -                                                _mm_setzero_si128 (),
> -                                                (__mmask8) -1);
> +  return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A,
> +                                                 (__v4si) __B,
> +                                                 (__v4si)
> +                                                 _mm_setzero_si128 (),
> +                                                 __M);
>  }
>
>  extern __inline __m128i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_mask_sra_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
> -                   __m128i __B)
> +_mm_mask_mullo_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
> +                     __m128i __B)
>  {
> -  return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
> -                                                (__v2di) __B,
> -                                                (__v2di) __W,
> -                                                (__mmask8) __U);
> +  return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A,
> +                                                 (__v4si) __B,
> +                                                 (__v4si) __W, __M);
>  }
>
> -extern __inline __m128i
> +extern __inline __m256i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_maskz_sra_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
> +_mm256_mask_mul_epi32 (__m256i __W, __mmask8 __M, __m256i __X,
> +                      __m256i __Y)
>  {
> -  return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
> -                                                (__v2di) __B,
> -                                                (__v2di)
> -                                                _mm_setzero_si128 (),
> -                                                (__mmask8) __U);
> +  return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X,
> +                                                 (__v8si) __Y,
> +                                                 (__v4di) __W, __M);
>  }
>
> -extern __inline __m128i
> +extern __inline __m256i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_mask_sll_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
> -                   __m128i __B)
> +_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y)
>  {
> -  return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A,
> -                                                (__v4si) __B,
> -                                                (__v4si) __W,
> -                                                (__mmask8) __U);
> +  return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X,
> +                                                 (__v8si) __Y,
> +                                                 (__v4di)
> +                                                 _mm256_setzero_si256 (),
> +                                                 __M);
>  }
>
>  extern __inline __m128i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_maskz_sll_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
> +_mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X,
> +                   __m128i __Y)
>  {
> -  return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A,
> -                                                (__v4si) __B,
> -                                                (__v4si)
> -                                                _mm_setzero_si128 (),
> -                                                (__mmask8) __U);
> +  return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X,
> +                                                 (__v4si) __Y,
> +                                                 (__v2di) __W, __M);
>  }
>
>  extern __inline __m128i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_mask_sll_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
> -                   __m128i __B)
> +_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, __m128i __Y)
>  {
> -  return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A,
> -                                                (__v2di) __B,
> -                                                (__v2di) __W,
> -                                                (__mmask8) __U);
> +  return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X,
> +                                                 (__v4si) __Y,
> +                                                 (__v2di)
> +                                                 _mm_setzero_si128 (),
> +                                                 __M);
>  }
>
> -extern __inline __m128i
> +extern __inline __m256i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_maskz_sll_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
> +_mm256_permutexvar_epi64 (__m256i __X, __m256i __Y)
>  {
> -  return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A,
> -                                                (__v2di) __B,
> -                                                (__v2di)
> -                                                _mm_setzero_si128 (),
> -                                                (__mmask8) __U);
> +  return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
> +                                                    (__v4di) __X,
> +                                                    (__v4di)
> +                                                    _mm256_setzero_si256 (),
> +                                                    (__mmask8) -1);
>  }
>
>  extern __inline __m256i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_sll_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
> -                      __m128i __B)
> +_mm256_mask_permutexvar_epi64 (__m256i __W, __mmask8 __M, __m256i __X,
> +                              __m256i __Y)
>  {
> -  return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A,
> -                                                (__v4si) __B,
> -                                                (__v8si) __W,
> -                                                (__mmask8) __U);
> +  return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
> +                                                    (__v4di) __X,
> +                                                    (__v4di) __W,
> +                                                    __M);
>  }
>
>  extern __inline __m256i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_maskz_sll_epi32 (__mmask8 __U, __m256i __A, __m128i __B)
> +_mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X,
> +                      __m256i __Y)
>  {
> -  return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A,
> -                                                (__v4si) __B,
> -                                                (__v8si)
> -                                                _mm256_setzero_si256 (),
> -                                                (__mmask8) __U);
> +  return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X,
> +                                                  (__v8si) __Y,
> +                                                  (__v4di) __W, __M);
>  }
>
>  extern __inline __m256i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_sll_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
> -                      __m128i __B)
> +_mm256_maskz_permutexvar_epi32 (__mmask8 __M, __m256i __X, __m256i __Y)
>  {
> -  return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A,
> -                                                (__v2di) __B,
> -                                                (__v4di) __W,
> -                                                (__mmask8) __U);
> +  return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
> +                                                    (__v8si) __X,
> +                                                    (__v8si)
> +                                                    _mm256_setzero_si256 (),
> +                                                    __M);
>  }
>
>  extern __inline __m256i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_maskz_sll_epi64 (__mmask8 __U, __m256i __A, __m128i __B)
> +_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y)
>  {
> -  return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A,
> -                                                (__v2di) __B,
> -                                                (__v4di)
> -                                                _mm256_setzero_si256 (),
> -                                                (__mmask8) __U);
> +  return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X,
> +                                                  (__v8si) __Y,
> +                                                  (__v4di)
> +                                                  _mm256_setzero_si256 (),
> +                                                  __M);
>  }
>
> -extern __inline __m256
> +extern __inline __m128i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_permutexvar_ps (__m256 __W, __mmask8 __U, __m256i __X,
> -                           __m256 __Y)
> +_mm_mask_mul_epu32 (__m128i __W, __mmask8 __M, __m128i __X,
> +                   __m128i __Y)
>  {
> -  return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y,
> -                                                   (__v8si) __X,
> -                                                   (__v8sf) __W,
> -                                                   (__mmask8) __U);
> +  return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X,
> +                                                  (__v4si) __Y,
> +                                                  (__v2di) __W, __M);
>  }
>
> -extern __inline __m256
> +extern __inline __m128i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_maskz_permutexvar_ps (__mmask8 __U, __m256i __X, __m256 __Y)
> +_mm_maskz_mul_epu32 (__mmask8 __M, __m128i __X, __m128i __Y)
>  {
> -  return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y,
> -                                                   (__v8si) __X,
> -                                                   (__v8sf)
> -                                                   _mm256_setzero_ps (),
> -                                                   (__mmask8) __U);
> +  return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X,
> +                                                  (__v4si) __Y,
> +                                                  (__v2di)
> +                                                  _mm_setzero_si128 (),
> +                                                  __M);
>  }
>
> -extern __inline __m256d
> +extern __inline __m256i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_permutexvar_pd (__m256i __X, __m256d __Y)
> +_mm256_permutexvar_epi32 (__m256i __X, __m256i __Y)
>  {
> -  return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
> -                                                    (__v4di) __X,
> -                                                    (__v4df)
> -                                                    _mm256_setzero_pd (),
> +  return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
> +                                                    (__v8si) __X,
> +                                                    (__v8si)
> +                                                    _mm256_setzero_si256 (),
>                                                      (__mmask8) -1);
>  }
>
> -extern __inline __m256d
> +extern __inline __m256i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_permutexvar_pd (__m256d __W, __mmask8 __U, __m256i __X,
> -                           __m256d __Y)
> +_mm256_mask_permutexvar_epi32 (__m256i __W, __mmask8 __M, __m256i __X,
> +                              __m256i __Y)
>  {
> -  return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
> -                                                    (__v4di) __X,
> -                                                    (__v4df) __W,
> -                                                    (__mmask8) __U);
> +  return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
> +                                                    (__v8si) __X,
> +                                                    (__v8si) __W,
> +                                                    __M);
>  }
>
> -extern __inline __m256d
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_maskz_permutexvar_pd (__mmask8 __U, __m256i __X, __m256d __Y)
> +extern __inline __mmask8
> +  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_cmpneq_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
>  {
> -  return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
> -                                                    (__v4di) __X,
> -                                                    (__v4df)
> -                                                    _mm256_setzero_pd (),
> -                                                    (__mmask8) __U);
> +  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
> +                                                 (__v8si) __Y, 4,
> +                                                 (__mmask8) __M);
>  }
>
> -extern __inline __m256d
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_permutevar_pd (__m256d __W, __mmask8 __U, __m256d __A,
> -                          __m256i __C)
> +extern __inline __mmask8
> +  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_cmpneq_epu32_mask (__m256i __X, __m256i __Y)
>  {
> -  return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A,
> -                                                       (__v4di) __C,
> -                                                       (__v4df) __W,
> -                                                       (__mmask8)
> -                                                       __U);
> +  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
> +                                                 (__v8si) __Y, 4,
> +                                                 (__mmask8) -1);
>  }
>
> -extern __inline __m256d
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_maskz_permutevar_pd (__mmask8 __U, __m256d __A, __m256i __C)
> +extern __inline __mmask8
> +  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_cmplt_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
>  {
> -  return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A,
> -                                                       (__v4di) __C,
> -                                                       (__v4df)
> -                                                       _mm256_setzero_pd (),
> -                                                       (__mmask8)
> -                                                       __U);
> +  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
> +                                                 (__v8si) __Y, 1,
> +                                                 (__mmask8) __M);
>  }
>
> -extern __inline __m256
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_permutevar_ps (__m256 __W, __mmask8 __U, __m256 __A,
> -                          __m256i __C)
> +extern __inline __mmask8
> +  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_cmplt_epu32_mask (__m256i __X, __m256i __Y)
>  {
> -  return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A,
> -                                                      (__v8si) __C,
> -                                                      (__v8sf) __W,
> -                                                      (__mmask8) __U);
> +  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
> +                                                 (__v8si) __Y, 1,
> +                                                 (__mmask8) -1);
>  }
>
> -extern __inline __m256
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_maskz_permutevar_ps (__mmask8 __U, __m256 __A, __m256i __C)
> +extern __inline __mmask8
> +  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_cmpge_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
>  {
> -  return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A,
> -                                                      (__v8si) __C,
> -                                                      (__v8sf)
> -                                                      _mm256_setzero_ps (),
> -                                                      (__mmask8) __U);
> +  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
> +                                                 (__v8si) __Y, 5,
> +                                                 (__mmask8) __M);
>  }
>
> -extern __inline __m128d
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_mask_permutevar_pd (__m128d __W, __mmask8 __U, __m128d __A,
> -                       __m128i __C)
> +extern __inline __mmask8
> +  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_cmpge_epu32_mask (__m256i __X, __m256i __Y)
>  {
> -  return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A,
> -                                                    (__v2di) __C,
> -                                                    (__v2df) __W,
> -                                                    (__mmask8) __U);
> +  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
> +                                                 (__v8si) __Y, 5,
> +                                                 (__mmask8) -1);
>  }
>
> -extern __inline __m128d
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_maskz_permutevar_pd (__mmask8 __U, __m128d __A, __m128i __C)
> +extern __inline __mmask8
> +  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_cmple_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
>  {
> -  return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A,
> -                                                    (__v2di) __C,
> -                                                    (__v2df)
> -                                                    _mm_setzero_pd (),
> -                                                    (__mmask8) __U);
> +  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
> +                                                 (__v8si) __Y, 2,
> +                                                 (__mmask8) __M);
>  }
>
> -extern __inline __m128
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_mask_permutevar_ps (__m128 __W, __mmask8 __U, __m128 __A,
> -                       __m128i __C)
> +extern __inline __mmask8
> +  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_cmple_epu32_mask (__m256i __X, __m256i __Y)
>  {
> -  return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A,
> -                                                   (__v4si) __C,
> -                                                   (__v4sf) __W,
> -                                                   (__mmask8) __U);
> +  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
> +                                                 (__v8si) __Y, 2,
> +                                                 (__mmask8) -1);
>  }
>
> -extern __inline __m128
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_maskz_permutevar_ps (__mmask8 __U, __m128 __A, __m128i __C)
> +extern __inline __mmask8
> +  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_cmpneq_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
>  {
> -  return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A,
> -                                                   (__v4si) __C,
> -                                                   (__v4sf)
> -                                                   _mm_setzero_ps (),
> -                                                   (__mmask8) __U);
> +  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
> +                                                 (__v4di) __Y, 4,
> +                                                 (__mmask8) __M);
>  }
>
> -extern __inline __m256i
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_maskz_mullo_epi32 (__mmask8 __M, __m256i __A, __m256i __B)
> -{
> -  return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A,
> -                                                 (__v8si) __B,
> -                                                 (__v8si)
> -                                                 _mm256_setzero_si256 (),
> -                                                 __M);
> -}
> -
> -extern __inline __m256i
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_maskz_permutexvar_epi64 (__mmask8 __M, __m256i __X, __m256i __Y)
> +extern __inline __mmask8
> +  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_cmpneq_epu64_mask (__m256i __X, __m256i __Y)
>  {
> -  return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
> -                                                    (__v4di) __X,
> -                                                    (__v4di)
> -                                                    _mm256_setzero_si256 (),
> -                                                    __M);
> +  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
> +                                                 (__v4di) __Y, 4,
> +                                                 (__mmask8) -1);
>  }
>
> -extern __inline __m256i
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_mullo_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
> -                        __m256i __B)
> +extern __inline __mmask8
> +  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_cmplt_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
>  {
> -  return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A,
> -                                                 (__v8si) __B,
> -                                                 (__v8si) __W, __M);
> +  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
> +                                                 (__v4di) __Y, 1,
> +                                                 (__mmask8) __M);
>  }
>
> -extern __inline __m128i
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_maskz_mullo_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
> +extern __inline __mmask8
> +  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_cmplt_epu64_mask (__m256i __X, __m256i __Y)
>  {
> -  return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A,
> -                                                 (__v4si) __B,
> -                                                 (__v4si)
> -                                                 _mm_setzero_si128 (),
> -                                                 __M);
> +  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
> +                                                 (__v4di) __Y, 1,
> +                                                 (__mmask8) -1);
>  }
>
> -extern __inline __m128i
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_mask_mullo_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
> -                     __m128i __B)
> +extern __inline __mmask8
> +  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_cmpge_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
>  {
> -  return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A,
> -                                                 (__v4si) __B,
> -                                                 (__v4si) __W, __M);
> +  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
> +                                                 (__v4di) __Y, 5,
> +                                                 (__mmask8) __M);
>  }
>
> -extern __inline __m256i
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_mul_epi32 (__m256i __W, __mmask8 __M, __m256i __X,
> -                      __m256i __Y)
> +extern __inline __mmask8
> +  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_cmpge_epu64_mask (__m256i __X, __m256i __Y)
>  {
> -  return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X,
> -                                                 (__v8si) __Y,
> -                                                 (__v4di) __W, __M);
> +  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
> +                                                 (__v4di) __Y, 5,
> +                                                 (__mmask8) -1);
>  }
>
> -extern __inline __m256i
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y)
> +extern __inline __mmask8
> +  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_cmple_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
>  {
> -  return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X,
> -                                                 (__v8si) __Y,
> -                                                 (__v4di)
> -                                                 _mm256_setzero_si256 (),
> -                                                 __M);
> +  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
> +                                                 (__v4di) __Y, 2,
> +                                                 (__mmask8) __M);
>  }
>
> -extern __inline __m128i
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X,
> -                   __m128i __Y)
> +extern __inline __mmask8
> +  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_cmple_epu64_mask (__m256i __X, __m256i __Y)
>  {
> -  return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X,
> -                                                 (__v4si) __Y,
> -                                                 (__v2di) __W, __M);
> +  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
> +                                                 (__v4di) __Y, 2,
> +                                                 (__mmask8) -1);
>  }
>
> -extern __inline __m128i
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, __m128i __Y)
> +extern __inline __mmask8
> +  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_cmpneq_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
>  {
> -  return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X,
> -                                                 (__v4si) __Y,
> -                                                 (__v2di)
> -                                                 _mm_setzero_si128 (),
> -                                                 __M);
> +  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
> +                                                (__v8si) __Y, 4,
> +                                                (__mmask8) __M);
>  }
>
> -extern __inline __m256i
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_permutexvar_epi64 (__m256i __X, __m256i __Y)
> +extern __inline __mmask8
> +  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_cmpneq_epi32_mask (__m256i __X, __m256i __Y)
>  {
> -  return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
> -                                                    (__v4di) __X,
> -                                                    (__v4di)
> -                                                    _mm256_setzero_si256 (),
> -                                                    (__mmask8) -1);
> +  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
> +                                                (__v8si) __Y, 4,
> +                                                (__mmask8) -1);
>  }
>
> -extern __inline __m256i
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_permutexvar_epi64 (__m256i __W, __mmask8 __M, __m256i __X,
> -                              __m256i __Y)
> +extern __inline __mmask8
> +  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_cmplt_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
>  {
> -  return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
> -                                                    (__v4di) __X,
> -                                                    (__v4di) __W,
> -                                                    __M);
> +  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
> +                                                (__v8si) __Y, 1,
> +                                                (__mmask8) __M);
>  }
>
> -extern __inline __m256i
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X,
> -                      __m256i __Y)
> +extern __inline __mmask8
> +  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_cmplt_epi32_mask (__m256i __X, __m256i __Y)
>  {
> -  return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X,
> -                                                  (__v8si) __Y,
> -                                                  (__v4di) __W, __M);
> +  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
> +                                                (__v8si) __Y, 1,
> +                                                (__mmask8) -1);
>  }
>
> -extern __inline __m256i
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_maskz_permutexvar_epi32 (__mmask8 __M, __m256i __X, __m256i __Y)
> +extern __inline __mmask8
> +  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_cmpge_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
>  {
> -  return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
> -                                                    (__v8si) __X,
> -                                                    (__v8si)
> -                                                    _mm256_setzero_si256 (),
> -                                                    __M);
> +  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
> +                                                (__v8si) __Y, 5,
> +                                                (__mmask8) __M);
>  }
>
> -extern __inline __m256i
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y)
> +extern __inline __mmask8
> +  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_cmpge_epi32_mask (__m256i __X, __m256i __Y)
>  {
> -  return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X,
> -                                                  (__v8si) __Y,
> -                                                  (__v4di)
> -                                                  _mm256_setzero_si256 (),
> -                                                  __M);
> +  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
> +                                                (__v8si) __Y, 5,
> +                                                (__mmask8) -1);
>  }
>
> -extern __inline __m128i
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_mask_mul_epu32 (__m128i __W, __mmask8 __M, __m128i __X,
> -                   __m128i __Y)
> +extern __inline __mmask8
> +  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_cmple_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
>  {
> -  return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X,
> -                                                  (__v4si) __Y,
> -                                                  (__v2di) __W, __M);
> +  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
> +                                                (__v8si) __Y, 2,
> +                                                (__mmask8) __M);
>  }
>
> -extern __inline __m128i
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_maskz_mul_epu32 (__mmask8 __M, __m128i __X, __m128i __Y)
> +extern __inline __mmask8
> +  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_cmple_epi32_mask (__m256i __X, __m256i __Y)
>  {
> -  return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X,
> -                                                  (__v4si) __Y,
> -                                                  (__v2di)
> -                                                  _mm_setzero_si128 (),
> -                                                  __M);
> +  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
> +                                                (__v8si) __Y, 2,
> +                                                (__mmask8) -1);
>  }
>
> -extern __inline __m256i
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_permutexvar_epi32 (__m256i __X, __m256i __Y)
> +extern __inline __mmask8
> +  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_cmpneq_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
>  {
> -  return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
> -                                                    (__v8si) __X,
> -                                                    (__v8si)
> -                                                    _mm256_setzero_si256 (),
> -                                                    (__mmask8) -1);
> +  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
> +                                                (__v4di) __Y, 4,
> +                                                (__mmask8) __M);
>  }
>
> -extern __inline __m256i
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_permutexvar_epi32 (__m256i __W, __mmask8 __M, __m256i __X,
> -                              __m256i __Y)
> +extern __inline __mmask8
> +  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_cmpneq_epi64_mask (__m256i __X, __m256i __Y)
>  {
> -  return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
> -                                                    (__v8si) __X,
> -                                                    (__v8si) __W,
> -                                                    __M);
> +  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
> +                                                (__v4di) __Y, 4,
> +                                                (__mmask8) -1);
>  }
>
>  extern __inline __mmask8
>    __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_cmpneq_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
> +_mm256_mask_cmplt_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
>  {
> -  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
> -                                                 (__v8si) __Y, 4,
> -                                                 (__mmask8) __M);
> +  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
> +                                                (__v4di) __Y, 1,
> +                                                (__mmask8) __M);
>  }
>
>  extern __inline __mmask8
>    __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_cmpneq_epu32_mask (__m256i __X, __m256i __Y)
> +_mm256_cmplt_epi64_mask (__m256i __X, __m256i __Y)
>  {
> -  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
> -                                                 (__v8si) __Y, 4,
> -                                                 (__mmask8) -1);
> -}
> -
> -extern __inline __mmask8
> -  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_cmplt_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
> -{
> -  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
> -                                                 (__v8si) __Y, 1,
> -                                                 (__mmask8) __M);
> -}
> -
> -extern __inline __mmask8
> -  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_cmplt_epu32_mask (__m256i __X, __m256i __Y)
> -{
> -  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
> -                                                 (__v8si) __Y, 1,
> -                                                 (__mmask8) -1);
> -}
> -
> -extern __inline __mmask8
> -  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_cmpge_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
> -{
> -  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
> -                                                 (__v8si) __Y, 5,
> -                                                 (__mmask8) __M);
> -}
> -
> -extern __inline __mmask8
> -  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_cmpge_epu32_mask (__m256i __X, __m256i __Y)
> -{
> -  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
> -                                                 (__v8si) __Y, 5,
> -                                                 (__mmask8) -1);
> -}
> -
> -extern __inline __mmask8
> -  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_cmple_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
> -{
> -  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
> -                                                 (__v8si) __Y, 2,
> -                                                 (__mmask8) __M);
> -}
> -
> -extern __inline __mmask8
> -  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_cmple_epu32_mask (__m256i __X, __m256i __Y)
> -{
> -  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
> -                                                 (__v8si) __Y, 2,
> -                                                 (__mmask8) -1);
> -}
> -
> -extern __inline __mmask8
> -  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_cmpneq_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
> -{
> -  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
> -                                                 (__v4di) __Y, 4,
> -                                                 (__mmask8) __M);
> -}
> -
> -extern __inline __mmask8
> -  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_cmpneq_epu64_mask (__m256i __X, __m256i __Y)
> -{
> -  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
> -                                                 (__v4di) __Y, 4,
> -                                                 (__mmask8) -1);
> -}
> -
> -extern __inline __mmask8
> -  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_cmplt_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
> -{
> -  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
> -                                                 (__v4di) __Y, 1,
> -                                                 (__mmask8) __M);
> -}
> -
> -extern __inline __mmask8
> -  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_cmplt_epu64_mask (__m256i __X, __m256i __Y)
> -{
> -  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
> -                                                 (__v4di) __Y, 1,
> -                                                 (__mmask8) -1);
> -}
> -
> -extern __inline __mmask8
> -  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_cmpge_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
> -{
> -  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
> -                                                 (__v4di) __Y, 5,
> -                                                 (__mmask8) __M);
> -}
> -
> -extern __inline __mmask8
> -  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_cmpge_epu64_mask (__m256i __X, __m256i __Y)
> -{
> -  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
> -                                                 (__v4di) __Y, 5,
> -                                                 (__mmask8) -1);
> -}
> -
> -extern __inline __mmask8
> -  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_cmple_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
> -{
> -  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
> -                                                 (__v4di) __Y, 2,
> -                                                 (__mmask8) __M);
> -}
> -
> -extern __inline __mmask8
> -  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_cmple_epu64_mask (__m256i __X, __m256i __Y)
> -{
> -  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
> -                                                 (__v4di) __Y, 2,
> -                                                 (__mmask8) -1);
> -}
> -
> -extern __inline __mmask8
> -  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_cmpneq_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
> -{
> -  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
> -                                                (__v8si) __Y, 4,
> -                                                (__mmask8) __M);
> -}
> -
> -extern __inline __mmask8
> -  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_cmpneq_epi32_mask (__m256i __X, __m256i __Y)
> -{
> -  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
> -                                                (__v8si) __Y, 4,
> -                                                (__mmask8) -1);
> -}
> -
> -extern __inline __mmask8
> -  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_cmplt_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
> -{
> -  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
> -                                                (__v8si) __Y, 1,
> -                                                (__mmask8) __M);
> -}
> -
> -extern __inline __mmask8
> -  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_cmplt_epi32_mask (__m256i __X, __m256i __Y)
> -{
> -  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
> -                                                (__v8si) __Y, 1,
> -                                                (__mmask8) -1);
> -}
> -
> -extern __inline __mmask8
> -  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_cmpge_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
> -{
> -  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
> -                                                (__v8si) __Y, 5,
> -                                                (__mmask8) __M);
> -}
> -
> -extern __inline __mmask8
> -  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_cmpge_epi32_mask (__m256i __X, __m256i __Y)
> -{
> -  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
> -                                                (__v8si) __Y, 5,
> -                                                (__mmask8) -1);
> -}
> -
> -extern __inline __mmask8
> -  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_cmple_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
> -{
> -  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
> -                                                (__v8si) __Y, 2,
> -                                                (__mmask8) __M);
> -}
> -
> -extern __inline __mmask8
> -  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_cmple_epi32_mask (__m256i __X, __m256i __Y)
> -{
> -  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
> -                                                (__v8si) __Y, 2,
> -                                                (__mmask8) -1);
> -}
> -
> -extern __inline __mmask8
> -  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_cmpneq_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
> -{
> -  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
> -                                                (__v4di) __Y, 4,
> -                                                (__mmask8) __M);
> -}
> -
> -extern __inline __mmask8
> -  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_cmpneq_epi64_mask (__m256i __X, __m256i __Y)
> -{
> -  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
> -                                                (__v4di) __Y, 4,
> -                                                (__mmask8) -1);
> -}
> -
> -extern __inline __mmask8
> -  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_cmplt_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
> -{
> -  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
> -                                                (__v4di) __Y, 1,
> -                                                (__mmask8) __M);
> -}
> -
> -extern __inline __mmask8
> -  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_cmplt_epi64_mask (__m256i __X, __m256i __Y)
> -{
> -  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
> -                                                (__v4di) __Y, 1,
> -                                                (__mmask8) -1);
> +  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
> +                                                (__v4di) __Y, 1,
> +                                                (__mmask8) -1);
>  }
>
>  extern __inline __mmask8
> @@ -13861,4 +13583,282 @@ _mm256_permutex_pd (__m256d __X, const int __M)
>  #pragma GCC pop_options
>  #endif /* __DISABLE_AVX512VL__ */
>
> +#if !defined (__AVX512CD__) || !defined (__AVX512VL__)
> +#pragma GCC push_options
> +#pragma GCC target("avx512vl,avx512cd")
> +#define __DISABLE_AVX512VLCD__
> +#endif
> +
> +extern __inline __m128i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_broadcastmb_epi64 (__mmask8 __A)
> +{
> +  return (__m128i) __builtin_ia32_broadcastmb128 (__A);
> +}
> +
> +extern __inline __m256i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_broadcastmb_epi64 (__mmask8 __A)
> +{
> +  return (__m256i) __builtin_ia32_broadcastmb256 (__A);
> +}
> +
> +extern __inline __m128i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_broadcastmw_epi32 (__mmask16 __A)
> +{
> +  return (__m128i) __builtin_ia32_broadcastmw128 (__A);
> +}
> +
> +extern __inline __m256i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_broadcastmw_epi32 (__mmask16 __A)
> +{
> +  return (__m256i) __builtin_ia32_broadcastmw256 (__A);
> +}
> +
> +extern __inline __m256i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_lzcnt_epi32 (__m256i __A)
> +{
> +  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
> +                                                    (__v8si)
> +                                                    _mm256_setzero_si256 (),
> +                                                    (__mmask8) -1);
> +}
> +
> +extern __inline __m256i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
> +{
> +  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
> +                                                    (__v8si) __W,
> +                                                    (__mmask8) __U);
> +}
> +
> +extern __inline __m256i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A)
> +{
> +  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
> +                                                    (__v8si)
> +                                                    _mm256_setzero_si256 (),
> +                                                    (__mmask8) __U);
> +}
> +
> +extern __inline __m256i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_lzcnt_epi64 (__m256i __A)
> +{
> +  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
> +                                                    (__v4di)
> +                                                    _mm256_setzero_si256 (),
> +                                                    (__mmask8) -1);
> +}
> +
> +extern __inline __m256i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
> +{
> +  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
> +                                                    (__v4di) __W,
> +                                                    (__mmask8) __U);
> +}
> +
> +extern __inline __m256i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A)
> +{
> +  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
> +                                                    (__v4di)
> +                                                    _mm256_setzero_si256 (),
> +                                                    (__mmask8) __U);
> +}
> +
> +extern __inline __m256i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_conflict_epi64 (__m256i __A)
> +{
> +  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
> +                                                        (__v4di)
> +                                                        _mm256_setzero_si256 (),
> +                                                        (__mmask8) -1);
> +}
> +
> +extern __inline __m256i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
> +{
> +  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
> +                                                        (__v4di) __W,
> +                                                        (__mmask8)
> +                                                        __U);
> +}
> +
> +extern __inline __m256i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
> +{
> +  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
> +                                                        (__v4di)
> +                                                        _mm256_setzero_si256 (),
> +                                                        (__mmask8)
> +                                                        __U);
> +}
> +
> +extern __inline __m256i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_conflict_epi32 (__m256i __A)
> +{
> +  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
> +                                                        (__v8si)
> +                                                        _mm256_setzero_si256 (),
> +                                                        (__mmask8) -1);
> +}
> +
> +extern __inline __m256i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
> +{
> +  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
> +                                                        (__v8si) __W,
> +                                                        (__mmask8)
> +                                                        __U);
> +}
> +
> +extern __inline __m256i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
> +{
> +  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
> +                                                        (__v8si)
> +                                                        _mm256_setzero_si256 (),
> +                                                        (__mmask8)
> +                                                        __U);
> +}
> +
> +extern __inline __m128i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_lzcnt_epi32 (__m128i __A)
> +{
> +  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
> +                                                    (__v4si)
> +                                                    _mm_setzero_si128 (),
> +                                                    (__mmask8) -1);
> +}
> +
> +extern __inline __m128i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
> +{
> +  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
> +                                                    (__v4si) __W,
> +                                                    (__mmask8) __U);
> +}
> +
> +extern __inline __m128i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A)
> +{
> +  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
> +                                                    (__v4si)
> +                                                    _mm_setzero_si128 (),
> +                                                    (__mmask8) __U);
> +}
> +
> +extern __inline __m128i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_lzcnt_epi64 (__m128i __A)
> +{
> +  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
> +                                                    (__v2di)
> +                                                    _mm_setzero_si128 (),
> +                                                    (__mmask8) -1);
> +}
> +
> +extern __inline __m128i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
> +{
> +  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
> +                                                    (__v2di) __W,
> +                                                    (__mmask8) __U);
> +}
> +
> +extern __inline __m128i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A)
> +{
> +  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
> +                                                    (__v2di)
> +                                                    _mm_setzero_si128 (),
> +                                                    (__mmask8) __U);
> +}
> +
> +extern __inline __m128i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_conflict_epi64 (__m128i __A)
> +{
> +  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
> +                                                        (__v2di)
> +                                                        _mm_setzero_si128 (),
> +                                                        (__mmask8) -1);
> +}
> +
> +extern __inline __m128i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
> +{
> +  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
> +                                                        (__v2di) __W,
> +                                                        (__mmask8)
> +                                                        __U);
> +}
> +
> +extern __inline __m128i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A)
> +{
> +  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
> +                                                        (__v2di)
> +                                                        _mm_setzero_si128 (),
> +                                                        (__mmask8)
> +                                                        __U);
> +}
> +
> +extern __inline __m128i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_conflict_epi32 (__m128i __A)
> +{
> +  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
> +                                                        (__v4si)
> +                                                        _mm_setzero_si128 (),
> +                                                        (__mmask8) -1);
> +}
> +
> +extern __inline __m128i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
> +{
> +  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
> +                                                        (__v4si) __W,
> +                                                        (__mmask8)
> +                                                        __U);
> +}
> +
> +extern __inline __m128i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
> +{
> +  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
> +                                                        (__v4si)
> +                                                        _mm_setzero_si128 (),
> +                                                        (__mmask8)
> +                                                        __U);
> +}
> +
> +#ifdef __DISABLE_AVX512VLCD__
> +#pragma GCC pop_options
> +#endif
> +
>  #endif /* _AVX512VLINTRIN_H_INCLUDED */
> diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
> index b90d5ccc969..19fa5c107c7 100644
> --- a/gcc/config/i386/i386-builtin.def
> +++ b/gcc/config/i386/i386-builtin.def
> @@ -1615,8 +1615,8 @@ BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_ktestqi, "__builtin_ia32_ktestcqi",
>  BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_ktestqi, "__builtin_ia32_ktestzqi", IX86_BUILTIN_KTESTZ8, UNKNOWN, (int) UQI_FTYPE_UQI_UQI)
>  BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_ktesthi, "__builtin_ia32_ktestchi", IX86_BUILTIN_KTESTC16, UNKNOWN, (int) UHI_FTYPE_UHI_UHI)
>  BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_ktesthi, "__builtin_ia32_ktestzhi", IX86_BUILTIN_KTESTZ16, UNKNOWN, (int) UHI_FTYPE_UHI_UHI)
> -BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, CODE_FOR_ktestsi, "__builtin_ia32_ktestcsi", IX86_BUILTIN_KTESTC32, UNKNOWN, (int) USI_FTYPE_USI_USI)
> -BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, CODE_FOR_ktestsi, "__builtin_ia32_ktestzsi", IX86_BUILTIN_KTESTZ32, UNKNOWN, (int) USI_FTYPE_USI_USI)
> +BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_ktestsi, "__builtin_ia32_ktestcsi", IX86_BUILTIN_KTESTC32, UNKNOWN, (int) USI_FTYPE_USI_USI)
> +BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_ktestsi, "__builtin_ia32_ktestzsi", IX86_BUILTIN_KTESTZ32, UNKNOWN, (int) USI_FTYPE_USI_USI)
>  BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, CODE_FOR_ktestdi, "__builtin_ia32_ktestcdi", IX86_BUILTIN_KTESTC64, UNKNOWN, (int) UDI_FTYPE_UDI_UDI)
>  BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, CODE_FOR_ktestdi, "__builtin_ia32_ktestzdi", IX86_BUILTIN_KTESTZ64, UNKNOWN, (int) UDI_FTYPE_UDI_UDI)
>  BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_kortestqi, "__builtin_ia32_kortestcqi", IX86_BUILTIN_KORTESTC8, UNKNOWN, (int) UQI_FTYPE_UQI_UQI)
> --
> 2.31.1
>


-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2023-10-30  7:56 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-30  7:44 [PATCH] Fix incorrect option mask and avx512cd target push Haochen Jiang
2023-10-30  7:56 ` Hongtao Liu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).