public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] [i386] Add define_insn_and_split for vpcmp{b, w, d, q} vpcmp{ph, ps, pd}.
@ 2021-12-21  6:26 liuhongt
  2021-12-23  5:41 ` Hongtao Liu
  0 siblings, 1 reply; 2+ messages in thread
From: liuhongt @ 2021-12-21  6:26 UTC (permalink / raw)
  To: gcc-patches

The purpose of those define_insn_and_split:
1. Combine vpcmpuw and zero_extend into vpcmpuw.
2. Canonicalize the vpcmpuw pattern so CSE can replace a duplicate vpcmpuw with just a kmov.
3. Use DImode as dest of zero_extend so cprop_hardreg can eliminate redundant kmov.

It should partially fix the issue in the PR.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

	PR target/103750
	* config/i386/sse.md
	(*<avx512>_cmp<V48H_AVX512VL:mode>3_zero_extend<SWI248x:mode>):
	New define_insn_and_split.
	(*<avx512>_cmp<mode>3): Ditto.
	(*<avx512>_cmp<mode>3_zero_extenddi): New define_insn.
	(*<avx512>_cmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>):
	New define_insn_and_split.
	(*<avx512>_ucmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>):
	Ditto.
	(*<avx512>_ucmp<mode>3): Ditto.
	(*<avx512>_ucmp<mode>3_zero_extenddi): New define_insn.
	(*<avx512>_ucmp<VI48_AVX512VL:mode>3_zero_extend<SWI248x:mode>):
	New define_insn_and_split.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/bitwise_mask_op-3.c: Adjust test.
	* g++.target/i386/pr103750-1.C: New test.
---
 gcc/config/i386/sse.md                        | 267 ++++++++++++++++++
 gcc/testsuite/g++.target/i386/pr103750-1.C    |  50 ++++
 .../gcc.target/i386/bitwise_mask_op-3.c       |   6 +-
 3 files changed, 320 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/pr103750-1.C

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 5196149ee32..fb885d58272 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -3702,6 +3702,75 @@ (define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name><round_saeonly_name>"
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+;; Those Splitters are used to canonicalize vpcmpuw pattern, so that CSE can transform
+;; duplicated vpcmpuw to vpcmpuw and kmov
+;; Choose biggest mode(DImode) as dest, so kmov can be optimized by cprop_hardreg.
+(define_insn_and_split "*<avx512>_cmp<V48H_AVX512VL:mode>3_zero_extend<SWI248x:mode>"
+  [(set (match_operand:SWI248x 0 "register_operand" "=k")
+	(zero_extend:SWI248x
+	  (unspec:<V48H_AVX512VL:avx512fmaskmode>
+	    [(match_operand:V48H_AVX512VL 1 "register_operand" "v")
+	     (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm")
+	     (match_operand:SI 3 "<V48H_AVX512VL:cmp_imm_predicate>" "n")]
+	    UNSPEC_PCMP)))]
+  "TARGET_AVX512BW
+   && (GET_MODE_NUNITS (<V48H_AVX512VL:MODE>mode)
+       < GET_MODE_PRECISION (<SWI248x:MODE>mode))"
+  "v<ssecmpintprefix>cmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  "&& <SWI248x:MODE>mode != E_DImode"
+  [(set (match_dup 0)
+	(zero_extend:DI
+	  (unspec:<V48H_AVX512VL:avx512fmaskmode>
+	    [(match_dup 1)
+	     (match_dup 2)
+	     (match_dup 3)]
+	    UNSPEC_PCMP)))]
+  "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<V48H_AVX512VL:sseinsnmode>")])
+
+(define_insn_and_split "*<avx512>_cmp<mode>3"
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
+	(unspec:<avx512fmaskmode>
+	  [(match_operand:V48H_AVX512VL 1 "register_operand" "v")
+	   (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm")
+	   (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
+	  UNSPEC_PCMP))]
+  "TARGET_AVX512BW
+   && GET_MODE_NUNITS (<MODE>mode) < 64"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(zero_extend:DI
+	  (unspec:<avx512fmaskmode>
+	    [(match_dup 1)
+	     (match_dup 2)
+	     (match_dup 3)]
+	    UNSPEC_PCMP)))]
+  "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "*<avx512>_cmp<mode>3_zero_extenddi"
+  [(set (match_operand:DI 0 "register_operand" "=k")
+	(zero_extend:DI
+	  (unspec:<avx512fmaskmode>
+	    [(match_operand:V48H_AVX512VL 1 "register_operand" "v")
+	     (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm")
+	     (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
+	    UNSPEC_PCMP)))]
+  "TARGET_AVX512BW
+   && GET_MODE_NUNITS (<V48H_AVX512VL:MODE>mode) < 64"
+  "v<ssecmpintprefix>cmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
 (define_insn_and_split "*<avx512>_cmp<mode>3"
   [(set (match_operand:<avx512fmaskmode> 0 "register_operand")
 	(not:<avx512fmaskmode>
@@ -3735,6 +3804,72 @@ (define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name>"
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn_and_split "*<avx512>_cmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>"
+  [(set (match_operand:SWI248x 0 "register_operand" "=k")
+	(zero_extend:SWI248x
+	  (unspec:<VI12_AVX512VL:avx512fmaskmode>
+	    [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
+	     (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
+	     (match_operand:SI 3 "<VI12_AVX512VL:cmp_imm_predicate>" "n")]
+	    UNSPEC_PCMP)))]
+  "TARGET_AVX512BW
+   && (GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode)
+       < GET_MODE_PRECISION (<SWI248x:MODE>mode))"
+  "vpcmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  "&& <SWI248x:MODE>mode != E_DImode"
+  [(set (match_dup 0)
+	(zero_extend:DI
+	  (unspec:<VI12_AVX512VL:avx512fmaskmode>
+	    [(match_dup 1)
+	     (match_dup 2)
+	     (match_dup 3)]
+	   UNSPEC_PCMP)))]
+  "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<VI12_AVX512VL:sseinsnmode>")])
+
+(define_insn_and_split "*<avx512>_cmp<mode>3"
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
+	(unspec:<avx512fmaskmode>
+	  [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
+	   (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
+	   (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
+	  UNSPEC_PCMP))]
+  "TARGET_AVX512BW
+   && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(zero_extend:DI
+	  (unspec:<avx512fmaskmode>
+	    [(match_dup 1)
+	     (match_dup 2)
+	     (match_dup 3)]
+	    UNSPEC_PCMP)))]
+  "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "*<avx512>_cmp<mode>3_zero_extenddi"
+  [(set (match_operand:DI 0 "register_operand" "=k")
+	(zero_extend:DI
+	  (unspec:<avx512fmaskmode>
+	    [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
+	     (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
+	     (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
+	    UNSPEC_PCMP)))]
+  "TARGET_AVX512BW
+   && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64"
+  "vpcmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
 (define_int_iterator UNSPEC_PCMP_ITER
   [UNSPEC_PCMP UNSPEC_UNSIGNED_PCMP])
 
@@ -3771,6 +3906,72 @@ (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn_and_split "*<avx512>_ucmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>"
+  [(set (match_operand:SWI248x 0 "register_operand" "=k")
+	(zero_extend:SWI248x
+	  (unspec:<VI12_AVX512VL:avx512fmaskmode>
+	    [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
+	     (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
+	     (match_operand:SI 3 "const_0_to_7_operand" "n")]
+	    UNSPEC_UNSIGNED_PCMP)))]
+  "TARGET_AVX512BW
+  && (GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode)
+      < GET_MODE_PRECISION (<SWI248x:MODE>mode))"
+  "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  "&& <SWI248x:MODE>mode != E_DImode"
+  [(set (match_dup 0)
+	(zero_extend:DI
+	  (unspec:<VI12_AVX512VL:avx512fmaskmode>
+	    [(match_dup 1)
+	     (match_dup 2)
+	     (match_dup 3)]
+	    UNSPEC_UNSIGNED_PCMP)))]
+  "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<VI12_AVX512VL:sseinsnmode>")])
+
+(define_insn_and_split "*<avx512>_ucmp<mode>3"
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
+	(unspec:<avx512fmaskmode>
+	  [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
+	   (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
+	   (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
+	  UNSPEC_UNSIGNED_PCMP))]
+  "TARGET_AVX512BW
+   && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(zero_extend:DI
+	  (unspec:<avx512fmaskmode>
+	    [(match_dup 1)
+	     (match_dup 2)
+	     (match_dup 3)]
+	    UNSPEC_UNSIGNED_PCMP)))]
+  "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "*<avx512>_ucmp<mode>3_zero_extenddi"
+  [(set (match_operand:DI 0 "register_operand" "=k")
+	(zero_extend:DI
+	  (unspec:<avx512fmaskmode>
+	    [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
+	     (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
+	     (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
+	    UNSPEC_UNSIGNED_PCMP)))]
+  "TARGET_AVX512BW
+   && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64"
+  "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
 (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
   [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
 	(unspec:<avx512fmaskmode>
@@ -3785,6 +3986,72 @@ (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn_and_split "*<avx512>_ucmp<VI48_AVX512VL:mode>3_zero_extend<SWI248x:mode>"
+  [(set (match_operand:SWI248x 0 "register_operand" "=k")
+	(zero_extend:SWI248x
+	  (unspec:<VI48_AVX512VL:avx512fmaskmode>
+	    [(match_operand:VI48_AVX512VL 1 "register_operand" "v")
+	     (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")
+	     (match_operand:SI 3 "const_0_to_7_operand" "n")]
+	    UNSPEC_UNSIGNED_PCMP)))]
+  "TARGET_AVX512BW
+  && (GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode)
+      < GET_MODE_PRECISION (<SWI248x:MODE>mode))"
+  "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  "&& <SWI248x:MODE>mode != E_DImode"
+  [(set (match_dup 0)
+	(zero_extend:DI
+	  (unspec:<VI48_AVX512VL:avx512fmaskmode>
+	    [(match_dup 1)
+	     (match_dup 2)
+	     (match_dup 3)]
+	   UNSPEC_UNSIGNED_PCMP)))]
+  "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<VI48_AVX512VL:sseinsnmode>")])
+
+(define_insn_and_split "*<avx512>_ucmp<mode>3"
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
+	(unspec:<avx512fmaskmode>
+	  [(match_operand:VI48_AVX512VL 1 "register_operand" "v")
+	   (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")
+	   (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
+	  UNSPEC_UNSIGNED_PCMP))]
+  "TARGET_AVX512BW
+   && GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode) < 64"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(zero_extend:DI
+	  (unspec:<avx512fmaskmode>
+	    [(match_dup 1)
+	     (match_dup 2)
+	     (match_dup 3)]
+	    UNSPEC_UNSIGNED_PCMP)))]
+  "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "*<avx512>_ucmp<mode>3_zero_extenddi"
+  [(set (match_operand:DI 0 "register_operand" "=k")
+	(zero_extend:DI
+	  (unspec:<avx512fmaskmode>
+	    [(match_operand:VI48_AVX512VL 1 "register_operand" "v")
+	     (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")
+	     (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
+	    UNSPEC_UNSIGNED_PCMP)))]
+  "TARGET_AVX512BW
+   && GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode) < 64"
+  "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
 (define_insn_and_split "*<avx512>_ucmp<mode>3"
   [(set (match_operand:<avx512fmaskmode> 0 "register_operand")
 	(not:<avx512fmaskmode>
diff --git a/gcc/testsuite/g++.target/i386/pr103750-1.C b/gcc/testsuite/g++.target/i386/pr103750-1.C
new file mode 100644
index 00000000000..83f471331b3
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr103750-1.C
@@ -0,0 +1,50 @@
+/* PR target/103750 */
+/* { dg-do compile }  */
+/* { dg-options "-O2 -march=cannonlake -maes -std=c++1y" } */
+/* { dg-final { scan-assembler-times "kmovw" 2 } } */
+/* { dg-final { scan-assembler-times "kmovd" 2 } } */
+/* There shouldn't be any kmovw/kmovd inside the loop.  */
+#include <immintrin.h>
+
+const char16_t *qustrchr(char16_t *n, char16_t *e, char16_t c) noexcept
+{
+  __m256i mch256 = _mm256_set1_epi16(c);
+  for ( ; n < e; n += 32) {
+    __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n));
+    __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n) + 1);
+    __mmask16 mask1 = _mm256_cmpeq_epu16_mask(data1, mch256);
+    __mmask16 mask2 = _mm256_cmpeq_epu16_mask(data2, mch256);
+    if (_kortestz_mask16_u8(mask1, mask2))
+      continue;
+
+    unsigned idx = _tzcnt_u32(mask1);
+    if (mask1 == 0) {
+      idx = __tzcnt_u16(mask2);
+      n += 16;
+    }
+    return n + idx;
+  }
+  return e;
+}
+
+const char16_t *qustrchr1(char16_t *n, char16_t *e, char16_t c) noexcept
+{
+  __m256i mch256 = _mm256_set1_epi16(c);
+  for ( ; n < e; n += 32) {
+    __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n));
+    __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n) + 1);
+    __mmask16 mask1 = _mm256_cmpeq_epu16_mask(data1, mch256);
+    __mmask16 mask2 = _mm256_cmpeq_epu16_mask(data2, mch256);
+    if (_kortestz_mask32_u8(mask1, mask2))
+      continue;
+
+    unsigned idx = _tzcnt_u32(mask1);
+    if (mask1 == 0) {
+      idx = __tzcnt_u16(mask2);
+      n += 16;
+    }
+    return n + idx;
+  }
+  return e;
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
index 352c49d6c6b..82bb99e30af 100644
--- a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
+++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
@@ -12,7 +12,7 @@ foo_orb (__m512i a, __m512i b)
   foo = m1 | m2;
 }
 
-/* { dg-final { scan-assembler-times "korb\[\t \]" "1" { xfail *-*-* } } }  */
+/* { dg-final { scan-assembler-times "korb\[\t \]" "1" { xfail { *-*-* && { ! ia32 } } } } }  */
 
 void
 foo_xorb (__m512i a, __m512i b)
@@ -22,7 +22,7 @@ foo_xorb (__m512i a, __m512i b)
   foo = m1 ^ m2;
 }
 
-/* { dg-final { scan-assembler-times "kxorb\[\t \]" "1" { xfail *-*-* } } }  */
+/* { dg-final { scan-assembler-times "kxorb\[\t \]" "1" { xfail { *-*-* && { ! ia32 } } } } }  */
 
 void
 foo_andb (__m512i a, __m512i b)
@@ -40,4 +40,4 @@ foo_andnb (__m512i a, __m512i b)
   foo = m1 & ~m2;
 }
 
-/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4" { xfail *-*-* } } }  */
+/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4" { xfail { *-*-* && { ! ia32 } } } } }  */
-- 
2.18.1


^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH] [i386] Add define_insn_and_split for vpcmp{b, w, d, q} vpcmp{ph, ps, pd}.
  2021-12-21  6:26 [PATCH] [i386] Add define_insn_and_split for vpcmp{b, w, d, q} vpcmp{ph, ps, pd} liuhongt
@ 2021-12-23  5:41 ` Hongtao Liu
  0 siblings, 0 replies; 2+ messages in thread
From: Hongtao Liu @ 2021-12-23  5:41 UTC (permalink / raw)
  To: liuhongt; +Cc: GCC Patches, H. J. Lu

[-- Attachment #1: Type: text/plain, Size: 17963 bytes --]

On Tue, Dec 21, 2021 at 2:27 PM liuhongt <hongtao.liu@intel.com> wrote:
>
> The purpose of those define_insn_and_split:
> 1. Combine vpcmpuw and zero_extend into vpcmpuw.
> 2. Canonicalize vpcmpuw pattern so CSE can replace duplicate vpcmpuw to just kmov
> 3. Use DImode as dest of zero_extend so cprop_hardreg can eliminate redundant kmov.
Using DImode as the dest of zero_extend is too aggressive and causes
several regressions.
The new patch adds define_insn_and_split patterns that just combine vpcmpuw and
zero_extend into vpcmpuw.
Here's the patch I'm checking in.
>
> It should partially fix the issue in PR.
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ready to push to trunk.
>
> gcc/ChangeLog:
>
>         PR target/103750
>         * config/i386/sse.md
>         (*<avx512>_cmp<V48H_AVX512VL:mode>3_zero_extend<SWI248x:mode>):
>         New define_insn_and_split.
>         (*<avx512>_cmp<mode>3): Ditto.
>         (*<avx512>_cmp<mode>3_zero_extenddi): New define_insn.
>         (*<avx512>_cmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>):
>         New define_insn_and_split.
>         (*<avx512>_ucmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>):
>         Ditto.
>         (*<avx512>_ucmp<mode>3): Ditto.
>         (*<avx512>_ucmp<mode>3_zero_extenddi): New define_insn.
>         (*<avx512>_ucmp<VI48_AVX512VL:mode>3_zero_extend<SWI248x:mode>):
>         New define_insn_and_split.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/bitwise_mask_op-3.c: Adjust test/
>         * g++.target/i386/pr103750-1.C: New test.
> ---
>  gcc/config/i386/sse.md                        | 267 ++++++++++++++++++
>  gcc/testsuite/g++.target/i386/pr103750-1.C    |  50 ++++
>  .../gcc.target/i386/bitwise_mask_op-3.c       |   6 +-
>  3 files changed, 320 insertions(+), 3 deletions(-)
>  create mode 100644 gcc/testsuite/g++.target/i386/pr103750-1.C
>
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 5196149ee32..fb885d58272 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -3702,6 +3702,75 @@ (define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name><round_saeonly_name>"
>     (set_attr "prefix" "evex")
>     (set_attr "mode" "<sseinsnmode>")])
>
> +;; Those Splitters are used to canonicalize vpcmpuw pattern, so that CSE can transfrom
> +;; duplicated vpcmpuw to vpcmpuw and kmov
> +;; Choose biggest mode(DImode) as dest, so kmov can be optimized by cprop_hardreg.
> +(define_insn_and_split "*<avx512>_cmp<V48H_AVX512VL:mode>3_zero_extend<SWI248x:mode>"
> +  [(set (match_operand:SWI248x 0 "register_operand" "=k")
> +       (zero_extend:SWI248x
> +         (unspec:<V48H_AVX512VL:avx512fmaskmode>
> +           [(match_operand:V48H_AVX512VL 1 "register_operand" "v")
> +            (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm")
> +            (match_operand:SI 3 "<V48H_AVX512VL:cmp_imm_predicate>" "n")]
> +           UNSPEC_PCMP)))]
> +  "TARGET_AVX512BW
> +   && (GET_MODE_NUNITS (<V48H_AVX512VL:MODE>mode)
> +       < GET_MODE_PRECISION (<SWI248x:MODE>mode))"
> +  "v<ssecmpintprefix>cmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
> +  "&& <SWI248x:MODE>mode != E_DImode"
> +  [(set (match_dup 0)
> +       (zero_extend:DI
> +         (unspec:<V48H_AVX512VL:avx512fmaskmode>
> +           [(match_dup 1)
> +            (match_dup 2)
> +            (match_dup 3)]
> +           UNSPEC_PCMP)))]
> +  "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);"
> +  [(set_attr "type" "ssecmp")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "<V48H_AVX512VL:sseinsnmode>")])
> +
> +(define_insn_and_split "*<avx512>_cmp<mode>3"
> +  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
> +       (unspec:<avx512fmaskmode>
> +         [(match_operand:V48H_AVX512VL 1 "register_operand" "v")
> +          (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm")
> +          (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
> +         UNSPEC_PCMP))]
> +  "TARGET_AVX512BW
> +   && GET_MODE_NUNITS (<MODE>mode) < 64"
> +  "#"
> +  "&& 1"
> +  [(set (match_dup 0)
> +       (zero_extend:DI
> +         (unspec:<avx512fmaskmode>
> +           [(match_dup 1)
> +            (match_dup 2)
> +            (match_dup 3)]
> +           UNSPEC_PCMP)))]
> +  "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);"
> +  [(set_attr "type" "ssecmp")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "<sseinsnmode>")])
> +
> +(define_insn "*<avx512>_cmp<mode>3_zero_extenddi"
> +  [(set (match_operand:DI 0 "register_operand" "=k")
> +       (zero_extend:DI
> +         (unspec:<avx512fmaskmode>
> +           [(match_operand:V48H_AVX512VL 1 "register_operand" "v")
> +            (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm")
> +            (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
> +           UNSPEC_PCMP)))]
> +  "TARGET_AVX512BW
> +   && GET_MODE_NUNITS (<V48H_AVX512VL:MODE>mode) < 64"
> +  "v<ssecmpintprefix>cmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
> +  [(set_attr "type" "ssecmp")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "<sseinsnmode>")])
> +
>  (define_insn_and_split "*<avx512>_cmp<mode>3"
>    [(set (match_operand:<avx512fmaskmode> 0 "register_operand")
>         (not:<avx512fmaskmode>
> @@ -3735,6 +3804,72 @@ (define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name>"
>     (set_attr "prefix" "evex")
>     (set_attr "mode" "<sseinsnmode>")])
>
> +(define_insn_and_split "*<avx512>_cmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>"
> +  [(set (match_operand:SWI248x 0 "register_operand" "=k")
> +       (zero_extend:SWI248x
> +         (unspec:<VI12_AVX512VL:avx512fmaskmode>
> +           [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
> +            (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
> +            (match_operand:SI 3 "<VI12_AVX512VL:cmp_imm_predicate>" "n")]
> +           UNSPEC_PCMP)))]
> +  "TARGET_AVX512BW
> +   && (GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode)
> +       < GET_MODE_PRECISION (<SWI248x:MODE>mode))"
> +  "vpcmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
> +  "&& <SWI248x:MODE>mode != E_DImode"
> +  [(set (match_dup 0)
> +       (zero_extend:DI
> +         (unspec:<VI12_AVX512VL:avx512fmaskmode>
> +           [(match_dup 1)
> +            (match_dup 2)
> +            (match_dup 3)]
> +          UNSPEC_PCMP)))]
> +  "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);"
> +  [(set_attr "type" "ssecmp")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "<VI12_AVX512VL:sseinsnmode>")])
> +
> +(define_insn_and_split "*<avx512>_cmp<mode>3"
> +  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
> +       (unspec:<avx512fmaskmode>
> +         [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
> +          (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
> +          (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
> +         UNSPEC_PCMP))]
> +  "TARGET_AVX512BW
> +   && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64"
> +  "#"
> +  "&& 1"
> +  [(set (match_dup 0)
> +       (zero_extend:DI
> +         (unspec:<avx512fmaskmode>
> +           [(match_dup 1)
> +            (match_dup 2)
> +            (match_dup 3)]
> +           UNSPEC_PCMP)))]
> +  "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);"
> +  [(set_attr "type" "ssecmp")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "<sseinsnmode>")])
> +
> +(define_insn "*<avx512>_cmp<mode>3_zero_extenddi"
> +  [(set (match_operand:DI 0 "register_operand" "=k")
> +       (zero_extend:DI
> +         (unspec:<avx512fmaskmode>
> +           [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
> +            (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
> +            (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
> +           UNSPEC_PCMP)))]
> +  "TARGET_AVX512BW
> +   && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64"
> +  "vpcmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
> +  [(set_attr "type" "ssecmp")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "<sseinsnmode>")])
> +
>  (define_int_iterator UNSPEC_PCMP_ITER
>    [UNSPEC_PCMP UNSPEC_UNSIGNED_PCMP])
>
> @@ -3771,6 +3906,72 @@ (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
>     (set_attr "prefix" "evex")
>     (set_attr "mode" "<sseinsnmode>")])
>
> +(define_insn_and_split "*<avx512>_ucmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>"
> +  [(set (match_operand:SWI248x 0 "register_operand" "=k")
> +       (zero_extend:SWI248x
> +         (unspec:<VI12_AVX512VL:avx512fmaskmode>
> +           [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
> +            (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
> +            (match_operand:SI 3 "const_0_to_7_operand" "n")]
> +           UNSPEC_UNSIGNED_PCMP)))]
> +  "TARGET_AVX512BW
> +  && (GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode)
> +      < GET_MODE_PRECISION (<SWI248x:MODE>mode))"
> +  "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
> +  "&& <SWI248x:MODE>mode != E_DImode"
> +  [(set (match_dup 0)
> +       (zero_extend:DI
> +         (unspec:<VI12_AVX512VL:avx512fmaskmode>
> +           [(match_dup 1)
> +            (match_dup 2)
> +            (match_dup 3)]
> +           UNSPEC_UNSIGNED_PCMP)))]
> +  "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);"
> +  [(set_attr "type" "ssecmp")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "<VI12_AVX512VL:sseinsnmode>")])
> +
> +(define_insn_and_split "*<avx512>_ucmp<mode>3"
> +  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
> +       (unspec:<avx512fmaskmode>
> +         [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
> +          (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
> +          (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
> +         UNSPEC_UNSIGNED_PCMP))]
> +  "TARGET_AVX512BW
> +   && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64"
> +  "#"
> +  "&& 1"
> +  [(set (match_dup 0)
> +       (zero_extend:DI
> +         (unspec:<avx512fmaskmode>
> +           [(match_dup 1)
> +            (match_dup 2)
> +            (match_dup 3)]
> +           UNSPEC_UNSIGNED_PCMP)))]
> +  "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);"
> +  [(set_attr "type" "ssecmp")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "<sseinsnmode>")])
> +
> +(define_insn "*<avx512>_ucmp<mode>3_zero_extenddi"
> +  [(set (match_operand:DI 0 "register_operand" "=k")
> +       (zero_extend:DI
> +         (unspec:<avx512fmaskmode>
> +           [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
> +            (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
> +            (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
> +           UNSPEC_UNSIGNED_PCMP)))]
> +  "TARGET_AVX512BW
> +   && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64"
> +  "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
> +  [(set_attr "type" "ssecmp")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "<sseinsnmode>")])
> +
>  (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
>    [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
>         (unspec:<avx512fmaskmode>
> @@ -3785,6 +3986,72 @@ (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
>     (set_attr "prefix" "evex")
>     (set_attr "mode" "<sseinsnmode>")])
>
> +(define_insn_and_split "*<avx512>_ucmp<VI48_AVX512VL:mode>3_zero_extend<SWI248x:mode>"
> +  [(set (match_operand:SWI248x 0 "register_operand" "=k")
> +       (zero_extend:SWI248x
> +         (unspec:<VI48_AVX512VL:avx512fmaskmode>
> +           [(match_operand:VI48_AVX512VL 1 "register_operand" "v")
> +            (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")
> +            (match_operand:SI 3 "const_0_to_7_operand" "n")]
> +           UNSPEC_UNSIGNED_PCMP)))]
> +  "TARGET_AVX512BW
> +  && (GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode)
> +      < GET_MODE_PRECISION (<SWI248x:MODE>mode))"
> +  "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
> +  "&& <SWI248x:MODE>mode != E_DImode"
> +  [(set (match_dup 0)
> +       (zero_extend:DI
> +         (unspec:<VI48_AVX512VL:avx512fmaskmode>
> +           [(match_dup 1)
> +            (match_dup 2)
> +            (match_dup 3)]
> +          UNSPEC_UNSIGNED_PCMP)))]
> +  "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);"
> +  [(set_attr "type" "ssecmp")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "<VI48_AVX512VL:sseinsnmode>")])
> +
> +(define_insn_and_split "*<avx512>_ucmp<mode>3"
> +  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
> +       (unspec:<avx512fmaskmode>
> +         [(match_operand:VI48_AVX512VL 1 "register_operand" "v")
> +          (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")
> +          (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
> +         UNSPEC_UNSIGNED_PCMP))]
> +  "TARGET_AVX512BW
> +   && GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode) < 64"
> +  "#"
> +  "&& 1"
> +  [(set (match_dup 0)
> +       (zero_extend:DI
> +         (unspec:<avx512fmaskmode>
> +           [(match_dup 1)
> +            (match_dup 2)
> +            (match_dup 3)]
> +           UNSPEC_UNSIGNED_PCMP)))]
> +  "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);"
> +  [(set_attr "type" "ssecmp")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "<sseinsnmode>")])
> +
> +(define_insn "*<avx512>_ucmp<mode>3_zero_extenddi"
> +  [(set (match_operand:DI 0 "register_operand" "=k")
> +       (zero_extend:DI
> +         (unspec:<avx512fmaskmode>
> +           [(match_operand:VI48_AVX512VL 1 "register_operand" "v")
> +            (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")
> +            (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
> +           UNSPEC_UNSIGNED_PCMP)))]
> +  "TARGET_AVX512BW
> +   && GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode) < 64"
> +  "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
> +  [(set_attr "type" "ssecmp")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "<sseinsnmode>")])
> +
>  (define_insn_and_split "*<avx512>_ucmp<mode>3"
>    [(set (match_operand:<avx512fmaskmode> 0 "register_operand")
>         (not:<avx512fmaskmode>
> diff --git a/gcc/testsuite/g++.target/i386/pr103750-1.C b/gcc/testsuite/g++.target/i386/pr103750-1.C
> new file mode 100644
> index 00000000000..83f471331b3
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/i386/pr103750-1.C
> @@ -0,0 +1,50 @@
> +/* PR target/103750 */
> +/* { dg-do compile }  */
> +/* { dg-options "-O2 -march=cannonlake -maes -std=c++1y" } */
> +/* { dg-final { scan-assembler-times "kmovw" 2 } } */
> +/* { dg-final { scan-assembler-times "kmovd" 2 } } */
> +/* There shouldn't be any kmovw/kmovd inside the loop.  */
> +#include <immintrin.h>
> +
> +const char16_t *qustrchr(char16_t *n, char16_t *e, char16_t c) noexcept
> +{
> +  __m256i mch256 = _mm256_set1_epi16(c);
> +  for ( ; n < e; n += 32) {
> +    __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n));
> +    __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n) + 1);
> +    __mmask16 mask1 = _mm256_cmpeq_epu16_mask(data1, mch256);
> +    __mmask16 mask2 = _mm256_cmpeq_epu16_mask(data2, mch256);
> +    if (_kortestz_mask16_u8(mask1, mask2))
> +      continue;
> +
> +    unsigned idx = _tzcnt_u32(mask1);
> +    if (mask1 == 0) {
> +      idx = __tzcnt_u16(mask2);
> +      n += 16;
> +    }
> +    return n + idx;
> +  }
> +  return e;
> +}
> +
> +const char16_t *qustrchr1(char16_t *n, char16_t *e, char16_t c) noexcept
> +{
> +  __m256i mch256 = _mm256_set1_epi16(c);
> +  for ( ; n < e; n += 32) {
> +    __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n));
> +    __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n) + 1);
> +    __mmask16 mask1 = _mm256_cmpeq_epu16_mask(data1, mch256);
> +    __mmask16 mask2 = _mm256_cmpeq_epu16_mask(data2, mch256);
> +    if (_kortestz_mask32_u8(mask1, mask2))
> +      continue;
> +
> +    unsigned idx = _tzcnt_u32(mask1);
> +    if (mask1 == 0) {
> +      idx = __tzcnt_u16(mask2);
> +      n += 16;
> +    }
> +    return n + idx;
> +  }
> +  return e;
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
> index 352c49d6c6b..82bb99e30af 100644
> --- a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
> +++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
> @@ -12,7 +12,7 @@ foo_orb (__m512i a, __m512i b)
>    foo = m1 | m2;
>  }
>
> -/* { dg-final { scan-assembler-times "korb\[\t \]" "1" { xfail *-*-* } } }  */
> +/* { dg-final { scan-assembler-times "korb\[\t \]" "1" { xfail { *-*-* && { ! ia32 } } } } }  */
>
>  void
>  foo_xorb (__m512i a, __m512i b)
> @@ -22,7 +22,7 @@ foo_xorb (__m512i a, __m512i b)
>    foo = m1 ^ m2;
>  }
>
> -/* { dg-final { scan-assembler-times "kxorb\[\t \]" "1" { xfail *-*-* } } }  */
> +/* { dg-final { scan-assembler-times "kxorb\[\t \]" "1" { xfail { *-*-* && { ! ia32 } } } } }  */
>
>  void
>  foo_andb (__m512i a, __m512i b)
> @@ -40,4 +40,4 @@ foo_andnb (__m512i a, __m512i b)
>    foo = m1 & ~m2;
>  }
>
> -/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4" { xfail *-*-* } } }  */
> +/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4" { xfail { *-*-* && { ! ia32 } } } } }  */
> --
> 2.18.1
>


-- 
BR,
Hongtao

[-- Attachment #2: 0001-i386-Combine-vpcmpuw-zero_extend-to-vpcmpuw.patch --]
[-- Type: application/octet-stream, Size: 51133 bytes --]

From 19c4a95245aff5aa0d11832211164c9f7e600aed Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Wed, 22 Dec 2021 16:48:54 +0800
Subject: [PATCH] [i386] Combine vpcmpuw + zero_extend to vpcmpuw.

vcmp{ps,ph,pd} and vpcmp{,u}{b,w,d,q} implicitly clear the upper bits
of dest.

gcc/ChangeLog:

	PR target/103750
	* config/i386/sse.md
	(*<avx512>_cmp<V48H_AVX512VL:mode>3_zero_extend<SWI248x:mode>):
	New pre_reload define_insn_and_split.
	(*<avx512>_cmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>):
	Ditto.
	(*<avx512>_ucmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>):
	Ditto.
	(*<avx512>_ucmp<VI48_AVX512VL:mode>3_zero_extend<SWI248x:mode>):
	Ditto.
	(*<avx512>_cmp<V48H_AVX512VL:mode>3_zero_extend<SWI248x:mode>_2):
	Ditto.
	(*<avx512>_cmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>_2):
	Ditto.
	(*<avx512>_ucmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>_2):
	Ditto.
	(*<avx512>_ucmp<VI48_AVX512VL:mode>3_zero_extend<SWI248x:mode>_2):
	Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx512bw-pr103750-1.c: New test.
	* gcc.target/i386/avx512bw-pr103750-2.c: New test.
	* gcc.target/i386/avx512f-pr103750-1.c: New test.
	* gcc.target/i386/avx512f-pr103750-2.c: New test.
	* gcc.target/i386/avx512fp16-pr103750-1.c: New test.
	* gcc.target/i386/avx512fp16-pr103750-2.c: New test.
---
 gcc/config/i386/sse.md                        | 275 ++++++++++
 .../gcc.target/i386/avx512bw-pr103750-1.c     | 154 ++++++
 .../gcc.target/i386/avx512bw-pr103750-2.c     | 173 +++++++
 .../gcc.target/i386/avx512f-pr103750-1.c      | 426 ++++++++++++++++
 .../gcc.target/i386/avx512f-pr103750-2.c      | 478 ++++++++++++++++++
 .../gcc.target/i386/avx512fp16-pr103750-1.c   |  58 +++
 .../gcc.target/i386/avx512fp16-pr103750-2.c   |  71 +++
 7 files changed, 1635 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-pr103750-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-pr103750-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-pr103750-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-pr103750-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-pr103750-2.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 5196149ee32..72cfe85a4b7 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -3702,6 +3702,77 @@ (define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name><round_saeonly_name>"
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+;; Since vpcmpd implicitly clears the upper bits of dest, transform
+;; vpcmpd + zero_extend to a plain vpcmpd; the zero_extend is redundant.
+(define_insn_and_split "*<avx512>_cmp<V48H_AVX512VL:mode>3_zero_extend<SWI248x:mode>"
+  [(set (match_operand:SWI248x 0 "register_operand")
+	(zero_extend:SWI248x
+	  (unspec:<V48H_AVX512VL:avx512fmaskmode>
+	    [(match_operand:V48H_AVX512VL 1 "nonimmediate_operand")
+	     (match_operand:V48H_AVX512VL 2 "nonimmediate_operand")
+	     (match_operand:SI 3 "const_0_to_7_operand")]
+	    UNSPEC_PCMP)))]
+  "TARGET_AVX512F
+   && (!VALID_MASK_AVX512BW_MODE (<SWI248x:MODE>mode) || TARGET_AVX512BW)
+   && ix86_pre_reload_split ()
+   && (GET_MODE_NUNITS (<V48H_AVX512VL:MODE>mode)
+       < GET_MODE_PRECISION (<SWI248x:MODE>mode))"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:<V48H_AVX512VL:avx512fmaskmode>
+	  [(match_dup 1)
+	   (match_dup 2)
+	   (match_dup 3)]
+	  UNSPEC_PCMP))]
+{
+  operands[1] = force_reg (<V48H_AVX512VL:MODE>mode, operands[1]);
+  operands[0] = lowpart_subreg (<V48H_AVX512VL:avx512fmaskmode>mode,
+				operands[0], <SWI248x:MODE>mode);
+}
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<V48H_AVX512VL:sseinsnmode>")])
+
+(define_insn_and_split "*<avx512>_cmp<V48H_AVX512VL:mode>3_zero_extend<SWI248x:mode>_2"
+  [(set (match_operand:SWI248x 0 "register_operand")
+	(zero_extend:SWI248x
+	  (unspec:<V48H_AVX512VL:avx512fmaskmode>
+	    [(match_operand:V48H_AVX512VL 1 "nonimmediate_operand")
+	     (match_operand:V48H_AVX512VL 2 "nonimmediate_operand")
+	     (match_operand:SI 3 "const_0_to_7_operand")]
+	    UNSPEC_PCMP)))
+   (set (match_operand:<V48H_AVX512VL:avx512fmaskmode> 4 "register_operand")
+	(unspec:<V48H_AVX512VL:avx512fmaskmode>
+	    [(match_dup 1)
+	     (match_dup 2)
+	     (match_dup 3)]
+	    UNSPEC_PCMP))]
+  "TARGET_AVX512F
+   && (!VALID_MASK_AVX512BW_MODE (<SWI248x:MODE>mode) || TARGET_AVX512BW)
+   && (GET_MODE_NUNITS (<V48H_AVX512VL:MODE>mode)
+       < GET_MODE_PRECISION (<SWI248x:MODE>mode))
+   && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:<V48H_AVX512VL:avx512fmaskmode>
+	  [(match_dup 1)
+	   (match_dup 2)
+	   (match_dup 3)]
+	  UNSPEC_PCMP))
+   (set (match_dup 4) (match_dup 0))]
+{
+  operands[1] = force_reg (<V48H_AVX512VL:MODE>mode, operands[1]);
+  operands[0] = lowpart_subreg (<V48H_AVX512VL:avx512fmaskmode>mode,
+				operands[0], <SWI248x:MODE>mode);
+}
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<V48H_AVX512VL:sseinsnmode>")])
+
 (define_insn_and_split "*<avx512>_cmp<mode>3"
   [(set (match_operand:<avx512fmaskmode> 0 "register_operand")
 	(not:<avx512fmaskmode>
@@ -3735,6 +3806,73 @@ (define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name>"
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn_and_split "*<avx512>_cmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>"
+  [(set (match_operand:SWI248x 0 "register_operand")
+	(zero_extend:SWI248x
+	  (unspec:<VI12_AVX512VL:avx512fmaskmode>
+	    [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand")
+	     (match_operand:VI12_AVX512VL 2 "nonimmediate_operand")
+	     (match_operand:SI 3 "const_0_to_7_operand")]
+	    UNSPEC_PCMP)))]
+  "TARGET_AVX512BW
+  && ix86_pre_reload_split ()
+  && (GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode)
+      < GET_MODE_PRECISION (<SWI248x:MODE>mode))"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:<VI12_AVX512VL:avx512fmaskmode>
+	  [(match_dup 1)
+	   (match_dup 2)
+	   (match_dup 3)]
+	  UNSPEC_PCMP))]
+{
+  operands[1] = force_reg (<VI12_AVX512VL:MODE>mode, operands[1]);
+  operands[0] = lowpart_subreg (<VI12_AVX512VL:avx512fmaskmode>mode,
+				operands[0], <SWI248x:MODE>mode);
+}
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<VI12_AVX512VL:sseinsnmode>")])
+
+(define_insn_and_split "*<avx512>_cmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>_2"
+  [(set (match_operand:SWI248x 0 "register_operand")
+	(zero_extend:SWI248x
+	  (unspec:<VI12_AVX512VL:avx512fmaskmode>
+	    [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand")
+	     (match_operand:VI12_AVX512VL 2 "nonimmediate_operand")
+	     (match_operand:SI 3 "const_0_to_7_operand")]
+	    UNSPEC_PCMP)))
+   (set (match_operand:<VI12_AVX512VL:avx512fmaskmode> 4 "register_operand")
+	(unspec:<VI12_AVX512VL:avx512fmaskmode>
+	    [(match_dup 1)
+	     (match_dup 2)
+	     (match_dup 3)]
+	    UNSPEC_PCMP))]
+  "TARGET_AVX512BW
+  && (GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode)
+      < GET_MODE_PRECISION (<SWI248x:MODE>mode))
+  && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:<VI12_AVX512VL:avx512fmaskmode>
+	  [(match_dup 1)
+	   (match_dup 2)
+	   (match_dup 3)]
+	  UNSPEC_PCMP))
+   (set (match_dup 4) (match_dup 0))]
+{
+  operands[1] = force_reg (<VI12_AVX512VL:MODE>mode, operands[1]);
+  operands[0] = lowpart_subreg (<VI12_AVX512VL:avx512fmaskmode>mode,
+				operands[0], <SWI248x:MODE>mode);
+}
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<VI12_AVX512VL:sseinsnmode>")])
+
 (define_int_iterator UNSPEC_PCMP_ITER
   [UNSPEC_PCMP UNSPEC_UNSIGNED_PCMP])
 
@@ -3771,6 +3909,74 @@ (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn_and_split "*<avx512>_ucmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>"
+  [(set (match_operand:SWI248x 0 "register_operand")
+	(zero_extend:SWI248x
+	  (unspec:<VI12_AVX512VL:avx512fmaskmode>
+	    [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand")
+	     (match_operand:VI12_AVX512VL 2 "nonimmediate_operand")
+	     (match_operand:SI 3 "const_0_to_7_operand")]
+	    UNSPEC_UNSIGNED_PCMP)))]
+  "TARGET_AVX512BW
+  && ix86_pre_reload_split ()
+  && (GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode)
+      < GET_MODE_PRECISION (<SWI248x:MODE>mode))"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:<VI12_AVX512VL:avx512fmaskmode>
+	  [(match_dup 1)
+	   (match_dup 2)
+	   (match_dup 3)]
+	  UNSPEC_UNSIGNED_PCMP))]
+{
+  operands[1] = force_reg (<VI12_AVX512VL:MODE>mode, operands[1]);
+  operands[0] = lowpart_subreg (<VI12_AVX512VL:avx512fmaskmode>mode,
+				operands[0], <SWI248x:MODE>mode);
+}
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<VI12_AVX512VL:sseinsnmode>")])
+
+;; Variant for when the compare result is used both zero-extended and as-is.
+(define_insn_and_split "*<avx512>_ucmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>_2"
+  [(set (match_operand:SWI248x 0 "register_operand")
+	(zero_extend:SWI248x
+	  (unspec:<VI12_AVX512VL:avx512fmaskmode>
+	    [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand")
+	     (match_operand:VI12_AVX512VL 2 "nonimmediate_operand")
+	     (match_operand:SI 3 "const_0_to_7_operand")]
+	    UNSPEC_UNSIGNED_PCMP)))
+   (set (match_operand:<VI12_AVX512VL:avx512fmaskmode> 4 "register_operand")
+	(unspec:<VI12_AVX512VL:avx512fmaskmode>
+	    [(match_dup 1)
+	     (match_dup 2)
+	     (match_dup 3)]
+	    UNSPEC_UNSIGNED_PCMP))]
+  "TARGET_AVX512BW
+   && (GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode)
+       < GET_MODE_PRECISION (<SWI248x:MODE>mode))
+   && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:<VI12_AVX512VL:avx512fmaskmode>
+	  [(match_dup 1)
+	   (match_dup 2)
+	   (match_dup 3)]
+	  UNSPEC_UNSIGNED_PCMP))
+   (set (match_dup 4) (match_dup 0))]
+{
+  operands[1] = force_reg (<VI12_AVX512VL:MODE>mode, operands[1]);
+  operands[0] = lowpart_subreg (<VI12_AVX512VL:avx512fmaskmode>mode,
+				operands[0], <SWI248x:MODE>mode);
+}
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<VI12_AVX512VL:sseinsnmode>")])
+
 (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
   [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
 	(unspec:<avx512fmaskmode>
@@ -3785,6 +3991,75 @@ (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn_and_split "*<avx512>_ucmp<VI48_AVX512VL:mode>3_zero_extend<SWI248x:mode>"
+  [(set (match_operand:SWI248x 0 "register_operand")
+	(zero_extend:SWI248x
+	  (unspec:<VI48_AVX512VL:avx512fmaskmode>
+	    [(match_operand:VI48_AVX512VL 1 "nonimmediate_operand")
+	     (match_operand:VI48_AVX512VL 2 "nonimmediate_operand")
+	     (match_operand:SI 3 "const_0_to_7_operand")]
+	    UNSPEC_UNSIGNED_PCMP)))]
+  "TARGET_AVX512F
+   && (!VALID_MASK_AVX512BW_MODE (<SWI248x:MODE>mode) || TARGET_AVX512BW)
+   && ix86_pre_reload_split ()
+   && (GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode)
+      < GET_MODE_PRECISION (<SWI248x:MODE>mode))"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:<VI48_AVX512VL:avx512fmaskmode>
+	  [(match_dup 1)
+	   (match_dup 2)
+	   (match_dup 3)]
+	  UNSPEC_UNSIGNED_PCMP))]
+{
+  operands[1] = force_reg (<VI48_AVX512VL:MODE>mode, operands[1]);
+  operands[0] = lowpart_subreg (<VI48_AVX512VL:avx512fmaskmode>mode,
+				operands[0], <SWI248x:MODE>mode);
+}
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<VI48_AVX512VL:sseinsnmode>")])
+
+(define_insn_and_split "*<avx512>_ucmp<VI48_AVX512VL:mode>3_zero_extend<SWI248x:mode>_2"
+  [(set (match_operand:SWI248x 0 "register_operand")
+	(zero_extend:SWI248x
+	  (unspec:<VI48_AVX512VL:avx512fmaskmode>
+	    [(match_operand:VI48_AVX512VL 1 "nonimmediate_operand")
+	     (match_operand:VI48_AVX512VL 2 "nonimmediate_operand")
+	     (match_operand:SI 3 "const_0_to_7_operand")]
+	    UNSPEC_UNSIGNED_PCMP)))
+   (set (match_operand:<VI48_AVX512VL:avx512fmaskmode> 4 "register_operand")
+	(unspec:<VI48_AVX512VL:avx512fmaskmode>
+	    [(match_dup 1)
+	     (match_dup 2)
+	     (match_dup 3)]
+	    UNSPEC_UNSIGNED_PCMP))]
+  "TARGET_AVX512F
+   && (!VALID_MASK_AVX512BW_MODE (<SWI248x:MODE>mode) || TARGET_AVX512BW)
+   && (GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode)
+       < GET_MODE_PRECISION (<SWI248x:MODE>mode))
+   && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:<VI48_AVX512VL:avx512fmaskmode>
+	  [(match_dup 1)
+	   (match_dup 2)
+	   (match_dup 3)]
+	  UNSPEC_UNSIGNED_PCMP))
+   (set (match_dup 4) (match_dup 0))]
+{
+  operands[1] = force_reg (<VI48_AVX512VL:MODE>mode, operands[1]);
+  operands[0] = lowpart_subreg (<VI48_AVX512VL:avx512fmaskmode>mode,
+				operands[0], <SWI248x:MODE>mode);
+}
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<VI48_AVX512VL:sseinsnmode>")])
+
 (define_insn_and_split "*<avx512>_ucmp<mode>3"
   [(set (match_operand:<avx512fmaskmode> 0 "register_operand")
 	(not:<avx512fmaskmode>
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-1.c
new file mode 100644
index 00000000000..b1165f069bb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-1.c
@@ -0,0 +1,154 @@
+/* PR target/103750 */
+/* { dg-do compile }  */
+/* { dg-options "-O2 -mavx512bw -mavx512vl" } */
+/* { dg-final { scan-assembler-not "kmov" { xfail ia32 } } } */
+/* The ia32 xfail above needs to be fixed.  */
+
+#include <immintrin.h>
+extern __m128i* pi128;
+extern __m256i* pi256;
+extern __m512i* pi512;
+
+unsigned char
+foo ()
+{
+  __mmask16 mask1 = _mm_cmpeq_epu8_mask (pi128[0], pi128[1]);
+  __mmask16 mask2 = _mm_cmpeq_epu8_mask (pi128[2], pi128[3]);
+  return _kortestz_mask32_u8 (mask1, mask2);
+}
+
+unsigned char
+foo1 ()
+{
+  __mmask16 mask1 = _mm_cmpeq_epu8_mask (pi128[0], pi128[1]);
+  __mmask16 mask2 = _mm_cmpeq_epu8_mask (pi128[2], pi128[3]);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
+
+unsigned char
+foo2 ()
+{
+  __mmask32 mask1 = _mm256_cmpeq_epu8_mask (pi256[0], pi256[1]);
+  __mmask32 mask2 = _mm256_cmpeq_epu8_mask (pi256[2], pi256[3]);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
+
+unsigned char
+foo3 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epu16_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epu16_mask (pi128[1], pi128[2]);
+  return _kortestz_mask16_u8 (mask1, mask2);
+}
+
+unsigned char
+foo4 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epu16_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epu16_mask (pi128[1], pi128[2]);
+  return _kortestz_mask32_u8 (mask1, mask2);
+}
+
+unsigned char
+foo5 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epu16_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epu16_mask (pi128[1], pi128[2]);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
+
+unsigned char
+foo6 ()
+{
+  __mmask16 mask1 = _mm256_cmpeq_epu16_mask (pi256[0], pi256[1]);
+  __mmask16 mask2 = _mm256_cmpeq_epu16_mask (pi256[2], pi256[3]);
+  return _kortestz_mask32_u8 (mask1, mask2);
+}
+
+unsigned char
+foo7 ()
+{
+  __mmask16 mask1 = _mm256_cmpeq_epu16_mask (pi256[0], pi256[1]);
+  __mmask16 mask2 = _mm256_cmpeq_epu16_mask (pi256[2], pi256[3]);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
+
+unsigned char
+foo8 ()
+{
+  __mmask32 mask1 = _mm512_cmpeq_epu16_mask (pi512[0], pi512[1]);
+  __mmask32 mask2 = _mm512_cmpeq_epu16_mask (pi512[2], pi512[3]);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo ()
+{
+  __mmask16 mask1 = _mm_cmpeq_epi8_mask (pi128[0], pi128[1]);
+  __mmask16 mask2 = _mm_cmpeq_epi8_mask (pi128[2], pi128[3]);
+  return _kortestz_mask32_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo1 ()
+{
+  __mmask16 mask1 = _mm_cmpeq_epi8_mask (pi128[0], pi128[1]);
+  __mmask16 mask2 = _mm_cmpeq_epi8_mask (pi128[2], pi128[3]);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo2 ()
+{
+  __mmask32 mask1 = _mm256_cmpeq_epi8_mask (pi256[0], pi256[1]);
+  __mmask32 mask2 = _mm256_cmpeq_epi8_mask (pi256[2], pi256[3]);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo3 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epi16_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epi16_mask (pi128[1], pi128[2]);
+  return _kortestz_mask16_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo4 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epi16_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epi16_mask (pi128[1], pi128[2]);
+  return _kortestz_mask32_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo5 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epi16_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epi16_mask (pi128[1], pi128[2]);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo6 ()
+{
+  __mmask16 mask1 = _mm256_cmpeq_epi16_mask (pi256[0], pi256[1]);
+  __mmask16 mask2 = _mm256_cmpeq_epi16_mask (pi256[2], pi256[3]);
+  return _kortestz_mask32_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo7 ()
+{
+  __mmask16 mask1 = _mm256_cmpeq_epi16_mask (pi256[0], pi256[1]);
+  __mmask16 mask2 = _mm256_cmpeq_epi16_mask (pi256[2], pi256[3]);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo8 ()
+{
+  __mmask32 mask1 = _mm512_cmpeq_epi16_mask (pi512[0], pi512[1]);
+  __mmask32 mask2 = _mm512_cmpeq_epi16_mask (pi512[2], pi512[3]);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c
new file mode 100644
index 00000000000..7303f5403ba
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c
@@ -0,0 +1,173 @@
+/* PR target/103750 */
+/* { dg-do compile }  */
+/* { dg-options "-O2 -mavx512dq -mavx512bw -mavx512vl" } */
+/* { dg-final { scan-assembler-not "kmov" { xfail ia32 } } } */
+/* The ia32 xfail above needs to be fixed.  */
+
+#include <immintrin.h>
+extern __m128i* pi128;
+extern __m256i* pi256;
+extern __m512i* pi512;
+
+extern char a, b;
+void
+foo ()
+{
+  __mmask16 mask1 = _mm_cmpeq_epu8_mask (pi128[0], pi128[1]);
+  __mmask16 mask2 = _mm_cmpeq_epu8_mask (pi128[2], pi128[3]);
+  a = _kortestz_mask32_u8 (mask1, mask2);
+  b = _kortestz_mask16_u8 (mask1, mask2);
+}
+
+void
+foo1 ()
+{
+  __mmask16 mask1 = _mm_cmpeq_epu8_mask (pi128[0], pi128[1]);
+  __mmask16 mask2 = _mm_cmpeq_epu8_mask (pi128[2], pi128[3]);
+  a = _kortestz_mask64_u8 (mask1, mask2);
+  b = _kortestz_mask16_u8 (mask1, mask2);
+}
+
+void
+foo2 ()
+{
+  __mmask32 mask1 = _mm256_cmpeq_epu8_mask (pi256[0], pi256[1]);
+  __mmask32 mask2 = _mm256_cmpeq_epu8_mask (pi256[2], pi256[3]);
+  a = _kortestz_mask64_u8 (mask1, mask2);
+  b = _kortestz_mask32_u8 (mask1, mask2);
+}
+
+void
+foo3 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epu16_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epu16_mask (pi128[1], pi128[2]);
+  a = _kortestz_mask16_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+foo4 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epu16_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epu16_mask (pi128[1], pi128[2]);
+  a = _kortestz_mask32_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+foo5 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epu16_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epu16_mask (pi128[1], pi128[2]);
+  a = _kortestz_mask64_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+foo6 ()
+{
+  __mmask16 mask1 = _mm256_cmpeq_epu16_mask (pi256[0], pi256[1]);
+  __mmask16 mask2 = _mm256_cmpeq_epu16_mask (pi256[2], pi256[3]);
+  a = _kortestz_mask32_u8 (mask1, mask2);
+  b = _kortestz_mask16_u8 (mask1, mask2);
+}
+
+void
+foo7 ()
+{
+  __mmask16 mask1 = _mm256_cmpeq_epu16_mask (pi256[0], pi256[1]);
+  __mmask16 mask2 = _mm256_cmpeq_epu16_mask (pi256[2], pi256[3]);
+  a = _kortestz_mask64_u8 (mask1, mask2);
+  b = _kortestz_mask16_u8 (mask1, mask2);
+}
+
+void
+foo8 ()
+{
+  __mmask32 mask1 = _mm512_cmpeq_epu16_mask (pi512[0], pi512[1]);
+  __mmask32 mask2 = _mm512_cmpeq_epu16_mask (pi512[2], pi512[3]);
+  a = _kortestz_mask64_u8 (mask1, mask2);
+  b = _kortestz_mask32_u8 (mask1, mask2);
+}
+
+void
+sign_foo ()
+{
+  __mmask16 mask1 = _mm_cmpeq_epi8_mask (pi128[0], pi128[1]);
+  __mmask16 mask2 = _mm_cmpeq_epi8_mask (pi128[2], pi128[3]);
+  a = _kortestz_mask32_u8 (mask1, mask2);
+  b = _kortestz_mask16_u8 (mask1, mask2);
+}
+
+void
+sign_foo1 ()
+{
+  __mmask16 mask1 = _mm_cmpeq_epi8_mask (pi128[0], pi128[1]);
+  __mmask16 mask2 = _mm_cmpeq_epi8_mask (pi128[2], pi128[3]);
+  a = _kortestz_mask64_u8 (mask1, mask2);
+  b = _kortestz_mask16_u8 (mask1, mask2);
+}
+
+void
+sign_foo2 ()
+{
+  __mmask32 mask1 = _mm256_cmpeq_epi8_mask (pi256[0], pi256[1]);
+  __mmask32 mask2 = _mm256_cmpeq_epi8_mask (pi256[2], pi256[3]);
+  a = _kortestz_mask64_u8 (mask1, mask2);
+  b = _kortestz_mask32_u8 (mask1, mask2);
+}
+
+void
+sign_foo3 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epi16_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epi16_mask (pi128[1], pi128[2]);
+  a = _kortestz_mask16_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+sign_foo4 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epi16_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epi16_mask (pi128[1], pi128[2]);
+  a = _kortestz_mask32_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+sign_foo5 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epi16_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epi16_mask (pi128[1], pi128[2]);
+  a = _kortestz_mask64_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+sign_foo6 ()
+{
+  __mmask16 mask1 = _mm256_cmpeq_epi16_mask (pi256[0], pi256[1]);
+  __mmask16 mask2 = _mm256_cmpeq_epi16_mask (pi256[2], pi256[3]);
+  a = _kortestz_mask32_u8 (mask1, mask2);
+  b = _kortestz_mask16_u8 (mask1, mask2);
+}
+
+void
+sign_foo7 ()
+{
+  __mmask16 mask1 = _mm256_cmpeq_epi16_mask (pi256[0], pi256[1]);
+  __mmask16 mask2 = _mm256_cmpeq_epi16_mask (pi256[2], pi256[3]);
+  a = _kortestz_mask64_u8 (mask1, mask2);
+  b = _kortestz_mask16_u8 (mask1, mask2);
+}
+
+void
+sign_foo8 ()
+{
+  __mmask32 mask1 = _mm512_cmpeq_epi16_mask (pi512[0], pi512[1]);
+  __mmask32 mask2 = _mm512_cmpeq_epi16_mask (pi512[2], pi512[3]);
+  a = _kortestz_mask64_u8 (mask1, mask2);
+  b = _kortestz_mask32_u8 (mask1, mask2);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr103750-1.c b/gcc/testsuite/gcc.target/i386/avx512f-pr103750-1.c
new file mode 100644
index 00000000000..613efe0f926
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr103750-1.c
@@ -0,0 +1,426 @@
+/* PR target/103750 */
+/* { dg-do compile }  */
+/* { dg-options "-O2 -mavx512f -mavx512vl -mavx512bw" } */
+/* { dg-final { scan-assembler-not "kmov" { xfail ia32 } } } */
+/* The ia32 xfail above needs to be fixed.  */
+
+#include <immintrin.h>
+extern __m128i* pi128;
+extern __m256i* pi256;
+extern __m512i* pi512;
+
+extern __m128* ps128;
+extern __m256* ps256;
+extern __m512* ps512;
+
+extern __m128d* pd128;
+extern __m256d* pd256;
+extern __m512d* pd512;
+
+unsigned char
+foo ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epu32_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epu32_mask (pi128[2], pi128[3]);
+  return _kortestz_mask16_u8 (mask1, mask2);
+}
+
+unsigned char
+foo1 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epu32_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epu32_mask (pi128[2], pi128[3]);
+  return _kortestz_mask32_u8 (mask1, mask2);
+}
+
+unsigned char
+foo2 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epu32_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epu32_mask (pi128[2], pi128[3]);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
+
+unsigned char
+foo3 ()
+{
+  __mmask8 mask1 = _mm256_cmpeq_epu32_mask (pi256[0], pi256[1]);
+  __mmask8 mask2 = _mm256_cmpeq_epu32_mask (pi256[2], pi256[3]);
+  return _kortestz_mask16_u8 (mask1, mask2);
+}
+
+unsigned char
+foo4 ()
+{
+  __mmask8 mask1 = _mm256_cmpeq_epu32_mask (pi256[0], pi256[1]);
+  __mmask8 mask2 = _mm256_cmpeq_epu32_mask (pi256[2], pi256[3]);
+  return _kortestz_mask32_u8 (mask1, mask2);
+}
+
+unsigned char
+foo5 ()
+{
+  __mmask8 mask1 = _mm256_cmpeq_epu32_mask (pi256[0], pi256[1]);
+  __mmask8 mask2 = _mm256_cmpeq_epu32_mask (pi256[2], pi256[3]);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
+
+unsigned char
+foo6 ()
+{
+  __mmask16 mask1 = _mm512_cmpeq_epu32_mask (pi512[0], pi512[1]);
+  __mmask16 mask2 = _mm512_cmpeq_epu32_mask (pi512[2], pi512[3]);
+  return _kortestz_mask32_u8 (mask1, mask2);
+}
+
+unsigned char
+foo7 ()
+{
+  __mmask16 mask1 = _mm512_cmpeq_epu32_mask (pi512[0], pi512[1]);
+  __mmask16 mask2 = _mm512_cmpeq_epu32_mask (pi512[2], pi512[3]);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
+
+unsigned char
+foo8 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epu64_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epu64_mask (pi128[2], pi128[3]);
+  return _kortestz_mask16_u8 (mask1, mask2);
+}
+
+unsigned char
+foo9 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epu64_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epu64_mask (pi128[2], pi128[3]);
+  return _kortestz_mask32_u8 (mask1, mask2);
+}
+
+unsigned char
+foo10 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epu64_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epu64_mask (pi128[2], pi128[3]);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
+
+unsigned char
+foo11 ()
+{
+  __mmask8 mask1 = _mm256_cmpeq_epu64_mask (pi256[0], pi256[1]);
+  __mmask8 mask2 = _mm256_cmpeq_epu64_mask (pi256[2], pi256[3]);
+  return _kortestz_mask16_u8 (mask1, mask2);
+}
+
+unsigned char
+foo12 ()
+{
+  __mmask8 mask1 = _mm256_cmpeq_epu64_mask (pi256[0], pi256[1]);
+  __mmask8 mask2 = _mm256_cmpeq_epu64_mask (pi256[2], pi256[3]);
+  return _kortestz_mask32_u8 (mask1, mask2);
+}
+
+unsigned char
+foo13 ()
+{
+  __mmask8 mask1 = _mm256_cmpeq_epu64_mask (pi256[0], pi256[1]);
+  __mmask8 mask2 = _mm256_cmpeq_epu64_mask (pi256[2], pi256[3]);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
+
+unsigned char
+foo14 ()
+{
+  __mmask8 mask1 = _mm512_cmpeq_epu64_mask (pi512[0], pi512[1]);
+  __mmask8 mask2 = _mm512_cmpeq_epu64_mask (pi512[2], pi512[3]);
+  return _kortestz_mask16_u8 (mask1, mask2);
+}
+
+unsigned char
+foo15 ()
+{
+  __mmask8 mask1 = _mm512_cmpeq_epu64_mask (pi512[0], pi512[1]);
+  __mmask8 mask2 = _mm512_cmpeq_epu64_mask (pi512[2], pi512[3]);
+  return _kortestz_mask32_u8 (mask1, mask2);
+}
+
+unsigned char
+foo16 ()
+{
+  __mmask8 mask1 = _mm512_cmpeq_epu64_mask (pi512[0], pi512[1]);
+  __mmask8 mask2 = _mm512_cmpeq_epu64_mask (pi512[2], pi512[3]);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epi32_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epi32_mask (pi128[2], pi128[3]);
+  return _kortestz_mask16_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo1 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epi32_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epi32_mask (pi128[2], pi128[3]);
+  return _kortestz_mask32_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo2 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epi32_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epi32_mask (pi128[2], pi128[3]);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo3 ()
+{
+  __mmask8 mask1 = _mm256_cmpeq_epi32_mask (pi256[0], pi256[1]);
+  __mmask8 mask2 = _mm256_cmpeq_epi32_mask (pi256[2], pi256[3]);
+  return _kortestz_mask16_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo4 ()
+{
+  __mmask8 mask1 = _mm256_cmpeq_epi32_mask (pi256[0], pi256[1]);
+  __mmask8 mask2 = _mm256_cmpeq_epi32_mask (pi256[2], pi256[3]);
+  return _kortestz_mask32_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo5 ()
+{
+  __mmask8 mask1 = _mm256_cmpeq_epi32_mask (pi256[0], pi256[1]);
+  __mmask8 mask2 = _mm256_cmpeq_epi32_mask (pi256[2], pi256[3]);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo6 ()
+{
+  __mmask16 mask1 = _mm512_cmpeq_epi32_mask (pi512[0], pi512[1]);
+  __mmask16 mask2 = _mm512_cmpeq_epi32_mask (pi512[2], pi512[3]);
+  return _kortestz_mask32_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo7 ()
+{
+  __mmask16 mask1 = _mm512_cmpeq_epi32_mask (pi512[0], pi512[1]);
+  __mmask16 mask2 = _mm512_cmpeq_epi32_mask (pi512[2], pi512[3]);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo8 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epi64_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epi64_mask (pi128[2], pi128[3]);
+  return _kortestz_mask16_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo9 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epi64_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epi64_mask (pi128[2], pi128[3]);
+  return _kortestz_mask32_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo10 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epi64_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epi64_mask (pi128[2], pi128[3]);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo11 ()
+{
+  __mmask8 mask1 = _mm256_cmpeq_epi64_mask (pi256[0], pi256[1]);
+  __mmask8 mask2 = _mm256_cmpeq_epi64_mask (pi256[2], pi256[3]);
+  return _kortestz_mask16_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo12 ()
+{
+  __mmask8 mask1 = _mm256_cmpeq_epi64_mask (pi256[0], pi256[1]);
+  __mmask8 mask2 = _mm256_cmpeq_epi64_mask (pi256[2], pi256[3]);
+  return _kortestz_mask32_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo13 ()
+{
+  __mmask8 mask1 = _mm256_cmpeq_epi64_mask (pi256[0], pi256[1]);
+  __mmask8 mask2 = _mm256_cmpeq_epi64_mask (pi256[2], pi256[3]);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo14 ()
+{
+  __mmask8 mask1 = _mm512_cmpeq_epi64_mask (pi512[0], pi512[1]);
+  __mmask8 mask2 = _mm512_cmpeq_epi64_mask (pi512[2], pi512[3]);
+  return _kortestz_mask16_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo15 ()
+{
+  __mmask8 mask1 = _mm512_cmpeq_epi64_mask (pi512[0], pi512[1]);
+  __mmask8 mask2 = _mm512_cmpeq_epi64_mask (pi512[2], pi512[3]);
+  return _kortestz_mask32_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo16 ()
+{
+  __mmask8 mask1 = _mm512_cmpeq_epi64_mask (pi512[0], pi512[1]);
+  __mmask8 mask2 = _mm512_cmpeq_epi64_mask (pi512[2], pi512[3]);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
+
+unsigned char
+float_foo ()
+{
+  __mmask8 mask1 = _mm_cmp_ps_mask (ps128[0], ps128[1], 1);
+  __mmask8 mask2 = _mm_cmp_ps_mask (ps128[2], ps128[3], 1);
+  return _kortestz_mask16_u8 (mask1, mask2);
+}
+
+unsigned char
+float_foo1 ()
+{
+  __mmask8 mask1 = _mm_cmp_ps_mask (ps128[0], ps128[1], 1);
+  __mmask8 mask2 = _mm_cmp_ps_mask (ps128[2], ps128[3], 1);
+  return _kortestz_mask32_u8 (mask1, mask2);
+}
+
+unsigned char
+float_foo2 ()
+{
+  __mmask8 mask1 = _mm_cmp_ps_mask (ps128[0], ps128[1], 1);
+  __mmask8 mask2 = _mm_cmp_ps_mask (ps128[2], ps128[3], 1);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
+
+unsigned char
+float_foo3 ()
+{
+  __mmask8 mask1 = _mm256_cmp_ps_mask (ps256[0], ps256[1], 1);
+  __mmask8 mask2 = _mm256_cmp_ps_mask (ps256[2], ps256[3], 1);
+  return _kortestz_mask16_u8 (mask1, mask2);
+}
+
+unsigned char
+float_foo4 ()
+{
+  __mmask8 mask1 = _mm256_cmp_ps_mask (ps256[0], ps256[1], 1);
+  __mmask8 mask2 = _mm256_cmp_ps_mask (ps256[2], ps256[3], 1);
+  return _kortestz_mask32_u8 (mask1, mask2);
+}
+
+unsigned char
+float_foo5 ()
+{
+  __mmask8 mask1 = _mm256_cmp_ps_mask (ps256[0], ps256[1], 1);
+  __mmask8 mask2 = _mm256_cmp_ps_mask (ps256[2], ps256[3], 1);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
+
+unsigned char
+float_foo6 ()
+{
+  __mmask16 mask1 = _mm512_cmp_ps_mask (ps512[0], ps512[1], 1);
+  __mmask16 mask2 = _mm512_cmp_ps_mask (ps512[2], ps512[3], 1);
+  return _kortestz_mask32_u8 (mask1, mask2);
+}
+
+unsigned char
+float_foo7 ()
+{
+  __mmask16 mask1 = _mm512_cmp_ps_mask (ps512[0], ps512[1], 1);
+  __mmask16 mask2 = _mm512_cmp_ps_mask (ps512[2], ps512[3], 1);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
+
+unsigned char
+float_foo8 ()
+{
+  __mmask8 mask1 = _mm_cmp_pd_mask (pd128[0], pd128[1], 1);
+  __mmask8 mask2 = _mm_cmp_pd_mask (pd128[2], pd128[3], 1);
+  return _kortestz_mask16_u8 (mask1, mask2);
+}
+
+unsigned char
+float_foo9 ()
+{
+  __mmask8 mask1 = _mm_cmp_pd_mask (pd128[0], pd128[1], 1);
+  __mmask8 mask2 = _mm_cmp_pd_mask (pd128[2], pd128[3], 1);
+  return _kortestz_mask32_u8 (mask1, mask2);
+}
+
+unsigned char
+float_foo10 ()
+{
+  __mmask8 mask1 = _mm_cmp_pd_mask (pd128[0], pd128[1], 1);
+  __mmask8 mask2 = _mm_cmp_pd_mask (pd128[2], pd128[3], 1);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
+
+unsigned char
+float_foo11 ()
+{
+  __mmask8 mask1 = _mm256_cmp_pd_mask (pd256[0], pd256[1], 1);
+  __mmask8 mask2 = _mm256_cmp_pd_mask (pd256[2], pd256[3], 1);
+  return _kortestz_mask16_u8 (mask1, mask2);
+}
+
+unsigned char
+float_foo12 ()
+{
+  __mmask8 mask1 = _mm256_cmp_pd_mask (pd256[0], pd256[1], 1);
+  __mmask8 mask2 = _mm256_cmp_pd_mask (pd256[2], pd256[3], 1);
+  return _kortestz_mask32_u8 (mask1, mask2);
+}
+
+unsigned char
+float_foo13 ()
+{
+  __mmask8 mask1 = _mm256_cmp_pd_mask (pd256[0], pd256[1], 1);
+  __mmask8 mask2 = _mm256_cmp_pd_mask (pd256[2], pd256[3], 1);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
+
+unsigned char
+float_foo14 ()
+{
+  __mmask8 mask1 = _mm512_cmp_pd_mask (pd512[0], pd512[1], 1);
+  __mmask8 mask2 = _mm512_cmp_pd_mask (pd512[2], pd512[3], 1);
+  return _kortestz_mask16_u8 (mask1, mask2);
+}
+
+unsigned char
+float_foo15 ()
+{
+  __mmask8 mask1 = _mm512_cmp_pd_mask (pd512[0], pd512[1], 1);
+  __mmask8 mask2 = _mm512_cmp_pd_mask (pd512[2], pd512[3], 1);
+  return _kortestz_mask32_u8 (mask1, mask2);
+}
+
+unsigned char
+float_foo16 ()
+{
+  __mmask8 mask1 = _mm512_cmp_pd_mask (pd512[0], pd512[1], 1);
+  __mmask8 mask2 = _mm512_cmp_pd_mask (pd512[2], pd512[3], 1);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr103750-2.c b/gcc/testsuite/gcc.target/i386/avx512f-pr103750-2.c
new file mode 100644
index 00000000000..a6c2b06747d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr103750-2.c
@@ -0,0 +1,478 @@
+/* PR target/103750 */
+/* { dg-do compile }  */
+/* { dg-options "-O2 -mavx512dq -mavx512vl -mavx512bw" } */
+/* { dg-final { scan-assembler-not "kmov" { xfail ia32 } } } */
+/* The xfail on ia32 needs to be fixed.  */
+
+#include <immintrin.h>
+extern __m128i* pi128;
+extern __m256i* pi256;
+extern __m512i* pi512;
+
+extern __m128* ps128;
+extern __m256* ps256;
+extern __m512* ps512;
+
+extern __m128d* pd128;
+extern __m256d* pd256;
+extern __m512d* pd512;
+
+extern char a, b;
+void
+foo ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epu32_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epu32_mask (pi128[2], pi128[3]);
+  a = _kortestz_mask16_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+foo1 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epu32_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epu32_mask (pi128[2], pi128[3]);
+  a = _kortestz_mask32_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+foo2 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epu32_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epu32_mask (pi128[2], pi128[3]);
+  a = _kortestz_mask64_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+foo3 ()
+{
+  __mmask8 mask1 = _mm256_cmpeq_epu32_mask (pi256[0], pi256[1]);
+  __mmask8 mask2 = _mm256_cmpeq_epu32_mask (pi256[2], pi256[3]);
+  a = _kortestz_mask16_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+foo4 ()
+{
+  __mmask8 mask1 = _mm256_cmpeq_epu32_mask (pi256[0], pi256[1]);
+  __mmask8 mask2 = _mm256_cmpeq_epu32_mask (pi256[2], pi256[3]);
+  a = _kortestz_mask32_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+foo5 ()
+{
+  __mmask8 mask1 = _mm256_cmpeq_epu32_mask (pi256[0], pi256[1]);
+  __mmask8 mask2 = _mm256_cmpeq_epu32_mask (pi256[2], pi256[3]);
+  a = _kortestz_mask64_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+foo6 ()
+{
+  __mmask16 mask1 = _mm512_cmpeq_epu32_mask (pi512[0], pi512[1]);
+  __mmask16 mask2 = _mm512_cmpeq_epu32_mask (pi512[2], pi512[3]);
+  a = _kortestz_mask32_u8 (mask1, mask2);
+  b = _kortestz_mask16_u8 (mask1, mask2);
+}
+
+void
+foo7 ()
+{
+  __mmask16 mask1 = _mm512_cmpeq_epu32_mask (pi512[0], pi512[1]);
+  __mmask16 mask2 = _mm512_cmpeq_epu32_mask (pi512[2], pi512[3]);
+  a = _kortestz_mask64_u8 (mask1, mask2);
+  b = _kortestz_mask16_u8 (mask1, mask2);
+}
+
+void
+foo8 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epu64_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epu64_mask (pi128[2], pi128[3]);
+  a = _kortestz_mask16_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+foo9 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epu64_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epu64_mask (pi128[2], pi128[3]);
+  a = _kortestz_mask32_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+foo10 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epu64_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epu64_mask (pi128[2], pi128[3]);
+  a = _kortestz_mask64_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+foo11 ()
+{
+  __mmask8 mask1 = _mm256_cmpeq_epu64_mask (pi256[0], pi256[1]);
+  __mmask8 mask2 = _mm256_cmpeq_epu64_mask (pi256[2], pi256[3]);
+  a = _kortestz_mask16_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+foo12 ()
+{
+  __mmask8 mask1 = _mm256_cmpeq_epu64_mask (pi256[0], pi256[1]);
+  __mmask8 mask2 = _mm256_cmpeq_epu64_mask (pi256[2], pi256[3]);
+  a = _kortestz_mask32_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+foo13 ()
+{
+  __mmask8 mask1 = _mm256_cmpeq_epu64_mask (pi256[0], pi256[1]);
+  __mmask8 mask2 = _mm256_cmpeq_epu64_mask (pi256[2], pi256[3]);
+  a = _kortestz_mask64_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+foo14 ()
+{
+  __mmask8 mask1 = _mm512_cmpeq_epu64_mask (pi512[0], pi512[1]);
+  __mmask8 mask2 = _mm512_cmpeq_epu64_mask (pi512[2], pi512[3]);
+  a = _kortestz_mask16_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+foo15 ()
+{
+  __mmask8 mask1 = _mm512_cmpeq_epu64_mask (pi512[0], pi512[1]);
+  __mmask8 mask2 = _mm512_cmpeq_epu64_mask (pi512[2], pi512[3]);
+  a = _kortestz_mask32_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+foo16 ()
+{
+  __mmask8 mask1 = _mm512_cmpeq_epu64_mask (pi512[0], pi512[1]);
+  __mmask8 mask2 = _mm512_cmpeq_epu64_mask (pi512[2], pi512[3]);
+  a = _kortestz_mask64_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+sign_foo ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epi32_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epi32_mask (pi128[2], pi128[3]);
+  a = _kortestz_mask16_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+sign_foo1 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epi32_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epi32_mask (pi128[2], pi128[3]);
+  a = _kortestz_mask32_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+sign_foo2 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epi32_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epi32_mask (pi128[2], pi128[3]);
+  a = _kortestz_mask64_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+sign_foo3 ()
+{
+  __mmask8 mask1 = _mm256_cmpeq_epi32_mask (pi256[0], pi256[1]);
+  __mmask8 mask2 = _mm256_cmpeq_epi32_mask (pi256[2], pi256[3]);
+  a = _kortestz_mask16_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+sign_foo4 ()
+{
+  __mmask8 mask1 = _mm256_cmpeq_epi32_mask (pi256[0], pi256[1]);
+  __mmask8 mask2 = _mm256_cmpeq_epi32_mask (pi256[2], pi256[3]);
+  a = _kortestz_mask32_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+sign_foo5 ()
+{
+  __mmask8 mask1 = _mm256_cmpeq_epi32_mask (pi256[0], pi256[1]);
+  __mmask8 mask2 = _mm256_cmpeq_epi32_mask (pi256[2], pi256[3]);
+  a = _kortestz_mask64_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+sign_foo6 ()
+{
+  __mmask16 mask1 = _mm512_cmpeq_epi32_mask (pi512[0], pi512[1]);
+  __mmask16 mask2 = _mm512_cmpeq_epi32_mask (pi512[2], pi512[3]);
+  a = _kortestz_mask32_u8 (mask1, mask2);
+  b = _kortestz_mask16_u8 (mask1, mask2);
+}
+
+void
+sign_foo7 ()
+{
+  __mmask16 mask1 = _mm512_cmpeq_epi32_mask (pi512[0], pi512[1]);
+  __mmask16 mask2 = _mm512_cmpeq_epi32_mask (pi512[2], pi512[3]);
+  a = _kortestz_mask64_u8 (mask1, mask2);
+  b = _kortestz_mask16_u8 (mask1, mask2);
+}
+
+void
+sign_foo8 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epi64_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epi64_mask (pi128[2], pi128[3]);
+  a = _kortestz_mask16_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+sign_foo9 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epi64_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epi64_mask (pi128[2], pi128[3]);
+  a = _kortestz_mask32_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+sign_foo10 ()
+{
+  __mmask8 mask1 = _mm_cmpeq_epi64_mask (pi128[0], pi128[1]);
+  __mmask8 mask2 = _mm_cmpeq_epi64_mask (pi128[2], pi128[3]);
+  a = _kortestz_mask64_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+sign_foo11 ()
+{
+  __mmask8 mask1 = _mm256_cmpeq_epi64_mask (pi256[0], pi256[1]);
+  __mmask8 mask2 = _mm256_cmpeq_epi64_mask (pi256[2], pi256[3]);
+  a = _kortestz_mask16_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+sign_foo12 ()
+{
+  __mmask8 mask1 = _mm256_cmpeq_epi64_mask (pi256[0], pi256[1]);
+  __mmask8 mask2 = _mm256_cmpeq_epi64_mask (pi256[2], pi256[3]);
+  a = _kortestz_mask32_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+sign_foo13 ()
+{
+  __mmask8 mask1 = _mm256_cmpeq_epi64_mask (pi256[0], pi256[1]);
+  __mmask8 mask2 = _mm256_cmpeq_epi64_mask (pi256[2], pi256[3]);
+  a = _kortestz_mask64_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+sign_foo14 ()
+{
+  __mmask8 mask1 = _mm512_cmpeq_epi64_mask (pi512[0], pi512[1]);
+  __mmask8 mask2 = _mm512_cmpeq_epi64_mask (pi512[2], pi512[3]);
+  a = _kortestz_mask16_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+sign_foo15 ()
+{
+  __mmask8 mask1 = _mm512_cmpeq_epi64_mask (pi512[0], pi512[1]);
+  __mmask8 mask2 = _mm512_cmpeq_epi64_mask (pi512[2], pi512[3]);
+  a = _kortestz_mask32_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+sign_foo16 ()
+{
+  __mmask8 mask1 = _mm512_cmpeq_epi64_mask (pi512[0], pi512[1]);
+  __mmask8 mask2 = _mm512_cmpeq_epi64_mask (pi512[2], pi512[3]);
+  a = _kortestz_mask64_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+float_foo ()
+{
+  __mmask8 mask1 = _mm_cmp_ps_mask (ps128[0], ps128[1], 1);
+  __mmask8 mask2 = _mm_cmp_ps_mask (ps128[2], ps128[3], 1);
+  a = _kortestz_mask16_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+float_foo1 ()
+{
+  __mmask8 mask1 = _mm_cmp_ps_mask (ps128[0], ps128[1], 1);
+  __mmask8 mask2 = _mm_cmp_ps_mask (ps128[2], ps128[3], 1);
+  a = _kortestz_mask32_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+float_foo2 ()
+{
+  __mmask8 mask1 = _mm_cmp_ps_mask (ps128[0], ps128[1], 1);
+  __mmask8 mask2 = _mm_cmp_ps_mask (ps128[2], ps128[3], 1);
+  a = _kortestz_mask64_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+float_foo3 ()
+{
+  __mmask8 mask1 = _mm256_cmp_ps_mask (ps256[0], ps256[1], 1);
+  __mmask8 mask2 = _mm256_cmp_ps_mask (ps256[2], ps256[3], 1);
+  a = _kortestz_mask16_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+float_foo4 ()
+{
+  __mmask8 mask1 = _mm256_cmp_ps_mask (ps256[0], ps256[1], 1);
+  __mmask8 mask2 = _mm256_cmp_ps_mask (ps256[2], ps256[3], 1);
+  a = _kortestz_mask32_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+float_foo5 ()
+{
+  __mmask8 mask1 = _mm256_cmp_ps_mask (ps256[0], ps256[1], 1);
+  __mmask8 mask2 = _mm256_cmp_ps_mask (ps256[2], ps256[3], 1);
+  a = _kortestz_mask64_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+float_foo6 ()
+{
+  __mmask16 mask1 = _mm512_cmp_ps_mask (ps512[0], ps512[1], 1);
+  __mmask16 mask2 = _mm512_cmp_ps_mask (ps512[2], ps512[3], 1);
+  a = _kortestz_mask32_u8 (mask1, mask2);
+  b = _kortestz_mask16_u8 (mask1, mask2);
+}
+
+void
+float_foo7 ()
+{
+  __mmask16 mask1 = _mm512_cmp_ps_mask (ps512[0], ps512[1], 1);
+  __mmask16 mask2 = _mm512_cmp_ps_mask (ps512[2], ps512[3], 1);
+  a = _kortestz_mask64_u8 (mask1, mask2);
+  b = _kortestz_mask16_u8 (mask1, mask2);
+}
+
+void
+float_foo8 ()
+{
+  __mmask8 mask1 = _mm_cmp_pd_mask (pd128[0], pd128[1], 1);
+  __mmask8 mask2 = _mm_cmp_pd_mask (pd128[2], pd128[3], 1);
+  a = _kortestz_mask16_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+float_foo9 ()
+{
+  __mmask8 mask1 = _mm_cmp_pd_mask (pd128[0], pd128[1], 1);
+  __mmask8 mask2 = _mm_cmp_pd_mask (pd128[2], pd128[3], 1);
+  a = _kortestz_mask32_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+float_foo10 ()
+{
+  __mmask8 mask1 = _mm_cmp_pd_mask (pd128[0], pd128[1], 1);
+  __mmask8 mask2 = _mm_cmp_pd_mask (pd128[2], pd128[3], 1);
+  a = _kortestz_mask64_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+float_foo11 ()
+{
+  __mmask8 mask1 = _mm256_cmp_pd_mask (pd256[0], pd256[1], 1);
+  __mmask8 mask2 = _mm256_cmp_pd_mask (pd256[2], pd256[3], 1);
+  a = _kortestz_mask16_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+float_foo12 ()
+{
+  __mmask8 mask1 = _mm256_cmp_pd_mask (pd256[0], pd256[1], 1);
+  __mmask8 mask2 = _mm256_cmp_pd_mask (pd256[2], pd256[3], 1);
+  a = _kortestz_mask32_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+float_foo13 ()
+{
+  __mmask8 mask1 = _mm256_cmp_pd_mask (pd256[0], pd256[1], 1);
+  __mmask8 mask2 = _mm256_cmp_pd_mask (pd256[2], pd256[3], 1);
+  a = _kortestz_mask64_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+float_foo14 ()
+{
+  __mmask8 mask1 = _mm512_cmp_pd_mask (pd512[0], pd512[1], 1);
+  __mmask8 mask2 = _mm512_cmp_pd_mask (pd512[2], pd512[3], 1);
+  a = _kortestz_mask16_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+float_foo15 ()
+{
+  __mmask8 mask1 = _mm512_cmp_pd_mask (pd512[0], pd512[1], 1);
+  __mmask8 mask2 = _mm512_cmp_pd_mask (pd512[2], pd512[3], 1);
+  a = _kortestz_mask32_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+float_foo16 ()
+{
+  __mmask8 mask1 = _mm512_cmp_pd_mask (pd512[0], pd512[1], 1);
+  __mmask8 mask2 = _mm512_cmp_pd_mask (pd512[2], pd512[3], 1);
+  a = _kortestz_mask64_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-pr103750-1.c b/gcc/testsuite/gcc.target/i386/avx512fp16-pr103750-1.c
new file mode 100644
index 00000000000..eaf6d1e9819
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-pr103750-1.c
@@ -0,0 +1,58 @@
+/* PR target/103750 */
+/* { dg-do compile }  */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
+/* { dg-final { scan-assembler-not "kmov" { xfail ia32 } } } */
+/* The xfail on ia32 needs to be fixed.  */
+
+#include <immintrin.h>
+extern __m128h* ph128;
+extern __m256h* ph256;
+extern __m512h* ph512;
+
+unsigned char
+sign_foo3 ()
+{
+  __mmask8 mask1 = _mm_cmp_ph_mask (ph128[0], ph128[1], 1);
+  __mmask8 mask2 = _mm_cmp_ph_mask (ph128[1], ph128[2], 1);
+  return _kortestz_mask16_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo4 ()
+{
+  __mmask8 mask1 = _mm_cmp_ph_mask (ph128[0], ph128[1], 1);
+  __mmask8 mask2 = _mm_cmp_ph_mask (ph128[1], ph128[2], 1);
+  return _kortestz_mask32_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo5 ()
+{
+  __mmask8 mask1 = _mm_cmp_ph_mask (ph128[0], ph128[1], 1);
+  __mmask8 mask2 = _mm_cmp_ph_mask (ph128[1], ph128[2], 1);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo6 ()
+{
+  __mmask16 mask1 = _mm256_cmp_ph_mask (ph256[0], ph256[1], 1);
+  __mmask16 mask2 = _mm256_cmp_ph_mask (ph256[2], ph256[3], 1);
+  return _kortestz_mask32_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo7 ()
+{
+  __mmask16 mask1 = _mm256_cmp_ph_mask (ph256[0], ph256[1], 1);
+  __mmask16 mask2 = _mm256_cmp_ph_mask (ph256[2], ph256[3], 1);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
+
+unsigned char
+sign_foo8 ()
+{
+  __mmask32 mask1 = _mm512_cmp_ph_mask (ph512[0], ph512[1], 1);
+  __mmask32 mask2 = _mm512_cmp_ph_mask (ph512[2], ph512[3], 1);
+  return _kortestz_mask64_u8 (mask1, mask2);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-pr103750-2.c b/gcc/testsuite/gcc.target/i386/avx512fp16-pr103750-2.c
new file mode 100644
index 00000000000..3d3a033fe64
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-pr103750-2.c
@@ -0,0 +1,71 @@
+/* PR target/103750 */
+/* { dg-do compile }  */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -mavx512dq" } */
+/* { dg-final { scan-assembler-not "kmov" { xfail ia32 } } } */
+/* The xfail on ia32 needs to be fixed.  */
+
+#include <immintrin.h>
+extern __m128h* ph128;
+extern __m256h* ph256;
+extern __m512h* ph512;
+
+extern char a, b;
+void
+sign_foo3 ()
+{
+  __mmask8 mask1 = _mm_cmp_ph_mask (ph128[0], ph128[1], 1);
+  __mmask8 mask2 = _mm_cmp_ph_mask (ph128[1], ph128[2], 1);
+
+  a = _kortestz_mask16_u8 (mask1, mask2);
+  b = _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+sign_foo4 ()
+{
+  __mmask8 mask1 = _mm_cmp_ph_mask (ph128[0], ph128[1], 1);
+  __mmask8 mask2 = _mm_cmp_ph_mask (ph128[1], ph128[2], 1);
+
+  a =  _kortestz_mask32_u8 (mask1, mask2);
+  b =  _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+sign_foo5 ()
+{
+  __mmask8 mask1 = _mm_cmp_ph_mask (ph128[0], ph128[1], 1);
+  __mmask8 mask2 = _mm_cmp_ph_mask (ph128[1], ph128[2], 1);
+
+  a =  _kortestz_mask64_u8 (mask1, mask2);
+  b =  _kortestz_mask8_u8 (mask1, mask2);
+}
+
+void
+sign_foo6 ()
+{
+  __mmask16 mask1 = _mm256_cmp_ph_mask (ph256[0], ph256[1], 1);
+  __mmask16 mask2 = _mm256_cmp_ph_mask (ph256[2], ph256[3], 1);
+
+  a =  _kortestz_mask32_u8 (mask1, mask2);
+  b =  _kortestz_mask16_u8 (mask1, mask2);
+}
+
+void
+sign_foo7 ()
+{
+  __mmask16 mask1 = _mm256_cmp_ph_mask (ph256[0], ph256[1], 1);
+  __mmask16 mask2 = _mm256_cmp_ph_mask (ph256[2], ph256[3], 1);
+
+  a =  _kortestz_mask64_u8 (mask1, mask2);
+  b =  _kortestz_mask16_u8 (mask1, mask2);
+}
+
+void
+sign_foo8 ()
+{
+  __mmask32 mask1 = _mm512_cmp_ph_mask (ph512[0], ph512[1], 1);
+  __mmask32 mask2 = _mm512_cmp_ph_mask (ph512[2], ph512[3], 1);
+
+  a =  _kortestz_mask64_u8 (mask1, mask2);
+  b =  _kortestz_mask32_u8 (mask1, mask2);
+}
-- 
2.18.1


^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2021-12-23  5:41 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-12-21  6:26 [PATCH] [i386] Add define_insn_and_split for vpcmp{b, w, d, q} vpcmp{ph, ps, pd} liuhongt
2021-12-23  5:41 ` Hongtao Liu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).