On Tue, Dec 21, 2021 at 2:27 PM liuhongt wrote: > > The purpose of those define_insn_and_split: > 1. Combine vpcmpuw and zero_extend into vpcmpuw. > 2. Canonicalize vpcmpuw pattern so CSE can replace duplicate vpcmpuw to just kmov > 3. Use DImode as dest of zero_extend so cprop_hardreg can eliminate redundant kmov. Use DImode as dest of zero_extend is too aggressive which causes several regression. New patch add define_insn_and_split just combine vpcmpuw and zero_extend into vpcmpuw. Here's the patch i'm checking in. > > It should partially fix the issue in PR. > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > Ready to push to trunk. > > gcc/ChangeLog: > > PR target/103750 > * config/i386/sse.md > (*_cmp3_zero_extend): > New define_insn_and_split. > (*_cmp3): Ditto. > (*_cmp3_zero_extenddi): New define_insn. > (*_cmp3_zero_extend): > New define_insn_and_split. > (*_ucmp3_zero_extend): > Ditto. > (*_ucmp3): Ditto. > (*_ucmp3_zero_extenddi): New define_insn. > (*_ucmp3_zero_extend): > New define_insn_and_split. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/bitwise_mask_op-3.c: Adjust test/ > * g++.target/i386/pr103750-1.C: New test. > --- > gcc/config/i386/sse.md | 267 ++++++++++++++++++ > gcc/testsuite/g++.target/i386/pr103750-1.C | 50 ++++ > .../gcc.target/i386/bitwise_mask_op-3.c | 6 +- > 3 files changed, 320 insertions(+), 3 deletions(-) > create mode 100644 gcc/testsuite/g++.target/i386/pr103750-1.C > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > index 5196149ee32..fb885d58272 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -3702,6 +3702,75 @@ (define_insn "_cmp3" > (set_attr "prefix" "evex") > (set_attr "mode" "")]) > > +;; Those Splitters are used to canonicalize vpcmpuw pattern, so that CSE can transfrom > +;; duplicated vpcmpuw to vpcmpuw and kmov > +;; Choose biggest mode(DImode) as dest, so kmov can be optimized by cprop_hardreg. > +(define_insn_and_split "*_cmp3_zero_extend" > + [(set (match_operand:SWI248x 0 "register_operand" "=k") > + (zero_extend:SWI248x > + (unspec: > + [(match_operand:V48H_AVX512VL 1 "register_operand" "v") > + (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "" "n")] > + UNSPEC_PCMP)))] > + "TARGET_AVX512BW > + && (GET_MODE_NUNITS (mode) > + < GET_MODE_PRECISION (mode))" > + "vcmp\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + "&& mode != E_DImode" > + [(set (match_dup 0) > + (zero_extend:DI > + (unspec: > + [(match_dup 1) > + (match_dup 2) > + (match_dup 3)] > + UNSPEC_PCMP)))] > + "operands[0] = lowpart_subreg (DImode, operands[0], mode);" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "")]) > + > +(define_insn_and_split "*_cmp3" > + [(set (match_operand: 0 "register_operand" "=k") > + (unspec: > + [(match_operand:V48H_AVX512VL 1 "register_operand" "v") > + (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "" "n")] > + UNSPEC_PCMP))] > + "TARGET_AVX512BW > + && GET_MODE_NUNITS (mode) < 64" > + "#" > + "&& 1" > + [(set (match_dup 0) > + (zero_extend:DI > + (unspec: > + [(match_dup 1) > + (match_dup 2) > + (match_dup 3)] > + UNSPEC_PCMP)))] > + "operands[0] = lowpart_subreg (DImode, operands[0], mode);" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "")]) > + > +(define_insn "*_cmp3_zero_extenddi" > + [(set (match_operand:DI 0 "register_operand" "=k") > + (zero_extend:DI > + (unspec: > + [(match_operand:V48H_AVX512VL 1 "register_operand" "v") > + (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "" "n")] > + UNSPEC_PCMP)))] > + "TARGET_AVX512BW > + && GET_MODE_NUNITS (mode) < 64" > + "vcmp\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "")]) > + > (define_insn_and_split "*_cmp3" > [(set (match_operand: 0 "register_operand") > (not: > @@ -3735,6 +3804,72 @@ (define_insn "_cmp3" > (set_attr "prefix" "evex") > (set_attr "mode" "")]) > > +(define_insn_and_split "*_cmp3_zero_extend" > + [(set (match_operand:SWI248x 0 "register_operand" "=k") > + (zero_extend:SWI248x > + (unspec: > + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") > + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "" "n")] > + UNSPEC_PCMP)))] > + "TARGET_AVX512BW > + && (GET_MODE_NUNITS (mode) > + < GET_MODE_PRECISION (mode))" > + "vpcmp\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + "&& mode != E_DImode" > + [(set (match_dup 0) > + (zero_extend:DI > + (unspec: > + [(match_dup 1) > + (match_dup 2) > + (match_dup 3)] > + UNSPEC_PCMP)))] > + "operands[0] = lowpart_subreg (DImode, operands[0], mode);" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "")]) > + > +(define_insn_and_split "*_cmp3" > + [(set (match_operand: 0 "register_operand" "=k") > + (unspec: > + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") > + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "" "n")] > + UNSPEC_PCMP))] > + "TARGET_AVX512BW > + && GET_MODE_NUNITS (mode) < 64" > + "#" > + "&& 1" > + [(set (match_dup 0) > + (zero_extend:DI > + (unspec: > + [(match_dup 1) > + (match_dup 2) > + (match_dup 3)] > + UNSPEC_PCMP)))] > + "operands[0] = lowpart_subreg (DImode, operands[0], mode);" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "")]) > + > +(define_insn "*_cmp3_zero_extenddi" > + [(set (match_operand:DI 0 "register_operand" "=k") > + (zero_extend:DI > + (unspec: > + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") > + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "" "n")] > + UNSPEC_PCMP)))] > + "TARGET_AVX512BW > + && GET_MODE_NUNITS (mode) < 64" > + "vpcmp\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "")]) > + > (define_int_iterator UNSPEC_PCMP_ITER > [UNSPEC_PCMP UNSPEC_UNSIGNED_PCMP]) > > @@ -3771,6 +3906,72 @@ (define_insn "_ucmp3" > (set_attr "prefix" "evex") > (set_attr "mode" "")]) > > +(define_insn_and_split "*_ucmp3_zero_extend" > + [(set (match_operand:SWI248x 0 "register_operand" "=k") > + (zero_extend:SWI248x > + (unspec: > + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") > + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "const_0_to_7_operand" "n")] > + UNSPEC_UNSIGNED_PCMP)))] > + "TARGET_AVX512BW > + && (GET_MODE_NUNITS (mode) > + < GET_MODE_PRECISION (mode))" > + "vpcmpu\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + "&& mode != E_DImode" > + [(set (match_dup 0) > + (zero_extend:DI > + (unspec: > + [(match_dup 1) > + (match_dup 2) > + (match_dup 3)] > + UNSPEC_UNSIGNED_PCMP)))] > + "operands[0] = lowpart_subreg (DImode, operands[0], mode);" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "")]) > + > +(define_insn_and_split "*_ucmp3" > + [(set (match_operand: 0 "register_operand" "=k") > + (unspec: > + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") > + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "" "n")] > + UNSPEC_UNSIGNED_PCMP))] > + "TARGET_AVX512BW > + && GET_MODE_NUNITS (mode) < 64" > + "#" > + "&& 1" > + [(set (match_dup 0) > + (zero_extend:DI > + (unspec: > + [(match_dup 1) > + (match_dup 2) > + (match_dup 3)] > + UNSPEC_UNSIGNED_PCMP)))] > + "operands[0] = lowpart_subreg (DImode, operands[0], mode);" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "")]) > + > +(define_insn "*_ucmp3_zero_extenddi" > + [(set (match_operand:DI 0 "register_operand" "=k") > + (zero_extend:DI > + (unspec: > + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") > + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "" "n")] > + UNSPEC_UNSIGNED_PCMP)))] > + "TARGET_AVX512BW > + && GET_MODE_NUNITS (mode) < 64" > + "vpcmpu\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "")]) > + > (define_insn "_ucmp3" > [(set (match_operand: 0 "register_operand" "=k") > (unspec: > @@ -3785,6 +3986,72 @@ (define_insn "_ucmp3" > (set_attr "prefix" "evex") > (set_attr "mode" "")]) > > +(define_insn_and_split "*_ucmp3_zero_extend" > + [(set (match_operand:SWI248x 0 "register_operand" "=k") > + (zero_extend:SWI248x > + (unspec: > + [(match_operand:VI48_AVX512VL 1 "register_operand" "v") > + (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "const_0_to_7_operand" "n")] > + UNSPEC_UNSIGNED_PCMP)))] > + "TARGET_AVX512BW > + && (GET_MODE_NUNITS (mode) > + < GET_MODE_PRECISION (mode))" > + "vpcmpu\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + "&& mode != E_DImode" > + [(set (match_dup 0) > + (zero_extend:DI > + (unspec: > + [(match_dup 1) > + (match_dup 2) > + (match_dup 3)] > + UNSPEC_UNSIGNED_PCMP)))] > + "operands[0] = lowpart_subreg (DImode, operands[0], mode);" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "")]) > + > +(define_insn_and_split "*_ucmp3" > + [(set (match_operand: 0 "register_operand" "=k") > + (unspec: > + [(match_operand:VI48_AVX512VL 1 "register_operand" "v") > + (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "" "n")] > + UNSPEC_UNSIGNED_PCMP))] > + "TARGET_AVX512BW > + && GET_MODE_NUNITS (mode) < 64" > + "#" > + "&& 1" > + [(set (match_dup 0) > + (zero_extend:DI > + (unspec: > + [(match_dup 1) > + (match_dup 2) > + (match_dup 3)] > + UNSPEC_UNSIGNED_PCMP)))] > + "operands[0] = lowpart_subreg (DImode, operands[0], mode);" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "")]) > + > +(define_insn "*_ucmp3_zero_extenddi" > + [(set (match_operand:DI 0 "register_operand" "=k") > + (zero_extend:DI > + (unspec: > + [(match_operand:VI48_AVX512VL 1 "register_operand" "v") > + (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "" "n")] > + UNSPEC_UNSIGNED_PCMP)))] > + "TARGET_AVX512BW > + && GET_MODE_NUNITS (mode) < 64" > + "vpcmpu\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "")]) > + > (define_insn_and_split "*_ucmp3" > [(set (match_operand: 0 "register_operand") > (not: > diff --git a/gcc/testsuite/g++.target/i386/pr103750-1.C b/gcc/testsuite/g++.target/i386/pr103750-1.C > new file mode 100644 > index 00000000000..83f471331b3 > --- /dev/null > +++ b/gcc/testsuite/g++.target/i386/pr103750-1.C > @@ -0,0 +1,50 @@ > +/* PR target/103750 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -march=cannonlake -maes -std=c++1y" } */ > +/* { dg-final { scan-assembler-times "kmovw" 2 } } */ > +/* { dg-final { scan-assembler-times "kmovd" 2 } } */ > +/* There shouldn't be any kmovw/kmovd inside the loop. */ > +#include > + > +const char16_t *qustrchr(char16_t *n, char16_t *e, char16_t c) noexcept > +{ > + __m256i mch256 = _mm256_set1_epi16(c); > + for ( ; n < e; n += 32) { > + __m256i data1 = _mm256_loadu_si256(reinterpret_cast(n)); > + __m256i data2 = _mm256_loadu_si256(reinterpret_cast(n) + 1); > + __mmask16 mask1 = _mm256_cmpeq_epu16_mask(data1, mch256); > + __mmask16 mask2 = _mm256_cmpeq_epu16_mask(data2, mch256); > + if (_kortestz_mask16_u8(mask1, mask2)) > + continue; > + > + unsigned idx = _tzcnt_u32(mask1); > + if (mask1 == 0) { > + idx = __tzcnt_u16(mask2); > + n += 16; > + } > + return n + idx; > + } > + return e; > +} > + > +const char16_t *qustrchr1(char16_t *n, char16_t *e, char16_t c) noexcept > +{ > + __m256i mch256 = _mm256_set1_epi16(c); > + for ( ; n < e; n += 32) { > + __m256i data1 = _mm256_loadu_si256(reinterpret_cast(n)); > + __m256i data2 = _mm256_loadu_si256(reinterpret_cast(n) + 1); > + __mmask16 mask1 = _mm256_cmpeq_epu16_mask(data1, mch256); > + __mmask16 mask2 = _mm256_cmpeq_epu16_mask(data2, mch256); > + if (_kortestz_mask32_u8(mask1, mask2)) > + continue; > + > + unsigned idx = _tzcnt_u32(mask1); > + if (mask1 == 0) { > + idx = __tzcnt_u16(mask2); > + n += 16; > + } > + return n + idx; > + } > + return e; > +} > + > diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c > index 352c49d6c6b..82bb99e30af 100644 > --- a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c > +++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c > @@ -12,7 +12,7 @@ foo_orb (__m512i a, __m512i b) > foo = m1 | m2; > } > > -/* { dg-final { scan-assembler-times "korb\[\t \]" "1" { xfail *-*-* } } } */ > +/* { dg-final { scan-assembler-times "korb\[\t \]" "1" { xfail { *-*-* && { ! ia32 } } } } } */ > > void > foo_xorb (__m512i a, __m512i b) > @@ -22,7 +22,7 @@ foo_xorb (__m512i a, __m512i b) > foo = m1 ^ m2; > } > > -/* { dg-final { scan-assembler-times "kxorb\[\t \]" "1" { xfail *-*-* } } } */ > +/* { dg-final { scan-assembler-times "kxorb\[\t \]" "1" { xfail { *-*-* && { ! ia32 } } } } } */ > > void > foo_andb (__m512i a, __m512i b) > @@ -40,4 +40,4 @@ foo_andnb (__m512i a, __m512i b) > foo = m1 & ~m2; > } > > -/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4" { xfail *-*-* } } } */ > +/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4" { xfail { *-*-* && { ! ia32 } } } } } */ > -- > 2.18.1 > -- BR, Hongtao