* [PATCH] i386: Optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1)) @ 2022-09-22 7:19 Hu, Lin1 2022-09-23 1:47 ` Hongtao Liu 0 siblings, 1 reply; 4+ messages in thread From: Hu, Lin1 @ 2022-09-22 7:19 UTC (permalink / raw) To: gcc-patches; +Cc: hongtao.liu, ubizjak Hi all, This patch aims to optimize code generation of _mm256_zextsi128_si256(_mm_set1_epi8(-1)), reducing the number of instructions required to achieve the final result. Regtested on x86_64-pc-linux-gnu. Ok for trunk? BRs, Lin gcc/ChangeLog: PR target/94962 * config/i386/constraints.md (BH): New define_constraint. * config/i386/i386.cc (standard_sse_constant_p): Return 3 when operand matches new predicate. (standard_sse_constant_opcode): Add new alternative branch to return "vpcmpeqd". * config/i386/predicates.md (vector_all_ones_zero_extend_half_operand): New define_predicate. (vector_all_ones_zero_extend_quarter_operand): Ditto. * config/i386/sse.md: Add constraint to insn "mov<mode>_internal". gcc/testsuite/ChangeLog: PR target/94962 * gcc.target/i386/avx256-unaligned-load-1.c: Modify test. * gcc.target/i386/avx256-unaligned-store-1.c: Ditto. * gcc.target/i386/avx256-unaligned-store-2.c: Ditto. * gcc.target/i386/avx256-unaligned-store-3.c: Ditto. * gcc.target/i386/pr94962-1.c: New test. * gcc.target/i386/pr94962-2.c: Ditto. * gcc.target/i386/pr94962-3.c: Ditto. * gcc.target/i386/pr94962-4.c: Ditto.
--- gcc/config/i386/constraints.md | 8 +++ gcc/config/i386/i386.cc | 26 +++++++- gcc/config/i386/predicates.md | 49 ++++++++++++++ gcc/config/i386/sse.md | 8 +-- .../gcc.target/i386/avx256-unaligned-load-1.c | 4 +- .../i386/avx256-unaligned-store-1.c | 4 +- .../i386/avx256-unaligned-store-2.c | 4 +- .../i386/avx256-unaligned-store-3.c | 4 +- gcc/testsuite/gcc.target/i386/pr94962-1.c | 11 ++++ gcc/testsuite/gcc.target/i386/pr94962-2.c | 17 +++++ gcc/testsuite/gcc.target/i386/pr94962-3.c | 64 +++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr94962-4.c | 49 ++++++++++++++ 12 files changed, 235 insertions(+), 13 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-4.c diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md index 7361687632f..95b2b142d41 100644 --- a/gcc/config/i386/constraints.md +++ b/gcc/config/i386/constraints.md @@ -168,6 +168,9 @@ ;; z Constant call address operand. ;; C Integer SSE constant with all bits set operand. ;; F Floating-point SSE constant with all bits set operand. +;; H Integer SSE constant that is 128/256bit all ones +;; and zero-extend to 256/512bit, or 128bit all ones +;; and zero-extend to 512bit. ;; M x86-64 memory operand. (define_constraint "Bf" @@ -233,6 +236,11 @@ (and (match_test "TARGET_SSE") (match_operand 0 "float_vector_all_ones_operand"))) +(define_constraint "BH" + "@internal integer constant with last half/quarter bits set operand." + (ior (match_operand 0 "vector_all_ones_zero_extend_half_operand") + (match_operand 0 "vector_all_ones_zero_extend_quarter_operand"))) + ;; NB: Similar to 'm', but don't use define_memory_constraint on x86-64 ;; to prevent LRA from converting the operand to the form '(mem (reg X))' ;; where X is a base register.
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index dadf453d6c0..ca799da5d7e 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -5186,7 +5186,8 @@ standard_80387_constant_rtx (int idx) XFmode); } -/* Return 1 if X is all bits 0 and 2 if X is all bits 1 +/* Return 1 if X is all bits 0, 2 if X is all bits 1 + and 3 if X is all bits 1 with zero extend in supported SSE/AVX vector mode. */ int @@ -5234,6 +5235,10 @@ standard_sse_constant_p (rtx x, machine_mode pred_mode) } } + if (vector_all_ones_zero_extend_half_operand (x, mode) + || vector_all_ones_zero_extend_quarter_operand (x, mode)) + return 3; + return 0; } @@ -5341,6 +5346,25 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) gcc_unreachable (); } } + else if (vector_all_ones_zero_extend_half_operand (x, mode)) + { + if (GET_MODE_SIZE (mode) == 64) + { + gcc_assert (TARGET_AVX512F); + return "vpcmpeqd \t %t0, %t0, %t0"; + } + else if (GET_MODE_SIZE (mode) == 32) + { + gcc_assert (TARGET_AVX); + return "vpcmpeqd \t %x0, %x0, %x0"; + } + gcc_unreachable (); + } + else if (vector_all_ones_zero_extend_quarter_operand (x, mode)) + { + gcc_assert (TARGET_AVX512F); + return "vpcmpeqd \t %x0, %x0, %x0"; + } gcc_unreachable (); } diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 4f16bb748b5..655eabf793b 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1159,6 +1159,55 @@ (match_test "INTEGRAL_MODE_P (GET_MODE (op))") (match_test "op == CONSTM1_RTX (GET_MODE (op))"))) +/* Return true if operand is an 128/256bit all ones vector + that zero-extends to 256/512bit. 
*/ +(define_predicate "vector_all_ones_zero_extend_half_operand" + (match_code "const_vector") +{ + mode = GET_MODE (op); + if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT + || (GET_MODE_SIZE (mode) != 32 + && GET_MODE_SIZE (mode) != 64)) + return false; + + int nelts = CONST_VECTOR_NUNITS (op); + for (int i = 0; i != nelts; i++) + { + rtx elt = CONST_VECTOR_ELT (op, i); + if (i < nelts / 2 + && elt != CONSTM1_RTX (GET_MODE_INNER (mode))) + return false; + if (i >= nelts / 2 + && elt != CONST0_RTX (GET_MODE_INNER (mode))) + return false; + } + return true; +}) + +/* Return true if operand is an 128bit all ones vector + that zero extends to 512bit. */ +(define_predicate "vector_all_ones_zero_extend_quarter_operand" + (match_code "const_vector") +{ + mode = GET_MODE (op); + if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT + || GET_MODE_SIZE (mode) != 64) + return false; + + int nelts = CONST_VECTOR_NUNITS (op); + for (int i = 0; i != nelts; i++) + { + rtx elt = CONST_VECTOR_ELT (op, i); + if (i < nelts / 4 + && elt != CONSTM1_RTX (GET_MODE_INNER (mode))) + return false; + if (i >= nelts / 4 + && elt != CONST0_RTX (GET_MODE_INNER (mode))) + return false; + } + return true; +}) + ; Return true when OP is operand acceptable for vector memory operand. ; Only AVX can have misaligned memory operand. 
(define_predicate "vector_memory_operand" diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index d535c0af043..f804dbe9b7a 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -1318,9 +1318,9 @@ (define_insn "mov<mode>_internal" [(set (match_operand:VMOVE 0 "nonimmediate_operand" - "=v,v ,v ,m") + "=v,v ,v,v ,m") (match_operand:VMOVE 1 "nonimmediate_or_sse_const_operand" - " C,<sseconstm1>,vm,v"))] + " C,<sseconstm1>,BH,vm,v"))] "TARGET_SSE && (register_operand (operands[0], <MODE>mode) || register_operand (operands[1], <MODE>mode)) @@ -1338,7 +1338,7 @@ gcc_unreachable (); } } - [(set_attr "type" "sselog1,sselog1,ssemov,ssemov") + [(set_attr "type" "sselog1,sselog1,sselog1,ssemov,ssemov") (set_attr "prefix" "maybe_vex") (set (attr "mode") (cond [(match_test "TARGET_AVX") @@ -1349,7 +1349,7 @@ (and (match_test "<MODE>mode == V2DFmode") (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")) (const_string "V4SF") - (and (eq_attr "alternative" "3") + (and (eq_attr "alternative" "4") (match_test "TARGET_SSE_TYPELESS_STORES")) (const_string "V4SF") (and (eq_attr "alternative" "0") diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c index 68378a556fb..7115b0a9dde 100644 --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c @@ -14,6 +14,6 @@ avx_test (void) c[i] = a[i] * b[i+3]; } -/* { dg-final { scan-assembler-not "vmovups\[^\n\r]*movv8sf_internal/2" } } */ -/* { dg-final { scan-assembler "movv4sf_internal/2" } } */ +/* { dg-final { scan-assembler-not "vmovups\[^\n\r]*movv8sf_internal/3" } } */ +/* { dg-final { scan-assembler "movv4sf_internal/3" } } */ /* { dg-final { scan-assembler "vinsertf128" } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c index d82aecffda9..4c713959df2 100644 --- 
a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c @@ -17,6 +17,6 @@ avx_test (void) d[i] = c[i] * 20.0; } -/* { dg-final { scan-assembler-not "vmovups.*movv8sf_internal/3" } } */ -/* { dg-final { scan-assembler "vmovups.*movv4sf_internal/3" } } */ +/* { dg-final { scan-assembler-not "vmovups.*movv8sf_internal/4" } } */ +/* { dg-final { scan-assembler "vmovups.*movv4sf_internal/4" } } */ /* { dg-final { scan-assembler "vextractf128" } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c index be12529e8d5..4978c37f526 100644 --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c @@ -23,6 +23,6 @@ avx_test (void) } } -/* { dg-final { scan-assembler-not "vmovdqu.*movv32qi_internal/3" } } */ -/* { dg-final { scan-assembler "vmovdqu.*movv16qi_internal/3" } } */ +/* { dg-final { scan-assembler-not "vmovdqu.*movv32qi_internal/4" } } */ +/* { dg-final { scan-assembler "vmovdqu.*movv16qi_internal/4" } } */ /* { dg-final { scan-assembler "vextract.128" } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c index 918028df9ed..f909099bcb1 100644 --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c @@ -17,6 +17,6 @@ avx_test (void) d[i] = c[i] * 20.0; } -/* { dg-final { scan-assembler-not "vmovupd.*movv4df_internal/3" } } */ -/* { dg-final { scan-assembler "vmovupd.*movv2df_internal/3" } } */ +/* { dg-final { scan-assembler-not "vmovupd.*movv4df_internal/4" } } */ +/* { dg-final { scan-assembler "vmovupd.*movv2df_internal/4" } } */ /* { dg-final { scan-assembler "vextractf128" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr94962-1.c b/gcc/testsuite/gcc.target/i386/pr94962-1.c new file mode 100644 index 
00000000000..e3b01249421 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr94962-1.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-not "vmovdqa" } } */ +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 1 } } */ + +#include <immintrin.h> + +__m256i mask() +{ + return _mm256_zextsi128_si256(_mm_set1_epi8(-1)); +} diff --git a/gcc/testsuite/gcc.target/i386/pr94962-2.c b/gcc/testsuite/gcc.target/i386/pr94962-2.c new file mode 100644 index 00000000000..4e10e927ba1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr94962-2.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx512f" } */ +/* { dg-final { scan-assembler-not "vmovdqa" } } */ +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 1 } } */ +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%ymm\[0-9\]" 1 } } */ + +#include <immintrin.h> + +__m512i mask1() +{ + return _mm512_zextsi128_si512(_mm_set1_epi8(-1)); +} + +__m512i mask2() +{ + return _mm512_zextsi256_si512(_mm256_set1_epi8(-1)); +} diff --git a/gcc/testsuite/gcc.target/i386/pr94962-3.c b/gcc/testsuite/gcc.target/i386/pr94962-3.c new file mode 100644 index 00000000000..8d0b9974435 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr94962-3.c @@ -0,0 +1,64 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx512f" } */ +/* { dg-final { scan-assembler-not "vmovdqa" } } */ +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%ymm\[0-9\]" 4 } } */ + +typedef long long __v8di __attribute__ ((__vector_size__ (64))); +typedef int __v16si __attribute__ ((__vector_size__ (64))); +typedef short __v32hi __attribute__ ((__vector_size__ (64))); +typedef char __v64qi __attribute__ ((__vector_size__ (64))); +typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__)); + +__m512i +__attribute__ ((noinline, noclone)) +foo1 () +{ + return __extension__ (__m512i)(__v8di) { -1, -1, -1, -1, + 0, 0, 0, 0 }; +} + +__m512i 
+__attribute__ ((noinline, noclone)) +foo2 () +{ + return __extension__ (__m512i)(__v16si) { -1, -1, -1, -1, + -1, -1, -1, -1, + 0, 0, 0, 0, + 0, 0, 0, 0 }; +} + +__m512i +__attribute__ ((noinline, noclone)) +foo3 () +{ + return __extension__ (__m512i)(__v32hi) { -1, -1, -1, -1, + -1, -1, -1, -1, + -1, -1, -1, -1, + -1, -1, -1, -1, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0 }; +} + +__m512i +__attribute__ ((noinline, noclone)) +foo4 () +{ + return __extension__ (__m512i)(__v64qi) { -1, -1, -1, -1, + -1, -1, -1, -1, + -1, -1, -1, -1, + -1, -1, -1, -1, + -1, -1, -1, -1, + -1, -1, -1, -1, + -1, -1, -1, -1, + -1, -1, -1, -1, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0 }; +} diff --git a/gcc/testsuite/gcc.target/i386/pr94962-4.c b/gcc/testsuite/gcc.target/i386/pr94962-4.c new file mode 100644 index 00000000000..5502c39910b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr94962-4.c @@ -0,0 +1,49 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-not "vmovdqa" } } */ +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 4 } } */ + +typedef long long __v4di __attribute__ ((__vector_size__ (32))); +typedef int __v8si __attribute__ ((__vector_size__ (32))); +typedef short __v16hi __attribute__ ((__vector_size__ (32))); +typedef char __v32qi __attribute__ ((__vector_size__ (32))); +typedef long long __m256i __attribute__ ((__vector_size__ (32), __may_alias__)); + +__m256i +__attribute__ ((noinline, noclone)) +foo1 () +{ + return __extension__ (__m256i)(__v4di) { -1, -1, 0, 0 }; +} + +__m256i +__attribute__ ((noinline, noclone)) +foo2 () +{ + return __extension__ (__m256i)(__v8si) { -1, -1, -1, -1, + 0, 0, 0, 0 }; +} + +__m256i +__attribute__ ((noinline, noclone)) +foo3 () +{ + return __extension__ (__m256i)(__v16hi) { -1, -1, -1, -1, + -1, -1, -1, -1, + 0, 0, 0, 0, + 0, 0, 0, 0 }; +} + +__m256i +__attribute__ ((noinline, noclone)) +foo4 
() +{ + return __extension__ (__m256i)(__v32qi) { -1, -1, -1, -1, + -1, -1, -1, -1, + -1, -1, -1, -1, + -1, -1, -1, -1, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0 }; +} -- 2.18.2 ^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] i386: Optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1)) 2022-09-22 7:19 [PATCH] i386: Optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1)) Hu, Lin1 @ 2022-09-23 1:47 ` Hongtao Liu 2022-09-23 3:07 ` Hu, Lin1 0 siblings, 1 reply; 4+ messages in thread From: Hongtao Liu @ 2022-09-23 1:47 UTC (permalink / raw) To: Hu, Lin1; +Cc: gcc-patches, hongtao.liu On Thu, Sep 22, 2022 at 3:20 PM Hu, Lin1 via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > Hi all, > > This patch aims to optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1)). Reduce the number of instructions required to achieve the final result. > > Regtested on x86_64-pc-linux-gnu. Ok for trunk? > > BRs, > Lin > > gcc/ChangeLog: > > PR target/94962 > * config/i386/constraints.md (BH): New define_constraint. > * config/i386/i386.cc (standard_sse_constant_p): Add return 3/4 when operand matches new predicate. > (standard_sse_constant_opcode): Add new alternative branch to return "vpcmpeqd". > * config/i386/predicates.md (vector_all_ones_zero_extend_half_operand): New define_predicate. > (vector_all_ones_zero_extend_quarter_operand): Ditto. > * config/i386/sse.md: Add constraint to insn "mov<mode>_internal". (mov<mode>_internal): Add new constraint BH. Put the insn name at first. > > gcc/testsuite/ChangeLog: > > PR target/94962 > * gcc.target/i386/avx256-unaligned-load-1.c: Modify test. > * gcc.target/i386/avx256-unaligned-store-1.c: Ditto. > * gcc.target/i386/avx256-unaligned-store-2.c: Ditto. > * gcc.target/i386/avx256-unaligned-store-3.c: Ditto. > * gcc.target/i386/pr94962-1.c: New test. > * gcc.target/i386/pr94962-2.c: Ditto. > * gcc.target/i386/pr94962-3.c: Ditto. > * gcc.target/i386/pr94962-4.c: Ditto. 
> --- > gcc/config/i386/constraints.md | 8 +++ > gcc/config/i386/i386.cc | 26 +++++++- > gcc/config/i386/predicates.md | 49 ++++++++++++++ > gcc/config/i386/sse.md | 8 +-- > .../gcc.target/i386/avx256-unaligned-load-1.c | 4 +- > .../i386/avx256-unaligned-store-1.c | 4 +- > .../i386/avx256-unaligned-store-2.c | 4 +- > .../i386/avx256-unaligned-store-3.c | 4 +- > gcc/testsuite/gcc.target/i386/pr94962-1.c | 11 ++++ > gcc/testsuite/gcc.target/i386/pr94962-2.c | 17 +++++ > gcc/testsuite/gcc.target/i386/pr94962-3.c | 64 +++++++++++++++++++ > gcc/testsuite/gcc.target/i386/pr94962-4.c | 49 ++++++++++++++ > 12 files changed, 235 insertions(+), 13 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-3.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-4.c > > diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md > index 7361687632f..95b2b142d41 100644 > --- a/gcc/config/i386/constraints.md > +++ b/gcc/config/i386/constraints.md > @@ -168,6 +168,9 @@ > ;; z Constant call address operand. > ;; C Integer SSE constant with all bits set operand. > ;; F Floating-point SSE constant with all bits set operand. > +;; H Integer SSE constant that is 128/256bit all ones > +;; and zero-extand to 256/512bit, or 128bit all ones > +;; and zero-extend to 512bit. > ;; M x86-64 memory operand. > > (define_constraint "Bf" > @@ -233,6 +236,11 @@ > (and (match_test "TARGET_SSE") > (match_operand 0 "float_vector_all_ones_operand"))) > > +(define_constraint "BH" > + "@internal integer constant with last half/quarter bits set operand." 
> + (ior (match_operand 0 "vector_all_ones_zero_extend_half_operand") > + (match_operand 0 "vector_all_ones_zero_extend_quarter_operand"))) > + > ;; NB: Similar to 'm', but don't use define_memory_constraint on x86-64 > ;; to prevent LRA from converting the operand to the form '(mem (reg X))' > ;; where X is a base register. > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc > index dadf453d6c0..ca799da5d7e 100644 > --- a/gcc/config/i386/i386.cc > +++ b/gcc/config/i386/i386.cc > @@ -5186,7 +5186,8 @@ standard_80387_constant_rtx (int idx) > XFmode); > } > > -/* Return 1 if X is all bits 0 and 2 if X is all bits 1 > +/* Return 1 if X is all bits 0, 2 if X is all bits 1 > + and 3 if X is all bits 1 with zero extend > in supported SSE/AVX vector mode. */ > > int > @@ -5234,6 +5235,10 @@ standard_sse_constant_p (rtx x, machine_mode pred_mode) > } > } > > + if (vector_all_ones_zero_extend_half_operand (x, mode) > + || vector_all_ones_zero_extend_quarter_operand (x, mode)) > + return 3; > + > return 0; > } > > @@ -5341,6 +5346,25 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) > gcc_unreachable (); > } > } > + else if (vector_all_ones_zero_extend_half_operand (x, mode)) > + { > + if (GET_MODE_SIZE (mode) == 64) > + { > + gcc_assert (TARGET_AVX512F); > + return "vpcmpeqd \t %t0, %t0, %t0"; > + } > + else if (GET_MODE_SIZE (mode) == 32) > + { > + gcc_assert (TARGET_AVX); > + return "vpcmpeqd \t %x0, %x0, %x0"; > + } > + gcc_unreachable (); > + } > + else if (vector_all_ones_zero_extend_quarter_operand (x, mode)) > + { > + gcc_assert (TARGET_AVX512F); > + return "vpcmpeqd \t %x0, %x0, %x0"; > + } > Can we merge 2 vpcmpeqd \t %x0, %x0, %x0"; into 1? 
like else if (vector_all_ones_zero_extend_half_operand (x, mode) && GET_MODE_SIZE(mode) == 64)) return "vpcmpeqd \t %t0, %t0, %t0"; else if ((vector_all_ones_zero_extend_half_operand (x, mode) && GET_MODE_SIZE (mode) == 32) || vector_all_ones_zero_extend_quarter_operand (x, mode)) return "vpcmpeqd \t %x0, %x0, %x0"; > gcc_unreachable (); > } > diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md > index 4f16bb748b5..655eabf793b 100644 > --- a/gcc/config/i386/predicates.md > +++ b/gcc/config/i386/predicates.md > @@ -1159,6 +1159,55 @@ > (match_test "INTEGRAL_MODE_P (GET_MODE (op))") > (match_test "op == CONSTM1_RTX (GET_MODE (op))"))) > > +/* Return true if operand is an 128/256bit all ones vector > + that zero-extends to 256/512bit. */ > +(define_predicate "vector_all_ones_zero_extend_half_operand" > + (match_code "const_vector") > +{ > + mode = GET_MODE (op); > + if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT > + || (GET_MODE_SIZE (mode) != 32 > + && GET_MODE_SIZE (mode) != 64)) > + return false; > + > + int nelts = CONST_VECTOR_NUNITS (op); > + for (int i = 0; i != nelts; i++) > + { > + rtx elt = CONST_VECTOR_ELT (op, i); > + if (i < nelts / 2 > + && elt != CONSTM1_RTX (GET_MODE_INNER (mode))) > + return false; > + if (i >= nelts / 2 > + && elt != CONST0_RTX (GET_MODE_INNER (mode))) > + return false; > + } > + return true; > +}) > + > +/* Return true if operand is an 128bit all ones vector > + that zero extends to 512bit. 
*/ > +(define_predicate "vector_all_ones_zero_extend_quarter_operand" > + (match_code "const_vector") > +{ > + mode = GET_MODE (op); > + if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT > + || GET_MODE_SIZE (mode) != 64) > + return false; > + > + int nelts = CONST_VECTOR_NUNITS (op); > + for (int i = 0; i != nelts; i++) > + { > + rtx elt = CONST_VECTOR_ELT (op, i); > + if (i < nelts / 4 > + && elt != CONSTM1_RTX (GET_MODE_INNER (mode))) > + return false; > + if (i >= nelts / 4 > + && elt != CONST0_RTX (GET_MODE_INNER (mode))) > + return false; > + } > + return true; > +}) > + > ; Return true when OP is operand acceptable for vector memory operand. > ; Only AVX can have misaligned memory operand. > (define_predicate "vector_memory_operand" > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > index d535c0af043..f804dbe9b7a 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -1318,9 +1318,9 @@ > > (define_insn "mov<mode>_internal" > [(set (match_operand:VMOVE 0 "nonimmediate_operand" > - "=v,v ,v ,m") > + "=v,v ,v,v ,m") > (match_operand:VMOVE 1 "nonimmediate_or_sse_const_operand" > - " C,<sseconstm1>,vm,v"))] > + " C,<sseconstm1>,BH,vm,v"))] > "TARGET_SSE > && (register_operand (operands[0], <MODE>mode) > || register_operand (operands[1], <MODE>mode)) > @@ -1338,7 +1338,7 @@ > gcc_unreachable (); > } > } > - [(set_attr "type" "sselog1,sselog1,ssemov,ssemov") > + [(set_attr "type" "sselog1,sselog1,sselog1,ssemov,ssemov") > (set_attr "prefix" "maybe_vex") > (set (attr "mode") > (cond [(match_test "TARGET_AVX") > @@ -1349,7 +1349,7 @@ > (and (match_test "<MODE>mode == V2DFmode") > (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")) > (const_string "V4SF") > - (and (eq_attr "alternative" "3") > + (and (eq_attr "alternative" "4") > (match_test "TARGET_SSE_TYPELESS_STORES")) > (const_string "V4SF") > (and (eq_attr "alternative" "0") > diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c 
b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c > index 68378a556fb..7115b0a9dde 100644 > --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c > +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c > @@ -14,6 +14,6 @@ avx_test (void) > c[i] = a[i] * b[i+3]; > } > > -/* { dg-final { scan-assembler-not "vmovups\[^\n\r]*movv8sf_internal/2" } } */ > -/* { dg-final { scan-assembler "movv4sf_internal/2" } } */ > +/* { dg-final { scan-assembler-not "vmovups\[^\n\r]*movv8sf_internal/3" } } */ > +/* { dg-final { scan-assembler "movv4sf_internal/3" } } */ > /* { dg-final { scan-assembler "vinsertf128" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c > index d82aecffda9..4c713959df2 100644 > --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c > +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c > @@ -17,6 +17,6 @@ avx_test (void) > d[i] = c[i] * 20.0; > } > > -/* { dg-final { scan-assembler-not "vmovups.*movv8sf_internal/3" } } */ > -/* { dg-final { scan-assembler "vmovups.*movv4sf_internal/3" } } */ > +/* { dg-final { scan-assembler-not "vmovups.*movv8sf_internal/4" } } */ > +/* { dg-final { scan-assembler "vmovups.*movv4sf_internal/4" } } */ > /* { dg-final { scan-assembler "vextractf128" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c > index be12529e8d5..4978c37f526 100644 > --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c > +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c > @@ -23,6 +23,6 @@ avx_test (void) > } > } > > -/* { dg-final { scan-assembler-not "vmovdqu.*movv32qi_internal/3" } } */ > -/* { dg-final { scan-assembler "vmovdqu.*movv16qi_internal/3" } } */ > +/* { dg-final { scan-assembler-not "vmovdqu.*movv32qi_internal/4" } } */ > +/* { dg-final { scan-assembler "vmovdqu.*movv16qi_internal/4" } } */ > /* { 
dg-final { scan-assembler "vextract.128" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c > index 918028df9ed..f909099bcb1 100644 > --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c > +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c > @@ -17,6 +17,6 @@ avx_test (void) > d[i] = c[i] * 20.0; > } > > -/* { dg-final { scan-assembler-not "vmovupd.*movv4df_internal/3" } } */ > -/* { dg-final { scan-assembler "vmovupd.*movv2df_internal/3" } } */ > +/* { dg-final { scan-assembler-not "vmovupd.*movv4df_internal/4" } } */ > +/* { dg-final { scan-assembler "vmovupd.*movv2df_internal/4" } } */ > /* { dg-final { scan-assembler "vextractf128" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr94962-1.c b/gcc/testsuite/gcc.target/i386/pr94962-1.c > new file mode 100644 > index 00000000000..e3b01249421 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr94962-1.c > @@ -0,0 +1,11 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx" } */ > +/* { dg-final { scan-assembler-not "vmovdqa" } } */ > +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 1 } } */ > + > +#include <immintrin.h> > + > +__m256i mask() > +{ > + return _mm256_zextsi128_si256(_mm_set1_epi8(-1)); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr94962-2.c b/gcc/testsuite/gcc.target/i386/pr94962-2.c > new file mode 100644 > index 00000000000..4e10e927ba1 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr94962-2.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx512f" } */ > +/* { dg-final { scan-assembler-not "vmovdqa" } } */ > +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 1 } } */ > +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%ymm\[0-9\]" 1 } } */ > + > +#include <immintrin.h> > + > +__m512i mask1() > +{ > + return _mm512_zextsi128_si512(_mm_set1_epi8(-1)); > +} > + > +__m512i mask2() > +{ > + return 
_mm512_zextsi256_si512(_mm256_set1_epi8(-1)); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr94962-3.c b/gcc/testsuite/gcc.target/i386/pr94962-3.c > new file mode 100644 > index 00000000000..8d0b9974435 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr94962-3.c > @@ -0,0 +1,64 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx512f" } */ > +/* { dg-final { scan-assembler-not "vmovdqa" } } */ > +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%ymm\[0-9\]" 4 } } */ > + > +typedef long long __v8di __attribute__ ((__vector_size__ (64))); > +typedef int __v16si __attribute__ ((__vector_size__ (64))); > +typedef short __v32hi __attribute__ ((__vector_size__ (64))); > +typedef char __v64qi __attribute__ ((__vector_size__ (64))); > +typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__)); > + > +__m512i > +__attribute__ ((noinline, noclone)) > +foo1 () > +{ > + return __extension__ (__m512i)(__v8di) { -1, -1, -1, -1, > + 0, 0, 0, 0 }; > +} > + > +__m512i > +__attribute__ ((noinline, noclone)) > +foo2 () > +{ > + return __extension__ (__m512i)(__v16si) { -1, -1, -1, -1, > + -1, -1, -1, -1, > + 0, 0, 0, 0, > + 0, 0, 0, 0 }; > +} > + > +__m512i > +__attribute__ ((noinline, noclone)) > +foo3 () > +{ > + return __extension__ (__m512i)(__v32hi) { -1, -1, -1, -1, > + -1, -1, -1, -1, > + -1, -1, -1, -1, > + -1, -1, -1, -1, > + 0, 0, 0, 0, > + 0, 0, 0, 0, > + 0, 0, 0, 0, > + 0, 0, 0, 0 }; > +} > + > +__m512i > +__attribute__ ((noinline, noclone)) > +foo4 () > +{ > + return __extension__ (__m512i)(__v64qi) { -1, -1, -1, -1, > + -1, -1, -1, -1, > + -1, -1, -1, -1, > + -1, -1, -1, -1, > + -1, -1, -1, -1, > + -1, -1, -1, -1, > + -1, -1, -1, -1, > + -1, -1, -1, -1, > + 0, 0, 0, 0, > + 0, 0, 0, 0, > + 0, 0, 0, 0, > + 0, 0, 0, 0, > + 0, 0, 0, 0, > + 0, 0, 0, 0, > + 0, 0, 0, 0, > + 0, 0, 0, 0 }; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr94962-4.c b/gcc/testsuite/gcc.target/i386/pr94962-4.c > new file mode 100644 > index 
00000000000..5502c39910b > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr94962-4.c > @@ -0,0 +1,49 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx" } */ > +/* { dg-final { scan-assembler-not "vmovdqa" } } */ > +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 4 } } */ > + > +typedef long long __v4di __attribute__ ((__vector_size__ (32))); > +typedef int __v8si __attribute__ ((__vector_size__ (32))); > +typedef short __v16hi __attribute__ ((__vector_size__ (32))); > +typedef char __v32qi __attribute__ ((__vector_size__ (32))); > +typedef long long __m256i __attribute__ ((__vector_size__ (32), __may_alias__)); > + > +__m256i > +__attribute__ ((noinline, noclone)) > +foo1 () > +{ > + return __extension__ (__m256i)(__v4di) { -1, -1, 0, 0 }; > +} > + > +__m256i > +__attribute__ ((noinline, noclone)) > +foo2 () > +{ > + return __extension__ (__m256i)(__v8si) { -1, -1, -1, -1, > + 0, 0, 0, 0 }; > +} > + > +__m256i > +__attribute__ ((noinline, noclone)) > +foo3 () > +{ > + return __extension__ (__m256i)(__v16hi) { -1, -1, -1, -1, > + -1, -1, -1, -1, > + 0, 0, 0, 0, > + 0, 0, 0, 0 }; > +} > + > +__m256i > +__attribute__ ((noinline, noclone)) > +foo4 () > +{ > + return __extension__ (__m256i)(__v32qi) { -1, -1, -1, -1, > + -1, -1, -1, -1, > + -1, -1, -1, -1, > + -1, -1, -1, -1, > + 0, 0, 0, 0, > + 0, 0, 0, 0, > + 0, 0, 0, 0, > + 0, 0, 0, 0 }; > +} > -- > 2.18.2 > Others LGTM. -- BR, Hongtao ^ permalink raw reply [flat|nested] 4+ messages in thread
* RE: [PATCH] i386: Optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1)) 2022-09-23 1:47 ` Hongtao Liu @ 2022-09-23 3:07 ` Hu, Lin1 2022-09-23 3:09 ` Hongtao Liu 0 siblings, 1 reply; 4+ messages in thread From: Hu, Lin1 @ 2022-09-23 3:07 UTC (permalink / raw) To: Hongtao Liu; +Cc: gcc-patches, Liu, Hongtao [-- Attachment #1: Type: text/plain, Size: 19081 bytes --] Hi Hongtao, I have modified this patch and regtested it on x86_64-pc-linux-gnu. BRs. Lin -----Original Message----- From: Hongtao Liu <crazylht@gmail.com> Sent: Friday, September 23, 2022 9:48 AM To: Hu, Lin1 <lin1.hu@intel.com> Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com> Subject: Re: [PATCH] i386: Optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1)) On Thu, Sep 22, 2022 at 3:20 PM Hu, Lin1 via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > Hi all, > > This patch aims to optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1)). Reduce the number of instructions required to achieve the final result. > > Regtested on x86_64-pc-linux-gnu. Ok for trunk? > > BRs, > Lin > > gcc/ChangeLog: > > PR target/94962 > * config/i386/constraints.md (BH): New define_constraint. > * config/i386/i386.cc (standard_sse_constant_p): Add return 3/4 when operand matches new predicate. > (standard_sse_constant_opcode): Add new alternative branch to return "vpcmpeqd". > * config/i386/predicates.md (vector_all_ones_zero_extend_half_operand): New define_predicate. > (vector_all_ones_zero_extend_quarter_operand): Ditto. > * config/i386/sse.md: Add constraint to insn "mov<mode>_internal". (mov<mode>_internal): Add new constraint BH. Put the insn name at first. > > gcc/testsuite/ChangeLog: > > PR target/94962 > * gcc.target/i386/avx256-unaligned-load-1.c: Modify test. > * gcc.target/i386/avx256-unaligned-store-1.c: Ditto. > * gcc.target/i386/avx256-unaligned-store-2.c: Ditto. > * gcc.target/i386/avx256-unaligned-store-3.c: Ditto.
> * gcc.target/i386/pr94962-1.c: New test. > * gcc.target/i386/pr94962-2.c: Ditto. > * gcc.target/i386/pr94962-3.c: Ditto. > * gcc.target/i386/pr94962-4.c: Ditto. > --- > gcc/config/i386/constraints.md | 8 +++ > gcc/config/i386/i386.cc | 26 +++++++- > gcc/config/i386/predicates.md | 49 ++++++++++++++ > gcc/config/i386/sse.md | 8 +-- > .../gcc.target/i386/avx256-unaligned-load-1.c | 4 +- > .../i386/avx256-unaligned-store-1.c | 4 +- > .../i386/avx256-unaligned-store-2.c | 4 +- > .../i386/avx256-unaligned-store-3.c | 4 +- > gcc/testsuite/gcc.target/i386/pr94962-1.c | 11 ++++ > gcc/testsuite/gcc.target/i386/pr94962-2.c | 17 +++++ > gcc/testsuite/gcc.target/i386/pr94962-3.c | 64 +++++++++++++++++++ > gcc/testsuite/gcc.target/i386/pr94962-4.c | 49 ++++++++++++++ > 12 files changed, 235 insertions(+), 13 deletions(-) create mode > 100644 gcc/testsuite/gcc.target/i386/pr94962-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-3.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-4.c > > diff --git a/gcc/config/i386/constraints.md > b/gcc/config/i386/constraints.md index 7361687632f..95b2b142d41 100644 > --- a/gcc/config/i386/constraints.md > +++ b/gcc/config/i386/constraints.md > @@ -168,6 +168,9 @@ > ;; z Constant call address operand. > ;; C Integer SSE constant with all bits set operand. > ;; F Floating-point SSE constant with all bits set operand. > +;; H Integer SSE constant that is 128/256bit all ones > +;; and zero-extand to 256/512bit, or 128bit all ones > +;; and zero-extend to 512bit. > ;; M x86-64 memory operand. > > (define_constraint "Bf" > @@ -233,6 +236,11 @@ > (and (match_test "TARGET_SSE") > (match_operand 0 "float_vector_all_ones_operand"))) > > +(define_constraint "BH" > + "@internal integer constant with last half/quarter bits set operand." 
> + (ior (match_operand 0 "vector_all_ones_zero_extend_half_operand") > + (match_operand 0 > +"vector_all_ones_zero_extend_quarter_operand"))) > + > ;; NB: Similar to 'm', but don't use define_memory_constraint on > x86-64 ;; to prevent LRA from converting the operand to the form '(mem (reg X))' > ;; where X is a base register. > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index > dadf453d6c0..ca799da5d7e 100644 > --- a/gcc/config/i386/i386.cc > +++ b/gcc/config/i386/i386.cc > @@ -5186,7 +5186,8 @@ standard_80387_constant_rtx (int idx) > XFmode); } > > -/* Return 1 if X is all bits 0 and 2 if X is all bits 1 > +/* Return 1 if X is all bits 0, 2 if X is all bits 1 > + and 3 if X is all bits 1 with zero extend > in supported SSE/AVX vector mode. */ > > int > @@ -5234,6 +5235,10 @@ standard_sse_constant_p (rtx x, machine_mode pred_mode) > } > } > > + if (vector_all_ones_zero_extend_half_operand (x, mode) > + || vector_all_ones_zero_extend_quarter_operand (x, mode)) > + return 3; > + > return 0; > } > > @@ -5341,6 +5346,25 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) > gcc_unreachable (); > } > } > + else if (vector_all_ones_zero_extend_half_operand (x, mode)) > + { > + if (GET_MODE_SIZE (mode) == 64) > + { > + gcc_assert (TARGET_AVX512F); > + return "vpcmpeqd \t %t0, %t0, %t0"; > + } > + else if (GET_MODE_SIZE (mode) == 32) > + { > + gcc_assert (TARGET_AVX); > + return "vpcmpeqd \t %x0, %x0, %x0"; > + } > + gcc_unreachable (); > + } > + else if (vector_all_ones_zero_extend_quarter_operand (x, mode)) > + { > + gcc_assert (TARGET_AVX512F); > + return "vpcmpeqd \t %x0, %x0, %x0"; > + } > Can we merge 2 vpcmpeqd \t %x0, %x0, %x0"; into 1? 
like else if (vector_all_ones_zero_extend_half_operand (x, mode) && GET_MODE_SIZE(mode) == 64)) return "vpcmpeqd \t %t0, %t0, %t0"; else if ((vector_all_ones_zero_extend_half_operand (x, mode) && GET_MODE_SIZE (mode) == 32) || vector_all_ones_zero_extend_quarter_operand (x, mode)) return "vpcmpeqd \t %x0, %x0, %x0"; > gcc_unreachable (); > } > diff --git a/gcc/config/i386/predicates.md > b/gcc/config/i386/predicates.md index 4f16bb748b5..655eabf793b 100644 > --- a/gcc/config/i386/predicates.md > +++ b/gcc/config/i386/predicates.md > @@ -1159,6 +1159,55 @@ > (match_test "INTEGRAL_MODE_P (GET_MODE (op))") > (match_test "op == CONSTM1_RTX (GET_MODE (op))"))) > > +/* Return true if operand is an 128/256bit all ones vector > + that zero-extends to 256/512bit. */ (define_predicate > +"vector_all_ones_zero_extend_half_operand" > + (match_code "const_vector") > +{ > + mode = GET_MODE (op); > + if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT > + || (GET_MODE_SIZE (mode) != 32 > + && GET_MODE_SIZE (mode) != 64)) > + return false; > + > + int nelts = CONST_VECTOR_NUNITS (op); > + for (int i = 0; i != nelts; i++) > + { > + rtx elt = CONST_VECTOR_ELT (op, i); > + if (i < nelts / 2 > + && elt != CONSTM1_RTX (GET_MODE_INNER (mode))) > + return false; > + if (i >= nelts / 2 > + && elt != CONST0_RTX (GET_MODE_INNER (mode))) > + return false; > + } > + return true; > +}) > + > +/* Return true if operand is an 128bit all ones vector > + that zero extends to 512bit. 
*/ > +(define_predicate "vector_all_ones_zero_extend_quarter_operand" > + (match_code "const_vector") > +{ > + mode = GET_MODE (op); > + if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT > + || GET_MODE_SIZE (mode) != 64) > + return false; > + > + int nelts = CONST_VECTOR_NUNITS (op); > + for (int i = 0; i != nelts; i++) > + { > + rtx elt = CONST_VECTOR_ELT (op, i); > + if (i < nelts / 4 > + && elt != CONSTM1_RTX (GET_MODE_INNER (mode))) > + return false; > + if (i >= nelts / 4 > + && elt != CONST0_RTX (GET_MODE_INNER (mode))) > + return false; > + } > + return true; > +}) > + > ; Return true when OP is operand acceptable for vector memory operand. > ; Only AVX can have misaligned memory operand. > (define_predicate "vector_memory_operand" > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index > d535c0af043..f804dbe9b7a 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -1318,9 +1318,9 @@ > > (define_insn "mov<mode>_internal" > [(set (match_operand:VMOVE 0 "nonimmediate_operand" > - "=v,v ,v ,m") > + "=v,v ,v,v ,m") > (match_operand:VMOVE 1 "nonimmediate_or_sse_const_operand" > - " C,<sseconstm1>,vm,v"))] > + " C,<sseconstm1>,BH,vm,v"))] > "TARGET_SSE > && (register_operand (operands[0], <MODE>mode) > || register_operand (operands[1], <MODE>mode)) @@ -1338,7 > +1338,7 @@ > gcc_unreachable (); > } > } > - [(set_attr "type" "sselog1,sselog1,ssemov,ssemov") > + [(set_attr "type" "sselog1,sselog1,sselog1,ssemov,ssemov") > (set_attr "prefix" "maybe_vex") > (set (attr "mode") > (cond [(match_test "TARGET_AVX") @@ -1349,7 +1349,7 @@ > (and (match_test "<MODE>mode == V2DFmode") > (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")) > (const_string "V4SF") > - (and (eq_attr "alternative" "3") > + (and (eq_attr "alternative" "4") > (match_test "TARGET_SSE_TYPELESS_STORES")) > (const_string "V4SF") > (and (eq_attr "alternative" "0") diff --git > a/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c > 
b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c > index 68378a556fb..7115b0a9dde 100644 > --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c > +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c > @@ -14,6 +14,6 @@ avx_test (void) > c[i] = a[i] * b[i+3]; > } > > -/* { dg-final { scan-assembler-not > "vmovups\[^\n\r]*movv8sf_internal/2" } } */ > -/* { dg-final { scan-assembler "movv4sf_internal/2" } } */ > +/* { dg-final { scan-assembler-not > +"vmovups\[^\n\r]*movv8sf_internal/3" } } */ > +/* { dg-final { scan-assembler "movv4sf_internal/3" } } */ > /* { dg-final { scan-assembler "vinsertf128" } } */ diff --git > a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c > b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c > index d82aecffda9..4c713959df2 100644 > --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c > +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c > @@ -17,6 +17,6 @@ avx_test (void) > d[i] = c[i] * 20.0; > } > > -/* { dg-final { scan-assembler-not "vmovups.*movv8sf_internal/3" } } > */ > -/* { dg-final { scan-assembler "vmovups.*movv4sf_internal/3" } } */ > +/* { dg-final { scan-assembler-not "vmovups.*movv8sf_internal/4" } } > +*/ > +/* { dg-final { scan-assembler "vmovups.*movv4sf_internal/4" } } */ > /* { dg-final { scan-assembler "vextractf128" } } */ diff --git > a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c > b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c > index be12529e8d5..4978c37f526 100644 > --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c > +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c > @@ -23,6 +23,6 @@ avx_test (void) > } > } > > -/* { dg-final { scan-assembler-not "vmovdqu.*movv32qi_internal/3" } } > */ > -/* { dg-final { scan-assembler "vmovdqu.*movv16qi_internal/3" } } */ > +/* { dg-final { scan-assembler-not "vmovdqu.*movv32qi_internal/4" } } > +*/ > +/* { dg-final { scan-assembler 
"vmovdqu.*movv16qi_internal/4" } } */ > /* { dg-final { scan-assembler "vextract.128" } } */ diff --git > a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c > b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c > index 918028df9ed..f909099bcb1 100644 > --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c > +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c > @@ -17,6 +17,6 @@ avx_test (void) > d[i] = c[i] * 20.0; > } > > -/* { dg-final { scan-assembler-not "vmovupd.*movv4df_internal/3" } } > */ > -/* { dg-final { scan-assembler "vmovupd.*movv2df_internal/3" } } */ > +/* { dg-final { scan-assembler-not "vmovupd.*movv4df_internal/4" } } > +*/ > +/* { dg-final { scan-assembler "vmovupd.*movv2df_internal/4" } } */ > /* { dg-final { scan-assembler "vextractf128" } } */ diff --git > a/gcc/testsuite/gcc.target/i386/pr94962-1.c > b/gcc/testsuite/gcc.target/i386/pr94962-1.c > new file mode 100644 > index 00000000000..e3b01249421 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr94962-1.c > @@ -0,0 +1,11 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx" } */ > +/* { dg-final { scan-assembler-not "vmovdqa" } } */ > +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 1 > +} } */ > + > +#include <immintrin.h> > + > +__m256i mask() > +{ > + return _mm256_zextsi128_si256(_mm_set1_epi8(-1)); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr94962-2.c > b/gcc/testsuite/gcc.target/i386/pr94962-2.c > new file mode 100644 > index 00000000000..4e10e927ba1 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr94962-2.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx512f" } */ > +/* { dg-final { scan-assembler-not "vmovdqa" } } */ > +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 1 > +} } */ > +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%ymm\[0-9\]" 1 > +} } */ > + > +#include <immintrin.h> > + > +__m512i mask1() > +{ > + return 
_mm512_zextsi128_si512(_mm_set1_epi8(-1)); > +} > + > +__m512i mask2() > +{ > + return _mm512_zextsi256_si512(_mm256_set1_epi8(-1)); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr94962-3.c > b/gcc/testsuite/gcc.target/i386/pr94962-3.c > new file mode 100644 > index 00000000000..8d0b9974435 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr94962-3.c > @@ -0,0 +1,64 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx512f" } */ > +/* { dg-final { scan-assembler-not "vmovdqa" } } */ > +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%ymm\[0-9\]" 4 > +} } */ > + > +typedef long long __v8di __attribute__ ((__vector_size__ (64))); > +typedef int __v16si __attribute__ ((__vector_size__ (64))); typedef > +short __v32hi __attribute__ ((__vector_size__ (64))); typedef char > +__v64qi __attribute__ ((__vector_size__ (64))); typedef long long > +__m512i __attribute__ ((__vector_size__ (64), __may_alias__)); > + > +__m512i > +__attribute__ ((noinline, noclone)) > +foo1 () > +{ > + return __extension__ (__m512i)(__v8di) { -1, -1, -1, -1, > + 0, 0, 0, 0 }; } > + > +__m512i > +__attribute__ ((noinline, noclone)) > +foo2 () > +{ > + return __extension__ (__m512i)(__v16si) { -1, -1, -1, -1, > + -1, -1, -1, -1, > + 0, 0, 0, 0, > + 0, 0, 0, 0 }; } > + > +__m512i > +__attribute__ ((noinline, noclone)) > +foo3 () > +{ > + return __extension__ (__m512i)(__v32hi) { -1, -1, -1, -1, > + -1, -1, -1, -1, > + -1, -1, -1, -1, > + -1, -1, -1, -1, > + 0, 0, 0, 0, > + 0, 0, 0, 0, > + 0, 0, 0, 0, > + 0, 0, 0, 0 }; } > + > +__m512i > +__attribute__ ((noinline, noclone)) > +foo4 () > +{ > + return __extension__ (__m512i)(__v64qi) { -1, -1, -1, -1, > + -1, -1, -1, -1, > + -1, -1, -1, -1, > + -1, -1, -1, -1, > + -1, -1, -1, -1, > + -1, -1, -1, -1, > + -1, -1, -1, -1, > + -1, -1, -1, -1, > + 0, 0, 0, 0, > + 0, 0, 0, 0, > + 0, 0, 0, 0, > + 0, 0, 0, 0, > + 0, 0, 0, 0, > + 0, 0, 0, 0, > + 0, 0, 0, 0, > + 0, 0, 0, 0 }; } > diff --git a/gcc/testsuite/gcc.target/i386/pr94962-4.c > 
b/gcc/testsuite/gcc.target/i386/pr94962-4.c > new file mode 100644 > index 00000000000..5502c39910b > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr94962-4.c > @@ -0,0 +1,49 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx" } */ > +/* { dg-final { scan-assembler-not "vmovdqa" } } */ > +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 4 > +} } */ > + > +typedef long long __v4di __attribute__ ((__vector_size__ (32))); > +typedef int __v8si __attribute__ ((__vector_size__ (32))); typedef > +short __v16hi __attribute__ ((__vector_size__ (32))); typedef char > +__v32qi __attribute__ ((__vector_size__ (32))); typedef long long > +__m256i __attribute__ ((__vector_size__ (32), __may_alias__)); > + > +__m256i > +__attribute__ ((noinline, noclone)) > +foo1 () > +{ > + return __extension__ (__m256i)(__v4di) { -1, -1, 0, 0 }; } > + > +__m256i > +__attribute__ ((noinline, noclone)) > +foo2 () > +{ > + return __extension__ (__m256i)(__v8si) { -1, -1, -1, -1, > + 0, 0, 0, 0 }; } > + > +__m256i > +__attribute__ ((noinline, noclone)) > +foo3 () > +{ > + return __extension__ (__m256i)(__v16hi) { -1, -1, -1, -1, > + -1, -1, -1, -1, > + 0, 0, 0, 0, > + 0, 0, 0, 0 }; } > + > +__m256i > +__attribute__ ((noinline, noclone)) > +foo4 () > +{ > + return __extension__ (__m256i)(__v32qi) { -1, -1, -1, -1, > + -1, -1, -1, -1, > + -1, -1, -1, -1, > + -1, -1, -1, -1, > + 0, 0, 0, 0, > + 0, 0, 0, 0, > + 0, 0, 0, 0, > + 0, 0, 0, 0 }; } > -- > 2.18.2 > Others LGTM. -- BR, Hongtao [-- Attachment #2: 0001-i386-Optimize-code-generation-of-__mm256_zextsi128_s.patch --] [-- Type: application/octet-stream, Size: 14899 bytes --] From d630db29e76417934b1c418ea625644e5ca1b6df Mon Sep 17 00:00:00 2001 From: "Hu, Lin1" <lin1.hu@intel.com> Date: Tue, 13 Sep 2022 16:28:54 +0800 Subject: [PATCH] i386: Optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1)) gcc/ChangeLog: PR target/94962 * config/i386/constraints.md (BH): New define_constraint. 
* config/i386/i386.cc (standard_sse_constant_p): Add return 3/4 when operand matches new predicate. (standard_sse_constant_opcode): Add new alternative branch to return "vpcmpeqd". * config/i386/predicates.md: (vector_all_ones_zero_extend_half_operand): New define_predicate. (vector_all_ones_zero_extend_quarter_operand): Ditto. * config/i386/sse.md (mov<mode>_internal): Add new constraint BH. gcc/testsuite/ChangeLog: PR target/94962 * gcc.target/i386/avx256-unaligned-load-1.c: Modify test. * gcc.target/i386/avx256-unaligned-store-1.c: Ditto. * gcc.target/i386/avx256-unaligned-store-2.c: Ditto. * gcc.target/i386/avx256-unaligned-store-3.c: Ditto. * gcc.target/i386/pr94962-1.c: New test. * gcc.target/i386/pr94962-2.c: Ditto. * gcc.target/i386/pr94962-3.c: Ditto. * gcc.target/i386/pr94962-4.c: Ditto. --- gcc/config/i386/constraints.md | 8 +++ gcc/config/i386/i386.cc | 14 +++- gcc/config/i386/predicates.md | 49 ++++++++++++++ gcc/config/i386/sse.md | 8 +-- .../gcc.target/i386/avx256-unaligned-load-1.c | 4 +- .../i386/avx256-unaligned-store-1.c | 4 +- .../i386/avx256-unaligned-store-2.c | 4 +- .../i386/avx256-unaligned-store-3.c | 4 +- gcc/testsuite/gcc.target/i386/pr94962-1.c | 11 ++++ gcc/testsuite/gcc.target/i386/pr94962-2.c | 17 +++++ gcc/testsuite/gcc.target/i386/pr94962-3.c | 64 +++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr94962-4.c | 49 ++++++++++++++ 12 files changed, 223 insertions(+), 13 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-4.c diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md index 7361687632f..95b2b142d41 100644 --- a/gcc/config/i386/constraints.md +++ b/gcc/config/i386/constraints.md @@ -168,6 +168,9 @@ ;; z Constant call address operand. ;; C Integer SSE constant with all bits set operand. 
;; F Floating-point SSE constant with all bits set operand. +;; H Integer SSE constant that is 128/256bit all ones +;; and zero-extand to 256/512bit, or 128bit all ones +;; and zero-extend to 512bit. ;; M x86-64 memory operand. (define_constraint "Bf" @@ -233,6 +236,11 @@ (and (match_test "TARGET_SSE") (match_operand 0 "float_vector_all_ones_operand"))) +(define_constraint "BH" + "@internal integer constant with last half/quarter bits set operand." + (ior (match_operand 0 "vector_all_ones_zero_extend_half_operand") + (match_operand 0 "vector_all_ones_zero_extend_quarter_operand"))) + ;; NB: Similar to 'm', but don't use define_memory_constraint on x86-64 ;; to prevent LRA from converting the operand to the form '(mem (reg X))' ;; where X is a base register. diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index dadf453d6c0..980bb09a4e0 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -5186,7 +5186,8 @@ standard_80387_constant_rtx (int idx) XFmode); } -/* Return 1 if X is all bits 0 and 2 if X is all bits 1 +/* Return 1 if X is all bits 0, 2 if X is all bits 1 + and 3 if X is all bits 1 with zero extend in supported SSE/AVX vector mode. 
*/ int @@ -5234,6 +5235,10 @@ standard_sse_constant_p (rtx x, machine_mode pred_mode) } } + if (vector_all_ones_zero_extend_half_operand (x, mode) + || vector_all_ones_zero_extend_quarter_operand (x, mode)) + return 3; + return 0; } @@ -5341,6 +5346,13 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) gcc_unreachable (); } } + else if (vector_all_ones_zero_extend_half_operand (x, mode) + && GET_MODE_SIZE (mode) == 64) + return "vpcmpeqd \t %t0, %t0, %t0"; + else if ((vector_all_ones_zero_extend_half_operand (x, mode) + && GET_MODE_SIZE (mode) == 32) + || vector_all_ones_zero_extend_quarter_operand (x, mode)) + return "vpcmpeqd \t %x0, %x0, %x0"; gcc_unreachable (); } diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 4f16bb748b5..655eabf793b 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1159,6 +1159,55 @@ (match_test "INTEGRAL_MODE_P (GET_MODE (op))") (match_test "op == CONSTM1_RTX (GET_MODE (op))"))) +/* Return true if operand is an 128/256bit all ones vector + that zero-extends to 256/512bit. */ +(define_predicate "vector_all_ones_zero_extend_half_operand" + (match_code "const_vector") +{ + mode = GET_MODE (op); + if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT + || (GET_MODE_SIZE (mode) != 32 + && GET_MODE_SIZE (mode) != 64)) + return false; + + int nelts = CONST_VECTOR_NUNITS (op); + for (int i = 0; i != nelts; i++) + { + rtx elt = CONST_VECTOR_ELT (op, i); + if (i < nelts / 2 + && elt != CONSTM1_RTX (GET_MODE_INNER (mode))) + return false; + if (i >= nelts / 2 + && elt != CONST0_RTX (GET_MODE_INNER (mode))) + return false; + } + return true; +}) + +/* Return true if operand is an 128bit all ones vector + that zero extends to 512bit. 
*/ +(define_predicate "vector_all_ones_zero_extend_quarter_operand" + (match_code "const_vector") +{ + mode = GET_MODE (op); + if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT + || GET_MODE_SIZE (mode) != 64) + return false; + + int nelts = CONST_VECTOR_NUNITS (op); + for (int i = 0; i != nelts; i++) + { + rtx elt = CONST_VECTOR_ELT (op, i); + if (i < nelts / 4 + && elt != CONSTM1_RTX (GET_MODE_INNER (mode))) + return false; + if (i >= nelts / 4 + && elt != CONST0_RTX (GET_MODE_INNER (mode))) + return false; + } + return true; +}) + ; Return true when OP is operand acceptable for vector memory operand. ; Only AVX can have misaligned memory operand. (define_predicate "vector_memory_operand" diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index d535c0af043..f804dbe9b7a 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -1318,9 +1318,9 @@ (define_insn "mov<mode>_internal" [(set (match_operand:VMOVE 0 "nonimmediate_operand" - "=v,v ,v ,m") + "=v,v ,v,v ,m") (match_operand:VMOVE 1 "nonimmediate_or_sse_const_operand" - " C,<sseconstm1>,vm,v"))] + " C,<sseconstm1>,BH,vm,v"))] "TARGET_SSE && (register_operand (operands[0], <MODE>mode) || register_operand (operands[1], <MODE>mode)) @@ -1338,7 +1338,7 @@ gcc_unreachable (); } } - [(set_attr "type" "sselog1,sselog1,ssemov,ssemov") + [(set_attr "type" "sselog1,sselog1,sselog1,ssemov,ssemov") (set_attr "prefix" "maybe_vex") (set (attr "mode") (cond [(match_test "TARGET_AVX") @@ -1349,7 +1349,7 @@ (and (match_test "<MODE>mode == V2DFmode") (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")) (const_string "V4SF") - (and (eq_attr "alternative" "3") + (and (eq_attr "alternative" "4") (match_test "TARGET_SSE_TYPELESS_STORES")) (const_string "V4SF") (and (eq_attr "alternative" "0") diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c index 68378a556fb..7115b0a9dde 100644 --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c 
+++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c @@ -14,6 +14,6 @@ avx_test (void) c[i] = a[i] * b[i+3]; } -/* { dg-final { scan-assembler-not "vmovups\[^\n\r]*movv8sf_internal/2" } } */ -/* { dg-final { scan-assembler "movv4sf_internal/2" } } */ +/* { dg-final { scan-assembler-not "vmovups\[^\n\r]*movv8sf_internal/3" } } */ +/* { dg-final { scan-assembler "movv4sf_internal/3" } } */ /* { dg-final { scan-assembler "vinsertf128" } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c index d82aecffda9..4c713959df2 100644 --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c @@ -17,6 +17,6 @@ avx_test (void) d[i] = c[i] * 20.0; } -/* { dg-final { scan-assembler-not "vmovups.*movv8sf_internal/3" } } */ -/* { dg-final { scan-assembler "vmovups.*movv4sf_internal/3" } } */ +/* { dg-final { scan-assembler-not "vmovups.*movv8sf_internal/4" } } */ +/* { dg-final { scan-assembler "vmovups.*movv4sf_internal/4" } } */ /* { dg-final { scan-assembler "vextractf128" } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c index be12529e8d5..4978c37f526 100644 --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c @@ -23,6 +23,6 @@ avx_test (void) } } -/* { dg-final { scan-assembler-not "vmovdqu.*movv32qi_internal/3" } } */ -/* { dg-final { scan-assembler "vmovdqu.*movv16qi_internal/3" } } */ +/* { dg-final { scan-assembler-not "vmovdqu.*movv32qi_internal/4" } } */ +/* { dg-final { scan-assembler "vmovdqu.*movv16qi_internal/4" } } */ /* { dg-final { scan-assembler "vextract.128" } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c index 918028df9ed..f909099bcb1 100644 --- 
a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c @@ -17,6 +17,6 @@ avx_test (void) d[i] = c[i] * 20.0; } -/* { dg-final { scan-assembler-not "vmovupd.*movv4df_internal/3" } } */ -/* { dg-final { scan-assembler "vmovupd.*movv2df_internal/3" } } */ +/* { dg-final { scan-assembler-not "vmovupd.*movv4df_internal/4" } } */ +/* { dg-final { scan-assembler "vmovupd.*movv2df_internal/4" } } */ /* { dg-final { scan-assembler "vextractf128" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr94962-1.c b/gcc/testsuite/gcc.target/i386/pr94962-1.c new file mode 100644 index 00000000000..e3b01249421 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr94962-1.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-not "vmovdqa" } } */ +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 1 } } */ + +#include <immintrin.h> + +__m256i mask() +{ + return _mm256_zextsi128_si256(_mm_set1_epi8(-1)); +} diff --git a/gcc/testsuite/gcc.target/i386/pr94962-2.c b/gcc/testsuite/gcc.target/i386/pr94962-2.c new file mode 100644 index 00000000000..4e10e927ba1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr94962-2.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx512f" } */ +/* { dg-final { scan-assembler-not "vmovdqa" } } */ +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 1 } } */ +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%ymm\[0-9\]" 1 } } */ + +#include <immintrin.h> + +__m512i mask1() +{ + return _mm512_zextsi128_si512(_mm_set1_epi8(-1)); +} + +__m512i mask2() +{ + return _mm512_zextsi256_si512(_mm256_set1_epi8(-1)); +} diff --git a/gcc/testsuite/gcc.target/i386/pr94962-3.c b/gcc/testsuite/gcc.target/i386/pr94962-3.c new file mode 100644 index 00000000000..8d0b9974435 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr94962-3.c @@ -0,0 +1,64 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 
-mavx512f" } */ +/* { dg-final { scan-assembler-not "vmovdqa" } } */ +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%ymm\[0-9\]" 4 } } */ + +typedef long long __v8di __attribute__ ((__vector_size__ (64))); +typedef int __v16si __attribute__ ((__vector_size__ (64))); +typedef short __v32hi __attribute__ ((__vector_size__ (64))); +typedef char __v64qi __attribute__ ((__vector_size__ (64))); +typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__)); + +__m512i +__attribute__ ((noinline, noclone)) +foo1 () +{ + return __extension__ (__m512i)(__v8di) { -1, -1, -1, -1, + 0, 0, 0, 0 }; +} + +__m512i +__attribute__ ((noinline, noclone)) +foo2 () +{ + return __extension__ (__m512i)(__v16si) { -1, -1, -1, -1, + -1, -1, -1, -1, + 0, 0, 0, 0, + 0, 0, 0, 0 }; +} + +__m512i +__attribute__ ((noinline, noclone)) +foo3 () +{ + return __extension__ (__m512i)(__v32hi) { -1, -1, -1, -1, + -1, -1, -1, -1, + -1, -1, -1, -1, + -1, -1, -1, -1, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0 }; +} + +__m512i +__attribute__ ((noinline, noclone)) +foo4 () +{ + return __extension__ (__m512i)(__v64qi) { -1, -1, -1, -1, + -1, -1, -1, -1, + -1, -1, -1, -1, + -1, -1, -1, -1, + -1, -1, -1, -1, + -1, -1, -1, -1, + -1, -1, -1, -1, + -1, -1, -1, -1, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0 }; +} diff --git a/gcc/testsuite/gcc.target/i386/pr94962-4.c b/gcc/testsuite/gcc.target/i386/pr94962-4.c new file mode 100644 index 00000000000..5502c39910b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr94962-4.c @@ -0,0 +1,49 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-not "vmovdqa" } } */ +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 4 } } */ + +typedef long long __v4di __attribute__ ((__vector_size__ (32))); +typedef int __v8si __attribute__ ((__vector_size__ (32))); +typedef short __v16hi __attribute__ ((__vector_size__ 
(32))); +typedef char __v32qi __attribute__ ((__vector_size__ (32))); +typedef long long __m256i __attribute__ ((__vector_size__ (32), __may_alias__)); + +__m256i +__attribute__ ((noinline, noclone)) +foo1 () +{ + return __extension__ (__m256i)(__v4di) { -1, -1, 0, 0 }; +} + +__m256i +__attribute__ ((noinline, noclone)) +foo2 () +{ + return __extension__ (__m256i)(__v8si) { -1, -1, -1, -1, + 0, 0, 0, 0 }; +} + +__m256i +__attribute__ ((noinline, noclone)) +foo3 () +{ + return __extension__ (__m256i)(__v16hi) { -1, -1, -1, -1, + -1, -1, -1, -1, + 0, 0, 0, 0, + 0, 0, 0, 0 }; +} + +__m256i +__attribute__ ((noinline, noclone)) +foo4 () +{ + return __extension__ (__m256i)(__v32qi) { -1, -1, -1, -1, + -1, -1, -1, -1, + -1, -1, -1, -1, + -1, -1, -1, -1, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0 }; +} -- 2.18.2 ^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] i386: Optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1)) 2022-09-23 3:07 ` Hu, Lin1 @ 2022-09-23 3:09 ` Hongtao Liu 0 siblings, 0 replies; 4+ messages in thread From: Hongtao Liu @ 2022-09-23 3:09 UTC (permalink / raw) To: Hu, Lin1; +Cc: gcc-patches, Liu, Hongtao On Fri, Sep 23, 2022 at 11:07 AM Hu, Lin1 <lin1.hu@intel.com> wrote: > > Hi, Hongtao > > I have modefied this patch and regtested on x86_64-pc-linux-gnu. > Ok. > BRs. > Lin > > -----Original Message----- > From: Hongtao Liu <crazylht@gmail.com> > Sent: Friday, September 23, 2022 9:48 AM > To: Hu, Lin1 <lin1.hu@intel.com> > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com> > Subject: Re: [PATCH] i386: Optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1)) > > On Thu, Sep 22, 2022 at 3:20 PM Hu, Lin1 via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > > > Hi all, > > > > This patch aims to optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1)). Reduce the number of instructions required to achieve the final result. > > > > Regtested on x86_64-pc-linux-gnu. Ok for trunk? > > > > BRs, > > Lin > > > > gcc/ChangeLog: > > > > PR target/94962 > > * config/i386/constraints.md (BH): New define_constraint. > > * config/i386/i386.cc (standard_sse_constant_p): Add return 3/4 when operand matches new predicate. > > (standard_sse_constant_opcode): Add new alternative branch to return "vpcmpeqd". > > * config/i386/predicates.md (vector_all_ones_zero_extend_half_operand): New define_predicate. > > (vector_all_ones_zero_extend_quarter_operand): Ditto. > > * config/i386/sse.md: Add constraint to insn "mov<mode>_internal". > (mov<mode>_internal): Add new constraint BH. > Put the insn name at first. > > > > gcc/testsuite/ChangeLog: > > > > PR target/94962 > > * gcc.target/i386/avx256-unaligned-load-1.c: Modify test. > > * gcc.target/i386/avx256-unaligned-store-1.c: Ditto. > > * gcc.target/i386/avx256-unaligned-store-2.c: Ditto. 
> > * gcc.target/i386/avx256-unaligned-store-3.c: Ditto. > > * gcc.target/i386/pr94962-1.c: New test. > > * gcc.target/i386/pr94962-2.c: Ditto. > > * gcc.target/i386/pr94962-3.c: Ditto. > > * gcc.target/i386/pr94962-4.c: Ditto. > > --- > > gcc/config/i386/constraints.md | 8 +++ > > gcc/config/i386/i386.cc | 26 +++++++- > > gcc/config/i386/predicates.md | 49 ++++++++++++++ > > gcc/config/i386/sse.md | 8 +-- > > .../gcc.target/i386/avx256-unaligned-load-1.c | 4 +- > > .../i386/avx256-unaligned-store-1.c | 4 +- > > .../i386/avx256-unaligned-store-2.c | 4 +- > > .../i386/avx256-unaligned-store-3.c | 4 +- > > gcc/testsuite/gcc.target/i386/pr94962-1.c | 11 ++++ > > gcc/testsuite/gcc.target/i386/pr94962-2.c | 17 +++++ > > gcc/testsuite/gcc.target/i386/pr94962-3.c | 64 +++++++++++++++++++ > > gcc/testsuite/gcc.target/i386/pr94962-4.c | 49 ++++++++++++++ > > 12 files changed, 235 insertions(+), 13 deletions(-) create mode > > 100644 gcc/testsuite/gcc.target/i386/pr94962-1.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-2.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-3.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-4.c > > > > diff --git a/gcc/config/i386/constraints.md > > b/gcc/config/i386/constraints.md index 7361687632f..95b2b142d41 100644 > > --- a/gcc/config/i386/constraints.md > > +++ b/gcc/config/i386/constraints.md > > @@ -168,6 +168,9 @@ > > ;; z Constant call address operand. > > ;; C Integer SSE constant with all bits set operand. > > ;; F Floating-point SSE constant with all bits set operand. > > +;; H Integer SSE constant that is 128/256bit all ones > > +;; and zero-extand to 256/512bit, or 128bit all ones > > +;; and zero-extend to 512bit. > > ;; M x86-64 memory operand. 
> > > > (define_constraint "Bf" > > @@ -233,6 +236,11 @@ > > (and (match_test "TARGET_SSE") > > (match_operand 0 "float_vector_all_ones_operand"))) > > > > +(define_constraint "BH" > > + "@internal integer constant with last half/quarter bits set operand." > > + (ior (match_operand 0 "vector_all_ones_zero_extend_half_operand") > > + (match_operand 0 > > +"vector_all_ones_zero_extend_quarter_operand"))) > > + > > ;; NB: Similar to 'm', but don't use define_memory_constraint on > > x86-64 ;; to prevent LRA from converting the operand to the form '(mem (reg X))' > > ;; where X is a base register. > > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index > > dadf453d6c0..ca799da5d7e 100644 > > --- a/gcc/config/i386/i386.cc > > +++ b/gcc/config/i386/i386.cc > > @@ -5186,7 +5186,8 @@ standard_80387_constant_rtx (int idx) > > XFmode); } > > > > -/* Return 1 if X is all bits 0 and 2 if X is all bits 1 > > +/* Return 1 if X is all bits 0, 2 if X is all bits 1 > > + and 3 if X is all bits 1 with zero extend > > in supported SSE/AVX vector mode. 
*/ > > > > int > > @@ -5234,6 +5235,10 @@ standard_sse_constant_p (rtx x, machine_mode pred_mode) > > } > > } > > > > + if (vector_all_ones_zero_extend_half_operand (x, mode) > > + || vector_all_ones_zero_extend_quarter_operand (x, mode)) > > + return 3; > > + > > return 0; > > } > > > > @@ -5341,6 +5346,25 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) > > gcc_unreachable (); > > } > > } > > + else if (vector_all_ones_zero_extend_half_operand (x, mode)) > > + { > > + if (GET_MODE_SIZE (mode) == 64) > > + { > > + gcc_assert (TARGET_AVX512F); > > + return "vpcmpeqd \t %t0, %t0, %t0"; > > + } > > + else if (GET_MODE_SIZE (mode) == 32) > > + { > > + gcc_assert (TARGET_AVX); > > + return "vpcmpeqd \t %x0, %x0, %x0"; > > + } > > + gcc_unreachable (); > > + } > > + else if (vector_all_ones_zero_extend_quarter_operand (x, mode)) > > + { > > + gcc_assert (TARGET_AVX512F); > > + return "vpcmpeqd \t %x0, %x0, %x0"; > > + } > > > Can we merge the 2 returns of "vpcmpeqd \t %x0, %x0, %x0" into 1? > Like: > else if (vector_all_ones_zero_extend_half_operand (x, mode) > && GET_MODE_SIZE (mode) == 64) > return "vpcmpeqd \t %t0, %t0, %t0"; > else if ((vector_all_ones_zero_extend_half_operand (x, mode) > && GET_MODE_SIZE (mode) == 32) > || vector_all_ones_zero_extend_quarter_operand (x, mode)) > return "vpcmpeqd \t %x0, %x0, %x0"; > > > gcc_unreachable (); > > } > > diff --git a/gcc/config/i386/predicates.md > > b/gcc/config/i386/predicates.md index 4f16bb748b5..655eabf793b 100644 > > --- a/gcc/config/i386/predicates.md > > +++ b/gcc/config/i386/predicates.md > > @@ -1159,6 +1159,55 @@ > > (match_test "INTEGRAL_MODE_P (GET_MODE (op))") > > (match_test "op == CONSTM1_RTX (GET_MODE (op))"))) > > > > +/* Return true if operand is an 128/256bit all ones vector > > + that zero-extends to 256/512bit. 
*/ (define_predicate > > +"vector_all_ones_zero_extend_half_operand" > > + (match_code "const_vector") > > +{ > > + mode = GET_MODE (op); > > + if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT > > + || (GET_MODE_SIZE (mode) != 32 > > + && GET_MODE_SIZE (mode) != 64)) > > + return false; > > + > > + int nelts = CONST_VECTOR_NUNITS (op); > > + for (int i = 0; i != nelts; i++) > > + { > > + rtx elt = CONST_VECTOR_ELT (op, i); > > + if (i < nelts / 2 > > + && elt != CONSTM1_RTX (GET_MODE_INNER (mode))) > > + return false; > > + if (i >= nelts / 2 > > + && elt != CONST0_RTX (GET_MODE_INNER (mode))) > > + return false; > > + } > > + return true; > > +}) > > + > > +/* Return true if operand is an 128bit all ones vector > > + that zero extends to 512bit. */ > > +(define_predicate "vector_all_ones_zero_extend_quarter_operand" > > + (match_code "const_vector") > > +{ > > + mode = GET_MODE (op); > > + if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT > > + || GET_MODE_SIZE (mode) != 64) > > + return false; > > + > > + int nelts = CONST_VECTOR_NUNITS (op); > > + for (int i = 0; i != nelts; i++) > > + { > > + rtx elt = CONST_VECTOR_ELT (op, i); > > + if (i < nelts / 4 > > + && elt != CONSTM1_RTX (GET_MODE_INNER (mode))) > > + return false; > > + if (i >= nelts / 4 > > + && elt != CONST0_RTX (GET_MODE_INNER (mode))) > > + return false; > > + } > > + return true; > > +}) > > + > > ; Return true when OP is operand acceptable for vector memory operand. > > ; Only AVX can have misaligned memory operand. 
> > (define_predicate "vector_memory_operand" > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index > > d535c0af043..f804dbe9b7a 100644 > > --- a/gcc/config/i386/sse.md > > +++ b/gcc/config/i386/sse.md > > @@ -1318,9 +1318,9 @@ > > > > (define_insn "mov<mode>_internal" > > [(set (match_operand:VMOVE 0 "nonimmediate_operand" > > - "=v,v ,v ,m") > > + "=v,v ,v,v ,m") > > (match_operand:VMOVE 1 "nonimmediate_or_sse_const_operand" > > - " C,<sseconstm1>,vm,v"))] > > + " C,<sseconstm1>,BH,vm,v"))] > > "TARGET_SSE > > && (register_operand (operands[0], <MODE>mode) > > || register_operand (operands[1], <MODE>mode)) @@ -1338,7 > > +1338,7 @@ > > gcc_unreachable (); > > } > > } > > - [(set_attr "type" "sselog1,sselog1,ssemov,ssemov") > > + [(set_attr "type" "sselog1,sselog1,sselog1,ssemov,ssemov") > > (set_attr "prefix" "maybe_vex") > > (set (attr "mode") > > (cond [(match_test "TARGET_AVX") @@ -1349,7 +1349,7 @@ > > (and (match_test "<MODE>mode == V2DFmode") > > (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")) > > (const_string "V4SF") > > - (and (eq_attr "alternative" "3") > > + (and (eq_attr "alternative" "4") > > (match_test "TARGET_SSE_TYPELESS_STORES")) > > (const_string "V4SF") > > (and (eq_attr "alternative" "0") diff --git > > a/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c > > b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c > > index 68378a556fb..7115b0a9dde 100644 > > --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c > > +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c > > @@ -14,6 +14,6 @@ avx_test (void) > > c[i] = a[i] * b[i+3]; > > } > > > > -/* { dg-final { scan-assembler-not > > "vmovups\[^\n\r]*movv8sf_internal/2" } } */ > > -/* { dg-final { scan-assembler "movv4sf_internal/2" } } */ > > +/* { dg-final { scan-assembler-not > > +"vmovups\[^\n\r]*movv8sf_internal/3" } } */ > > +/* { dg-final { scan-assembler "movv4sf_internal/3" } } */ > > /* { dg-final { scan-assembler "vinsertf128" } } */ 
diff --git > > a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c > > b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c > > index d82aecffda9..4c713959df2 100644 > > --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c > > +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c > > @@ -17,6 +17,6 @@ avx_test (void) > > d[i] = c[i] * 20.0; > > } > > > > -/* { dg-final { scan-assembler-not "vmovups.*movv8sf_internal/3" } } > > */ > > -/* { dg-final { scan-assembler "vmovups.*movv4sf_internal/3" } } */ > > +/* { dg-final { scan-assembler-not "vmovups.*movv8sf_internal/4" } } > > +*/ > > +/* { dg-final { scan-assembler "vmovups.*movv4sf_internal/4" } } */ > > /* { dg-final { scan-assembler "vextractf128" } } */ diff --git > > a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c > > b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c > > index be12529e8d5..4978c37f526 100644 > > --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c > > +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c > > @@ -23,6 +23,6 @@ avx_test (void) > > } > > } > > > > -/* { dg-final { scan-assembler-not "vmovdqu.*movv32qi_internal/3" } } > > */ > > -/* { dg-final { scan-assembler "vmovdqu.*movv16qi_internal/3" } } */ > > +/* { dg-final { scan-assembler-not "vmovdqu.*movv32qi_internal/4" } } > > +*/ > > +/* { dg-final { scan-assembler "vmovdqu.*movv16qi_internal/4" } } */ > > /* { dg-final { scan-assembler "vextract.128" } } */ diff --git > > a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c > > b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c > > index 918028df9ed..f909099bcb1 100644 > > --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c > > +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c > > @@ -17,6 +17,6 @@ avx_test (void) > > d[i] = c[i] * 20.0; > > } > > > > -/* { dg-final { scan-assembler-not "vmovupd.*movv4df_internal/3" } } > > */ > > -/* { dg-final { scan-assembler 
"vmovupd.*movv2df_internal/3" } } */ > > +/* { dg-final { scan-assembler-not "vmovupd.*movv4df_internal/4" } } > > +*/ > > +/* { dg-final { scan-assembler "vmovupd.*movv2df_internal/4" } } */ > > /* { dg-final { scan-assembler "vextractf128" } } */ diff --git > > a/gcc/testsuite/gcc.target/i386/pr94962-1.c > > b/gcc/testsuite/gcc.target/i386/pr94962-1.c > > new file mode 100644 > > index 00000000000..e3b01249421 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr94962-1.c > > @@ -0,0 +1,11 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -mavx" } */ > > +/* { dg-final { scan-assembler-not "vmovdqa" } } */ > > +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 1 > > +} } */ > > + > > +#include <immintrin.h> > > + > > +__m256i mask() > > +{ > > + return _mm256_zextsi128_si256(_mm_set1_epi8(-1)); > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr94962-2.c > > b/gcc/testsuite/gcc.target/i386/pr94962-2.c > > new file mode 100644 > > index 00000000000..4e10e927ba1 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr94962-2.c > > @@ -0,0 +1,17 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -mavx512f" } */ > > +/* { dg-final { scan-assembler-not "vmovdqa" } } */ > > +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 1 > > +} } */ > > +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%ymm\[0-9\]" 1 > > +} } */ > > + > > +#include <immintrin.h> > > + > > +__m512i mask1() > > +{ > > + return _mm512_zextsi128_si512(_mm_set1_epi8(-1)); > > +} > > + > > +__m512i mask2() > > +{ > > + return _mm512_zextsi256_si512(_mm256_set1_epi8(-1)); > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr94962-3.c > > b/gcc/testsuite/gcc.target/i386/pr94962-3.c > > new file mode 100644 > > index 00000000000..8d0b9974435 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr94962-3.c > > @@ -0,0 +1,64 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -mavx512f" } */ > > +/* { dg-final { 
scan-assembler-not "vmovdqa" } } */ > > +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%ymm\[0-9\]" 4 > > +} } */ > > + > > +typedef long long __v8di __attribute__ ((__vector_size__ (64))); > > +typedef int __v16si __attribute__ ((__vector_size__ (64))); typedef > > +short __v32hi __attribute__ ((__vector_size__ (64))); typedef char > > +__v64qi __attribute__ ((__vector_size__ (64))); typedef long long > > +__m512i __attribute__ ((__vector_size__ (64), __may_alias__)); > > + > > +__m512i > > +__attribute__ ((noinline, noclone)) > > +foo1 () > > +{ > > + return __extension__ (__m512i)(__v8di) { -1, -1, -1, -1, > > + 0, 0, 0, 0 }; } > > + > > +__m512i > > +__attribute__ ((noinline, noclone)) > > +foo2 () > > +{ > > + return __extension__ (__m512i)(__v16si) { -1, -1, -1, -1, > > + -1, -1, -1, -1, > > + 0, 0, 0, 0, > > + 0, 0, 0, 0 }; } > > + > > +__m512i > > +__attribute__ ((noinline, noclone)) > > +foo3 () > > +{ > > + return __extension__ (__m512i)(__v32hi) { -1, -1, -1, -1, > > + -1, -1, -1, -1, > > + -1, -1, -1, -1, > > + -1, -1, -1, -1, > > + 0, 0, 0, 0, > > + 0, 0, 0, 0, > > + 0, 0, 0, 0, > > + 0, 0, 0, 0 }; } > > + > > +__m512i > > +__attribute__ ((noinline, noclone)) > > +foo4 () > > +{ > > + return __extension__ (__m512i)(__v64qi) { -1, -1, -1, -1, > > + -1, -1, -1, -1, > > + -1, -1, -1, -1, > > + -1, -1, -1, -1, > > + -1, -1, -1, -1, > > + -1, -1, -1, -1, > > + -1, -1, -1, -1, > > + -1, -1, -1, -1, > > + 0, 0, 0, 0, > > + 0, 0, 0, 0, > > + 0, 0, 0, 0, > > + 0, 0, 0, 0, > > + 0, 0, 0, 0, > > + 0, 0, 0, 0, > > + 0, 0, 0, 0, > > + 0, 0, 0, 0 }; } > > diff --git a/gcc/testsuite/gcc.target/i386/pr94962-4.c > > b/gcc/testsuite/gcc.target/i386/pr94962-4.c > > new file mode 100644 > > index 00000000000..5502c39910b > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr94962-4.c > > @@ -0,0 +1,49 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -mavx" } */ > > +/* { dg-final { scan-assembler-not "vmovdqa" } } */ > > +/* { dg-final { 
scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 4 > > +} } */ > > + > > +typedef long long __v4di __attribute__ ((__vector_size__ (32))); > > +typedef int __v8si __attribute__ ((__vector_size__ (32))); typedef > > +short __v16hi __attribute__ ((__vector_size__ (32))); typedef char > > +__v32qi __attribute__ ((__vector_size__ (32))); typedef long long > > +__m256i __attribute__ ((__vector_size__ (32), __may_alias__)); > > + > > +__m256i > > +__attribute__ ((noinline, noclone)) > > +foo1 () > > +{ > > + return __extension__ (__m256i)(__v4di) { -1, -1, 0, 0 }; } > > + > > +__m256i > > +__attribute__ ((noinline, noclone)) > > +foo2 () > > +{ > > + return __extension__ (__m256i)(__v8si) { -1, -1, -1, -1, > > + 0, 0, 0, 0 }; } > > + > > +__m256i > > +__attribute__ ((noinline, noclone)) > > +foo3 () > > +{ > > + return __extension__ (__m256i)(__v16hi) { -1, -1, -1, -1, > > + -1, -1, -1, -1, > > + 0, 0, 0, 0, > > + 0, 0, 0, 0 }; } > > + > > +__m256i > > +__attribute__ ((noinline, noclone)) > > +foo4 () > > +{ > > + return __extension__ (__m256i)(__v32qi) { -1, -1, -1, -1, > > + -1, -1, -1, -1, > > + -1, -1, -1, -1, > > + -1, -1, -1, -1, > > + 0, 0, 0, 0, > > + 0, 0, 0, 0, > > + 0, 0, 0, 0, > > + 0, 0, 0, 0 }; } > > -- > > 2.18.2 > > > > Others LGTM. > > -- > BR, > Hongtao -- BR, Hongtao ^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2022-09-23 3:09 UTC | newest] Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2022-09-22 7:19 [PATCH] i386: Optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1)) Hu, Lin1 2022-09-23 1:47 ` Hongtao Liu 2022-09-23 3:07 ` Hu, Lin1 2022-09-23 3:09 ` Hongtao Liu
This is a public inbox; see the mirroring instructions for how to clone and mirror all data and code used for this inbox, as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).