* [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680] @ 2021-04-23 4:53 Hongtao Liu 2021-04-23 9:13 ` Jakub Jelinek 2021-06-11 8:59 ` [PATCH] i386: Fix up *vec_concat<mode>_0_1 [PR101007] Jakub Jelinek 0 siblings, 2 replies; 9+ messages in thread From: Hongtao Liu @ 2021-04-23 4:53 UTC (permalink / raw) To: GCC Patches; +Cc: Jakub Jelinek [-- Attachment #1: Type: text/plain, Size: 1169 bytes --] Hi: If the second operand of __builtin_shuffle is const vector 0, and with specific mask, it can be optimized to movq/vmovps. i.e. foo128: - vxorps %xmm1, %xmm1, %xmm1 - vmovlhps %xmm1, %xmm0, %xmm0 + vmovq %xmm0, %xmm0 foo256: - vxorps %xmm1, %xmm1, %xmm1 - vshuff32x4 $0, %ymm1, %ymm0, %ymm0 + vmovaps %xmm0, %xmm0 foo512: - vxorps %xmm1, %xmm1, %xmm1 - vshuff32x4 $68, %zmm1, %zmm0, %zmm0 + vmovaps %ymm0, %ymm0 Bootstrapped and regtested on x86-64-linux-gnu{-m32,}. Ok for trunk? gcc/ChangeLog: PR target/94680 * config/i386/sse.md (ssedoublevecmode): Add attribute for V64QI/V32HI/V16SI/V4DI. (ssehalfvecmode): Add attribute for V2DI/V2DF. (*vec_concatv4si_0): Extend to VI124_128. (*vec_concat<mode>_0): New pre-reload splitter. * config/i386/predicates.md (movq_parallel): New predicate. gcc/testsuite/ChangeLog: PR target/94680 * gcc.target/i386/avx-pr94680.c: New test. * gcc.target/i386/avx512f-pr94680.c: New test. * gcc.target/i386/sse2-pr94680.c: New test. -- BR, Hongtao [-- Attachment #2: 0001-i386-Optimize-__builtin_shuffle-when-it-s-used-to-ze.patch --] [-- Type: text/x-patch, Size: 12065 bytes --] From eec5469cdeecf0e6650e9d2963dea4117919c5d2 Mon Sep 17 00:00:00 2001 From: liuhongt <hongtao.liu@intel.com> Date: Thu, 22 Apr 2021 15:33:16 +0800 Subject: [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680] If the second operand of __builtin_shuffle is const vector 0, and with specific mask, it can be optimized to movq/vmovps. i.e. 
foo128: - vxorps %xmm1, %xmm1, %xmm1 - vmovlhps %xmm1, %xmm0, %xmm0 + vmovq %xmm0, %xmm0 foo256: - vxorps %xmm1, %xmm1, %xmm1 - vshuff32x4 $0, %ymm1, %ymm0, %ymm0 + vmovaps %xmm0, %xmm0 foo512: - vxorps %xmm1, %xmm1, %xmm1 - vshuff32x4 $68, %zmm1, %zmm0, %zmm0 + vmovaps %ymm0, %ymm0 gcc/ChangeLog: PR target/94680 * config/i386/sse.md (ssedoublevecmode): Add attribute for V64QI/V32HI/V16SI/V4DI. (ssehalfvecmode): Add attribute for V2DI/V2DF. (*vec_concatv4si_0): Extend to VI124_128. (*vec_concat<mode>_0): New pre-reload splitter. * config/i386/predicates.md (movq_parallel): New predicate. gcc/testsuite/ChangeLog: PR target/94680 * gcc.target/i386/avx-pr94680.c: New test. * gcc.target/i386/avx512f-pr94680.c: New test. * gcc.target/i386/sse2-pr94680.c: New test. --- gcc/config/i386/predicates.md | 33 ++++++++ gcc/config/i386/sse.md | 37 +++++++-- gcc/testsuite/gcc.target/i386/avx-pr94680.c | 59 ++++++++++++++ .../gcc.target/i386/avx512f-pr94680.c | 78 +++++++++++++++++++ gcc/testsuite/gcc.target/i386/sse2-pr94680.c | 51 ++++++++++++ 5 files changed, 250 insertions(+), 8 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/avx-pr94680.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-pr94680.c create mode 100644 gcc/testsuite/gcc.target/i386/sse2-pr94680.c diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index b1df8548af6..4b706003ed8 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1524,6 +1524,39 @@ (define_predicate "misaligned_operand" (and (match_code "mem") (match_test "MEM_ALIGN (op) < GET_MODE_BITSIZE (mode)"))) +;; Return true if OP is a parallel for an mov{d,q,dqa,ps,pd} vec_select, +;; where one of the two operands of the vec_concat is const0_operand. 
+(define_predicate "movq_parallel" + (match_code "parallel") +{ + unsigned nelt = XVECLEN (op, 0); + unsigned nelt2 = nelt >> 1; + unsigned i; + + if (nelt < 2) + return false; + + /* Validate that all of the elements are constants, + lower halves of permute are lower halves of the first operand, + upper halves of permute come from any of the second operand. */ + for (i = 0; i < nelt; ++i) + { + rtx er = XVECEXP (op, 0, i); + unsigned HOST_WIDE_INT ei; + + if (!CONST_INT_P (er)) + return 0; + ei = INTVAL (er); + if (i < nelt2 && ei != i) + return 0; + if (i >= nelt2 + && (ei < nelt || ei >= nelt<<1)) + return 0; + } + + return 1; +}) + ;; Return true if OP is a vzeroall operation, known to be a PARALLEL. (define_predicate "vzeroall_operation" (match_code "parallel") diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 9d3728d1cb0..b55636a3e12 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -812,19 +812,22 @@ (define_mode_attr sseintvecmodelower ;; Mapping of vector modes to a vector mode of double size (define_mode_attr ssedoublevecmode - [(V32QI "V64QI") (V16HI "V32HI") (V8SI "V16SI") (V4DI "V8DI") + [(V64QI "V128QI") (V32HI "V64HI") (V16SI "V32SI") (V8DI "V16DI") + (V32QI "V64QI") (V16HI "V32HI") (V8SI "V16SI") (V4DI "V8DI") (V16QI "V32QI") (V8HI "V16HI") (V4SI "V8SI") (V2DI "V4DI") + (V16SF "V32SF") (V8DF "V16DF") (V8SF "V16SF") (V4DF "V8DF") (V4SF "V8SF") (V2DF "V4DF")]) ;; Mapping of vector modes to a vector mode of half size +;; instead of V1DI/V1DF, DI/DF are used for V2DI/V2DF although they are scalar. 
(define_mode_attr ssehalfvecmode [(V64QI "V32QI") (V32HI "V16HI") (V16SI "V8SI") (V8DI "V4DI") (V4TI "V2TI") (V32QI "V16QI") (V16HI "V8HI") (V8SI "V4SI") (V4DI "V2DI") - (V16QI "V8QI") (V8HI "V4HI") (V4SI "V2SI") + (V16QI "V8QI") (V8HI "V4HI") (V4SI "V2SI") (V2DI "DI") (V16SF "V8SF") (V8DF "V4DF") (V8SF "V4SF") (V4DF "V2DF") - (V4SF "V2SF")]) + (V4SF "V2SF") (V2DF "DF")]) (define_mode_attr ssehalfvecmodelower [(V64QI "v32qi") (V32HI "v16hi") (V16SI "v8si") (V8DI "v4di") (V4TI "v2ti") @@ -15964,11 +15967,11 @@ (define_insn "*vec_concatv4si" (set_attr "prefix" "orig,maybe_evex,orig,orig,maybe_evex") (set_attr "mode" "TI,TI,V4SF,V2SF,V2SF")]) -(define_insn "*vec_concatv4si_0" - [(set (match_operand:V4SI 0 "register_operand" "=v,x") - (vec_concat:V4SI - (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y") - (match_operand:V2SI 2 "const0_operand" " C,C")))] +(define_insn "*vec_concat<mode>_0" + [(set (match_operand:VI124_128 0 "register_operand" "=v,x") + (vec_concat:VI124_128 + (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm,?!*y") + (match_operand:<ssehalfvecmode> 2 "const0_operand" " C,C")))] "TARGET_SSE2" "@ %vmovq\t{%1, %0|%0, %1} @@ -22154,6 +22157,24 @@ (define_insn "avx_vec_concat<mode>" (set_attr "prefix" "maybe_evex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn_and_split "*vec_concat<mode>_0" + [(set (match_operand:V 0 "register_operand") + (vec_select:V + (vec_concat:<ssedoublevecmode> + (match_operand:V 1 "nonimmediate_operand") + (match_operand:V 2 "const0_operand")) + (match_parallel 3 "movq_parallel" + [(match_operand 4 "const_int_operand")])))] + "ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (vec_concat:V (match_dup 1) (match_dup 5)))] +{ + operands[1] = gen_lowpart (<ssehalfvecmode>mode, operands[1]); + operands[5] = CONST0_RTX (<ssehalfvecmode>mode); +}) + (define_insn "vcvtph2ps<mask_name>" [(set (match_operand:V4SF 0 "register_operand" "=v") (vec_select:V4SF diff --git 
a/gcc/testsuite/gcc.target/i386/avx-pr94680.c b/gcc/testsuite/gcc.target/i386/avx-pr94680.c new file mode 100644 index 00000000000..4fe0f5bede6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx-pr94680.c @@ -0,0 +1,59 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx -mno-avx512f -O2" } */ +/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%xmm[0-9]} 6 } } */ +/* { dg-final { scan-assembler-not "pxor" } } */ + +typedef float v8sf __attribute__((vector_size(32))); +typedef double v4df __attribute__ ((vector_size (32))); +typedef long long v4di __attribute__((vector_size(32))); +typedef int v8si __attribute__((vector_size(32))); +typedef short v16hi __attribute__ ((vector_size (32))); +typedef char v32qi __attribute__ ((vector_size (32))); + +v4df +foo_v4df (v4df x) +{ + return __builtin_shuffle (x, (v4df) { 0, 0, 0, 0 }, (v4di) { 0, 1, 4, 5 }); +} + +v4di +foo_v4di (v4di x) +{ + return __builtin_shuffle (x, (v4di) { 0, 0, 0, 0 }, (v4di) { 0, 1, 4, 7 }); +} + +v8sf +foo_v8sf (v8sf x) +{ + return __builtin_shuffle (x, (v8sf) { 0, 0, 0, 0, 0, 0, 0, 0 }, + (v8si) { 0, 1, 2, 3, 8, 9, 10, 11 }); +} + +v8si +foo_v8si (v8si x) +{ + return __builtin_shuffle (x, (v8si) { 0, 0, 0, 0, 0, 0, 0, 0 }, + (v8si) { 0, 1, 2, 3, 13, 12, 11, 15 }); +} + +v16hi +foo_v16hi (v16hi x) +{ + return __builtin_shuffle (x, (v16hi) { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }, + (v16hi) { 0, 1, 2, 3, 4, 5, 6, 7, + 24, 17, 26, 19, 28, 21, 30, 23 }); + } + +v32qi +foo_v32qi (v32qi x) +{ + return __builtin_shuffle (x, (v32qi) { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }, + (v32qi) { 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 32, 49, 34, 58, 36, 53, 38, 39, + 40, 60, 42, 43, 63, 45, 46, 47 }); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c b/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c new file mode 100644 index 00000000000..442b79da420 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c @@ -0,0 +1,78 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -mavx512vbmi -O2" } */ +/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%ymm[0-9]} 6} } */ +/* { dg-final { scan-assembler-not "pxor" } } */ + + +typedef float v16sf __attribute__((vector_size(64))); +typedef double v8df __attribute__ ((vector_size (64))); +typedef long long v8di __attribute__((vector_size(64))); +typedef int v16si __attribute__((vector_size(64))); +typedef short v32hi __attribute__ ((vector_size (64))); +typedef char v64qi __attribute__ ((vector_size (64))); + +v8df +foo_v8df (v8df x) +{ + return __builtin_shuffle (x, (v8df) { 0, 0, 0, 0, 0, 0, 0, 0 }, + (v8di) { 0, 1, 2, 3, 8, 9, 10, 11 }); +} + +v8di +foo_v8di (v8di x) +{ + return __builtin_shuffle (x, (v8di) { 0, 0, 0, 0, 0, 0, 0, 0 }, + (v8di) { 0, 1, 2, 3, 8, 9, 10, 11 }); +} + +v16sf +foo_v16sf (v16sf x) +{ + return __builtin_shuffle (x, (v16sf) { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }, + (v16si) { 0, 1, 2, 3, 4, 5, 6, 7, + 16, 17, 18, 19, 20, 21, 22, 23 }); +} + +v16si +foo_v16si (v16si x) +{ + return __builtin_shuffle (x, (v16si) { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }, + (v16si) { 0, 1, 2, 3, 4, 5, 6, 7, + 16, 17, 18, 19, 20, 21, 22, 23 }); +} + +v32hi +foo_v32hi (v32hi x) +{ + return __builtin_shuffle (x, (v32hi) { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }, + (v32hi) { 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 32, 33, 34, 35, 36, 37, 38, 39, + 40,41, 42, 43, 44, 45, 46, 47 }); +} + +v64qi +foo_v64qi (v64qi x) +{ + return __builtin_shuffle (x, (v64qi) { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }, + (v64qi) {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 
+ 24, 25, 26, 27, 28, 29, 30, 31, + 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, + 88, 89, 90, 91, 92, 93, 94, 95 }); +} diff --git a/gcc/testsuite/gcc.target/i386/sse2-pr94680.c b/gcc/testsuite/gcc.target/i386/sse2-pr94680.c new file mode 100644 index 00000000000..7f4f98e3b1b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-pr94680.c @@ -0,0 +1,51 @@ +/* { dg-do compile } */ +/* { dg-options "-msse2 -mno-sse4.1 -O2" } */ +/* { dg-final { scan-assembler-times {(?n)mov.*%xmm[0-9]} 6} } */ +/* { dg-final { scan-assembler-not "pxor" } } */ + +typedef float v4sf __attribute__((vector_size(16))); +typedef double v2df __attribute__ ((vector_size (16))); +typedef long long v2di __attribute__((vector_size(16))); +typedef int v4si __attribute__((vector_size(16))); +typedef short v8hi __attribute__ ((vector_size (16))); +typedef char v16qi __attribute__ ((vector_size (16))); + +v2df +foo_v2df (v2df x) +{ + return __builtin_shuffle (x, (v2df) { 0, 0 }, (v2di) {0, 2}); +} + +v2di +foo_v2di (v2di x) +{ + return __builtin_shuffle (x, (v2di) { 0, 0 }, (v2di) {0, 3}); +} + +v4sf +foo_v4sf (v4sf x) +{ + return __builtin_shuffle (x, (v4sf) { 0, 0, 0, 0 }, (v4si) {0, 1, 4, 5}); +} + +v4si +foo_v4si (v4si x) +{ + return __builtin_shuffle (x, (v4si) { 0, 0, 0, 0 }, (v4si) {0, 1, 6, 7}); +} + +v8hi +foo_v8hi (v8hi x) +{ + return __builtin_shuffle (x, (v8hi) { 0, 0, 0, 0, 0, 0, 0, 0 }, + (v8hi) { 0, 1, 2, 3, 8, 12, 10, 13 }); +} + +v16qi +foo_v16qi (v16qi x) +{ + return __builtin_shuffle (x, (v16qi) { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }, + (v16qi) {0, 1, 2, 3, 4, 5, 6, 7, + 16, 24, 18, 26, 20, 28, 22, 30 }); +} -- 2.18.1 ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680] 2021-04-23 4:53 [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680] Hongtao Liu @ 2021-04-23 9:13 ` Jakub Jelinek 2021-04-25 6:57 ` Hongtao Liu 2021-06-11 8:59 ` [PATCH] i386: Fix up *vec_concat<mode>_0_1 [PR101007] Jakub Jelinek 1 sibling, 1 reply; 9+ messages in thread From: Jakub Jelinek @ 2021-04-23 9:13 UTC (permalink / raw) To: Hongtao Liu; +Cc: GCC Patches On Fri, Apr 23, 2021 at 12:53:58PM +0800, Hongtao Liu via Gcc-patches wrote: > + if (!CONST_INT_P (er)) > + return 0; > + ei = INTVAL (er); > + if (i < nelt2 && ei != i) > + return 0; > + if (i >= nelt2 > + && (ei < nelt || ei >= nelt<<1)) Formatting: 1) you have spaces followed by tab, remove the spaces; but, if (i >= nelt2 && (ei < nelt || ei >= nelt<<1)) fits on one line, so keep it on one line. 2) nelt<<1 should be nelt << 1 with spaces around the << > -(define_insn "*vec_concatv4si_0" > - [(set (match_operand:V4SI 0 "register_operand" "=v,x") > - (vec_concat:V4SI > - (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y") > - (match_operand:V2SI 2 "const0_operand" " C,C")))] > +(define_insn "*vec_concat<mode>_0" > + [(set (match_operand:VI124_128 0 "register_operand" "=v,x") > + (vec_concat:VI124_128 > + (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm,?!*y") > + (match_operand:<ssehalfvecmode> 2 "const0_operand" " C,C")))] > "TARGET_SSE2" > "@ > %vmovq\t{%1, %0|%0, %1} > @@ -22154,6 +22157,24 @@ (define_insn "avx_vec_concat<mode>" > (set_attr "prefix" "maybe_evex") > (set_attr "mode" "<sseinsnmode>")]) > > +(define_insn_and_split "*vec_concat<mode>_0" Would be better to use a different pattern name, *vec_concat<mode>_0 is already used in the above define_insn. Use some additional suffix after _0? 
> + return __builtin_shuffle (x, (v32qi) { 0, 0, 0, 0, 0, 0, 0, 0, > + 0, 0, 0, 0, 0, 0, 0, 0, > + 0, 0, 0, 0, 0, 0, 0, 0, > + 0, 0, 0, 0, 0, 0, 0, 0 }, > + (v32qi) { 0, 1, 2, 3, 4, 5, 6, 7, > + 8, 9, 10, 11, 12, 13, 14, 15, > + 32, 49, 34, 58, 36, 53, 38, 39, > + 40, 60, 42, 43, 63, 45, 46, 47 }); In this testcase the shuffles in the part taking indexes from the zero vector are nicely randomized. > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c > @@ -0,0 +1,78 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx512bw -mavx512vbmi -O2" } */ > +/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%ymm[0-9]} 6} } */ > +/* { dg-final { scan-assembler-not "pxor" } } */ > + > + > +typedef float v16sf __attribute__((vector_size(64))); > +typedef double v8df __attribute__ ((vector_size (64))); > +typedef long long v8di __attribute__((vector_size(64))); > +typedef int v16si __attribute__((vector_size(64))); > +typedef short v32hi __attribute__ ((vector_size (64))); > +typedef char v64qi __attribute__ ((vector_size (64))); > + > +v8df > +foo_v8df (v8df x) > +{ > + return __builtin_shuffle (x, (v8df) { 0, 0, 0, 0, 0, 0, 0, 0 }, > + (v8di) { 0, 1, 2, 3, 8, 9, 10, 11 }); > +} > + > +v8di > +foo_v8di (v8di x) > +{ > + return __builtin_shuffle (x, (v8di) { 0, 0, 0, 0, 0, 0, 0, 0 }, > + (v8di) { 0, 1, 2, 3, 8, 9, 10, 11 }); > +} > + > +v16sf > +foo_v16sf (v16sf x) > +{ > + return __builtin_shuffle (x, (v16sf) { 0, 0, 0, 0, 0, 0, 0, 0, > + 0, 0, 0, 0, 0, 0, 0, 0 }, > + (v16si) { 0, 1, 2, 3, 4, 5, 6, 7, > + 16, 17, 18, 19, 20, 21, 22, 23 }); > +} > + > +v16si > +foo_v16si (v16si x) > +{ > + return __builtin_shuffle (x, (v16si) { 0, 0, 0, 0, 0, 0, 0, 0, > + 0, 0, 0, 0, 0, 0, 0, 0 }, > + (v16si) { 0, 1, 2, 3, 4, 5, 6, 7, > + 16, 17, 18, 19, 20, 21, 22, 23 }); > +} > + > +v32hi > +foo_v32hi (v32hi x) > +{ > + return __builtin_shuffle (x, (v32hi) { 0, 0, 0, 0, 0, 0, 0, 0, > + 0, 0, 0, 0, 0, 0, 0, 0, > + 0, 0, 0, 0, 0, 0, 0, 0, > + 0, 0, 0, 0, 0, 0, 0, 
0 }, > + (v32hi) { 0, 1, 2, 3, 4, 5, 6, 7, > + 8, 9, 10, 11, 12, 13, 14, 15, > + 32, 33, 34, 35, 36, 37, 38, 39, > + 40,41, 42, 43, 44, 45, 46, 47 }); > +} > + > +v64qi > +foo_v64qi (v64qi x) > +{ > + return __builtin_shuffle (x, (v64qi) { 0, 0, 0, 0, 0, 0, 0, 0, > + 0, 0, 0, 0, 0, 0, 0, 0, > + 0, 0, 0, 0, 0, 0, 0, 0, > + 0, 0, 0, 0, 0, 0, 0, 0, > + 0, 0, 0, 0, 0, 0, 0, 0, > + 0, 0, 0, 0, 0, 0, 0, 0, > + 0, 0, 0, 0, 0, 0, 0, 0, > + 0, 0, 0, 0, 0, 0, 0, 0 }, > + (v64qi) {0, 1, 2, 3, 4, 5, 6, 7, > + 8, 9, 10, 11, 12, 13, 14, 15, > + 16, 17, 18, 19, 20, 21, 22, 23, > + 24, 25, 26, 27, 28, 29, 30, 31, > + 64, 65, 66, 67, 68, 69, 70, 71, > + 72, 73, 74, 75, 76, 77, 78, 79, > + 80, 81, 82, 83, 84, 85, 86, 87, > + 88, 89, 90, 91, 92, 93, 94, 95 }); Can't you randomize a little bit at least some of these too? Also, what happens with __builtin_shuffle (zero_vector, x, ...) (i.e. when you swap the two vectors and adjust correspondingly the permutation)? Will it be also recognized or do we just punt on those? Jakub ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680] 2021-04-23 9:13 ` Jakub Jelinek @ 2021-04-25 6:57 ` Hongtao Liu 2021-05-12 7:30 ` Hongtao Liu 2021-05-12 14:19 ` Jakub Jelinek 0 siblings, 2 replies; 9+ messages in thread From: Hongtao Liu @ 2021-04-25 6:57 UTC (permalink / raw) To: Jakub Jelinek; +Cc: GCC Patches [-- Attachment #1: Type: text/plain, Size: 6586 bytes --] On Fri, Apr 23, 2021 at 5:13 PM Jakub Jelinek <jakub@redhat.com> wrote: > > On Fri, Apr 23, 2021 at 12:53:58PM +0800, Hongtao Liu via Gcc-patches wrote: > > + if (!CONST_INT_P (er)) > > + return 0; > > + ei = INTVAL (er); > > + if (i < nelt2 && ei != i) > > + return 0; > > + if (i >= nelt2 > > + && (ei < nelt || ei >= nelt<<1)) > > Formatting: > 1) you have spaces followed by tab, remove the spaces; but, > if (i >= nelt2 && (ei < nelt || ei >= nelt<<1)) > fits on one line, so keep it on one line. > 2) nelt<<1 should be nelt << 1 with spaces around the << > Done. > > -(define_insn "*vec_concatv4si_0" > > - [(set (match_operand:V4SI 0 "register_operand" "=v,x") > > - (vec_concat:V4SI > > - (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y") > > - (match_operand:V2SI 2 "const0_operand" " C,C")))] > > +(define_insn "*vec_concat<mode>_0" > > + [(set (match_operand:VI124_128 0 "register_operand" "=v,x") > > + (vec_concat:VI124_128 > > + (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm,?!*y") > > + (match_operand:<ssehalfvecmode> 2 "const0_operand" " C,C")))] > > "TARGET_SSE2" > > "@ > > %vmovq\t{%1, %0|%0, %1} > > @@ -22154,6 +22157,24 @@ (define_insn "avx_vec_concat<mode>" > > (set_attr "prefix" "maybe_evex") > > (set_attr "mode" "<sseinsnmode>")]) > > > > +(define_insn_and_split "*vec_concat<mode>_0" > > Would be better to use a different pattern name, *vec_concat<mode>_0 > is already used in the above define_insn. > Use some additional suffix after _0? 
> Changed to "*vec_concat<mode>_0_1" > > + return __builtin_shuffle (x, (v32qi) { 0, 0, 0, 0, 0, 0, 0, 0, > > + 0, 0, 0, 0, 0, 0, 0, 0, > > + 0, 0, 0, 0, 0, 0, 0, 0, > > + 0, 0, 0, 0, 0, 0, 0, 0 }, > > + (v32qi) { 0, 1, 2, 3, 4, 5, 6, 7, > > + 8, 9, 10, 11, 12, 13, 14, 15, > > + 32, 49, 34, 58, 36, 53, 38, 39, > > + 40, 60, 42, 43, 63, 45, 46, 47 }); > > In this testcase the shuffles in the part taking indexes from the zero > vector are nicely randomized. > > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c > > @@ -0,0 +1,78 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-mavx512bw -mavx512vbmi -O2" } */ > > +/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%ymm[0-9]} 6} } */ > > +/* { dg-final { scan-assembler-not "pxor" } } */ > > + > > + > > +typedef float v16sf __attribute__((vector_size(64))); > > +typedef double v8df __attribute__ ((vector_size (64))); > > +typedef long long v8di __attribute__((vector_size(64))); > > +typedef int v16si __attribute__((vector_size(64))); > > +typedef short v32hi __attribute__ ((vector_size (64))); > > +typedef char v64qi __attribute__ ((vector_size (64))); > > + > > +v8df > > +foo_v8df (v8df x) > > +{ > > + return __builtin_shuffle (x, (v8df) { 0, 0, 0, 0, 0, 0, 0, 0 }, > > + (v8di) { 0, 1, 2, 3, 8, 9, 10, 11 }); > > +} > > + > > +v8di > > +foo_v8di (v8di x) > > +{ > > + return __builtin_shuffle (x, (v8di) { 0, 0, 0, 0, 0, 0, 0, 0 }, > > + (v8di) { 0, 1, 2, 3, 8, 9, 10, 11 }); > > +} > > + > > +v16sf > > +foo_v16sf (v16sf x) > > +{ > > + return __builtin_shuffle (x, (v16sf) { 0, 0, 0, 0, 0, 0, 0, 0, > > + 0, 0, 0, 0, 0, 0, 0, 0 }, > > + (v16si) { 0, 1, 2, 3, 4, 5, 6, 7, > > + 16, 17, 18, 19, 20, 21, 22, 23 }); > > +} > > + > > +v16si > > +foo_v16si (v16si x) > > +{ > > + return __builtin_shuffle (x, (v16si) { 0, 0, 0, 0, 0, 0, 0, 0, > > + 0, 0, 0, 0, 0, 0, 0, 0 }, > > + (v16si) { 0, 1, 2, 3, 4, 5, 6, 7, > > + 16, 17, 18, 19, 20, 21, 22, 23 }); > > +} > > + > > +v32hi > > +foo_v32hi 
(v32hi x) > > +{ > > + return __builtin_shuffle (x, (v32hi) { 0, 0, 0, 0, 0, 0, 0, 0, > > + 0, 0, 0, 0, 0, 0, 0, 0, > > + 0, 0, 0, 0, 0, 0, 0, 0, > > + 0, 0, 0, 0, 0, 0, 0, 0 }, > > + (v32hi) { 0, 1, 2, 3, 4, 5, 6, 7, > > + 8, 9, 10, 11, 12, 13, 14, 15, > > + 32, 33, 34, 35, 36, 37, 38, 39, > > + 40,41, 42, 43, 44, 45, 46, 47 }); > > +} > > + > > +v64qi > > +foo_v64qi (v64qi x) > > +{ > > + return __builtin_shuffle (x, (v64qi) { 0, 0, 0, 0, 0, 0, 0, 0, > > + 0, 0, 0, 0, 0, 0, 0, 0, > > + 0, 0, 0, 0, 0, 0, 0, 0, > > + 0, 0, 0, 0, 0, 0, 0, 0, > > + 0, 0, 0, 0, 0, 0, 0, 0, > > + 0, 0, 0, 0, 0, 0, 0, 0, > > + 0, 0, 0, 0, 0, 0, 0, 0, > > + 0, 0, 0, 0, 0, 0, 0, 0 }, > > + (v64qi) {0, 1, 2, 3, 4, 5, 6, 7, > > + 8, 9, 10, 11, 12, 13, 14, 15, > > + 16, 17, 18, 19, 20, 21, 22, 23, > > + 24, 25, 26, 27, 28, 29, 30, 31, > > + 64, 65, 66, 67, 68, 69, 70, 71, > > + 72, 73, 74, 75, 76, 77, 78, 79, > > + 80, 81, 82, 83, 84, 85, 86, 87, > > + 88, 89, 90, 91, 92, 93, 94, 95 }); > > Can't you randomize a little bit at least some of these too? > Done. > Also, what happens with __builtin_shuffle (zero_vector, x, ...) (i.e. when > you swap the two vectors and adjust correspondingly the permutation)? > Will it be also recognized or do we just punt on those? when building gimple, vec_perm(0, x, sel) is simplified to vec_perm(x, 0, sel*)(, with adjustment of selector), since arg0 is a constant. Not sure if rtl phase would do same simplification, anyway i add testcases for __builtin_shuffle (zero_vector, x, ...), but not extend pre-reload splitters to handle (vec_select: (vec_concat: cosnt0_rtx op1) selector). 
> > Jakub > -- BR, Hongtao [-- Attachment #2: 0001-i386-Optimize-__builtin_shuffle-when-it-s-used-to-ze.patch --] [-- Type: text/x-patch, Size: 15877 bytes --] From fb72db5a751708f3d13027c3c279fbf60e1e1aaf Mon Sep 17 00:00:00 2001 From: liuhongt <hongtao.liu@intel.com> Date: Thu, 22 Apr 2021 15:33:16 +0800 Subject: [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680] If the second operand of __builtin_shuffle is const vector 0, and with specific mask, it can be optimized to movq/vmovps. .i.e. foo128: - vxorps %xmm1, %xmm1, %xmm1 - vmovlhps %xmm1, %xmm0, %xmm0 + vmovq %xmm0, %xmm0 foo256: - vxorps %xmm1, %xmm1, %xmm1 - vshuff32x4 $0, %ymm1, %ymm0, %ymm0 + vmovaps %xmm0, %xmm0 foo512: - vxorps %xmm1, %xmm1, %xmm1 - vshuff32x4 $68, %zmm1, %zmm0, %zmm0 + vmovaps %ymm0, %ymm0 gcc/ChangeLog: PR target/94680 * config/i386/sse.md (ssedoublevecmode): Add attribute for V64QI/V32HI/V16SI/V4DI. (ssehalfvecmode): Add attribute for V2DI/V2DF. (*vec_concatv4si_0): Extend to VI124_128. (*vec_concat<mode>_0): New pre-reload splitter. * config/i386/predicates.md (movq_parallel): New predicate. gcc/testsuite/ChangeLog: PR target/94680 * gcc.target/i386/avx-pr94680.c: New test. * gcc.target/i386/avx512f-pr94680.c: New test. * gcc.target/i386/sse2-pr94680.c: New test. 
--- gcc/config/i386/predicates.md | 32 ++++ gcc/config/i386/sse.md | 37 ++++- gcc/testsuite/gcc.target/i386/avx-pr94680.c | 107 +++++++++++++ .../gcc.target/i386/avx512f-pr94680.c | 144 ++++++++++++++++++ gcc/testsuite/gcc.target/i386/sse2-pr94680.c | 91 +++++++++++ 5 files changed, 403 insertions(+), 8 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/avx-pr94680.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-pr94680.c create mode 100644 gcc/testsuite/gcc.target/i386/sse2-pr94680.c diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index b1df8548af6..201aacd65e6 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1524,6 +1524,38 @@ (define_predicate "misaligned_operand" (and (match_code "mem") (match_test "MEM_ALIGN (op) < GET_MODE_BITSIZE (mode)"))) +;; Return true if OP is a parallel for an mov{d,q,dqa,ps,pd} vec_select, +;; where one of the two operands of the vec_concat is const0_operand. +(define_predicate "movq_parallel" + (match_code "parallel") +{ + unsigned nelt = XVECLEN (op, 0); + unsigned nelt2 = nelt >> 1; + unsigned i; + + if (nelt < 2) + return false; + + /* Validate that all of the elements are constants, + lower halves of permute are lower halves of the first operand, + upper halves of permute come from any of the second operand. */ + for (i = 0; i < nelt; ++i) + { + rtx er = XVECEXP (op, 0, i); + unsigned HOST_WIDE_INT ei; + + if (!CONST_INT_P (er)) + return 0; + ei = INTVAL (er); + if (i < nelt2 && ei != i) + return 0; + if (i >= nelt2 && (ei < nelt || ei >= nelt << 1)) + return 0; + } + + return 1; +}) + ;; Return true if OP is a vzeroall operation, known to be a PARALLEL. 
(define_predicate "vzeroall_operation" (match_code "parallel") diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 9d3728d1cb0..369ff2b9f28 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -812,19 +812,22 @@ (define_mode_attr sseintvecmodelower ;; Mapping of vector modes to a vector mode of double size (define_mode_attr ssedoublevecmode - [(V32QI "V64QI") (V16HI "V32HI") (V8SI "V16SI") (V4DI "V8DI") + [(V64QI "V128QI") (V32HI "V64HI") (V16SI "V32SI") (V8DI "V16DI") + (V32QI "V64QI") (V16HI "V32HI") (V8SI "V16SI") (V4DI "V8DI") (V16QI "V32QI") (V8HI "V16HI") (V4SI "V8SI") (V2DI "V4DI") + (V16SF "V32SF") (V8DF "V16DF") (V8SF "V16SF") (V4DF "V8DF") (V4SF "V8SF") (V2DF "V4DF")]) ;; Mapping of vector modes to a vector mode of half size +;; instead of V1DI/V1DF, DI/DF are used for V2DI/V2DF although they are scalar. (define_mode_attr ssehalfvecmode [(V64QI "V32QI") (V32HI "V16HI") (V16SI "V8SI") (V8DI "V4DI") (V4TI "V2TI") (V32QI "V16QI") (V16HI "V8HI") (V8SI "V4SI") (V4DI "V2DI") - (V16QI "V8QI") (V8HI "V4HI") (V4SI "V2SI") + (V16QI "V8QI") (V8HI "V4HI") (V4SI "V2SI") (V2DI "DI") (V16SF "V8SF") (V8DF "V4DF") (V8SF "V4SF") (V4DF "V2DF") - (V4SF "V2SF")]) + (V4SF "V2SF") (V2DF "DF")]) (define_mode_attr ssehalfvecmodelower [(V64QI "v32qi") (V32HI "v16hi") (V16SI "v8si") (V8DI "v4di") (V4TI "v2ti") @@ -15964,11 +15967,11 @@ (define_insn "*vec_concatv4si" (set_attr "prefix" "orig,maybe_evex,orig,orig,maybe_evex") (set_attr "mode" "TI,TI,V4SF,V2SF,V2SF")]) -(define_insn "*vec_concatv4si_0" - [(set (match_operand:V4SI 0 "register_operand" "=v,x") - (vec_concat:V4SI - (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y") - (match_operand:V2SI 2 "const0_operand" " C,C")))] +(define_insn "*vec_concat<mode>_0" + [(set (match_operand:VI124_128 0 "register_operand" "=v,x") + (vec_concat:VI124_128 + (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm,?!*y") + (match_operand:<ssehalfvecmode> 2 "const0_operand" " C,C")))] "TARGET_SSE2" "@ 
%vmovq\t{%1, %0|%0, %1} @@ -22154,6 +22157,24 @@ (define_insn "avx_vec_concat<mode>" (set_attr "prefix" "maybe_evex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn_and_split "*vec_concat<mode>_0_1" + [(set (match_operand:V 0 "register_operand") + (vec_select:V + (vec_concat:<ssedoublevecmode> + (match_operand:V 1 "nonimmediate_operand") + (match_operand:V 2 "const0_operand")) + (match_parallel 3 "movq_parallel" + [(match_operand 4 "const_int_operand")])))] + "ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (vec_concat:V (match_dup 1) (match_dup 5)))] +{ + operands[1] = gen_lowpart (<ssehalfvecmode>mode, operands[1]); + operands[5] = CONST0_RTX (<ssehalfvecmode>mode); +}) + (define_insn "vcvtph2ps<mask_name>" [(set (match_operand:V4SF 0 "register_operand" "=v") (vec_select:V4SF diff --git a/gcc/testsuite/gcc.target/i386/avx-pr94680.c b/gcc/testsuite/gcc.target/i386/avx-pr94680.c new file mode 100644 index 00000000000..a89e4967f64 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx-pr94680.c @@ -0,0 +1,107 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx -mno-avx512f -O2" } */ +/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%xmm[0-9]} 12 } } */ +/* { dg-final { scan-assembler-not "pxor" } } */ + +typedef float v8sf __attribute__((vector_size(32))); +typedef double v4df __attribute__ ((vector_size (32))); +typedef long long v4di __attribute__((vector_size(32))); +typedef int v8si __attribute__((vector_size(32))); +typedef short v16hi __attribute__ ((vector_size (32))); +typedef char v32qi __attribute__ ((vector_size (32))); + +v4df +foo_v4df (v4df x) +{ + return __builtin_shuffle (x, (v4df) { 0, 0, 0, 0 }, (v4di) { 0, 1, 4, 5 }); +} + +v4df +foo_v4df_l (v4df x) +{ + return __builtin_shuffle ((v4df) { 0, 0, 0, 0 }, x, (v4di) { 4, 5, 1, 2 }); +} + +v4di +foo_v4di (v4di x) +{ + return __builtin_shuffle (x, (v4di) { 0, 0, 0, 0 }, (v4di) { 0, 1, 4, 7 }); +} + +v4di +foo_v4di_l (v4di x) +{ + return __builtin_shuffle ((v4di) { 0, 0, 0, 
0 }, x, (v4di) { 4, 5, 3, 1 }); +} + +v8sf +foo_v8sf (v8sf x) +{ + return __builtin_shuffle ((v8sf) { 0, 0, 0, 0, 0, 0, 0, 0 }, x, + (v8si) { 8, 9, 10, 11, 0, 1, 2, 3 }); +} + +v8sf +foo_v8sf_l (v8sf x) +{ + return __builtin_shuffle (x, (v8sf) { 0, 0, 0, 0, 0, 0, 0, 0 }, + (v8si) { 0, 1, 2, 3, 8, 9, 10, 11 }); +} + +v8si +foo_v8si (v8si x) +{ + return __builtin_shuffle (x, (v8si) { 0, 0, 0, 0, 0, 0, 0, 0 }, + (v8si) { 0, 1, 2, 3, 13, 12, 11, 15 }); +} + +v8si +foo_v8si_l (v8si x) +{ + return __builtin_shuffle ((v8si) { 0, 0, 0, 0, 0, 0, 0, 0 }, x, + (v8si) { 8, 9, 10, 11, 7, 6, 5, 4 }); +} + +v16hi +foo_v16hi (v16hi x) +{ + return __builtin_shuffle (x, (v16hi) { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }, + (v16hi) { 0, 1, 2, 3, 4, 5, 6, 7, + 24, 17, 26, 19, 28, 21, 30, 23 }); +} + +v16hi +foo_v16hi_l (v16hi x) +{ + return __builtin_shuffle ((v16hi) { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }, x, + (v16hi) { 16, 17, 18, 20, 21, 22, 23, + 15, 0, 13, 2, 11, 4, 9, 6 }); +} + +v32qi +foo_v32qi (v32qi x) +{ + return __builtin_shuffle (x, (v32qi) { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }, + (v32qi) { 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 32, 49, 34, 58, 36, 53, 38, 39, + 40, 60, 42, 43, 63, 45, 46, 47 }); +} + +v32qi +foo_v32qi_l (v32qi x) +{ + return __builtin_shuffle ((v32qi) { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }, x, + (v32qi) { 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 31, 0, 29, 2, 27, 4, 25, 6, + 23, 8, 21, 10, 19, 12, 17, 14 }); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c b/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c new file mode 100644 index 00000000000..c27431aae72 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c @@ -0,0 +1,144 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -mavx512vbmi -O2" } */ +/* { dg-final { 
scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%ymm[0-9]} 12} } */ +/* { dg-final { scan-assembler-not "pxor" } } */ + + +typedef float v16sf __attribute__((vector_size(64))); +typedef double v8df __attribute__ ((vector_size (64))); +typedef long long v8di __attribute__((vector_size(64))); +typedef int v16si __attribute__((vector_size(64))); +typedef short v32hi __attribute__ ((vector_size (64))); +typedef char v64qi __attribute__ ((vector_size (64))); + +v8df +foo_v8df (v8df x) +{ + return __builtin_shuffle (x, (v8df) { 0, 0, 0, 0, 0, 0, 0, 0 }, + (v8di) { 0, 1, 2, 3, 15, 14, 10, 11 }); +} + +v8df +foo_v8df_l (v8df x) +{ + return __builtin_shuffle ((v8df) { 0, 0, 0, 0, 0, 0, 0, 0 }, x, + (v8di) { 8, 9, 10, 11, 0, 1, 2, 3 }); +} + +v8di +foo_v8di (v8di x) +{ + return __builtin_shuffle (x, (v8di) { 0, 0, 0, 0, 0, 0, 0, 0 }, + (v8di) { 0, 1, 2, 3, 8, 9, 10, 11 }); +} + +v8di +foo_v8di_l (v8di x) +{ + return __builtin_shuffle ((v8di) { 0, 0, 0, 0, 0, 0, 0, 0 }, x, + (v8di) { 8, 9, 10, 11, 7, 6, 5, 4 }); +} + +v16sf +foo_v16sf (v16sf x) +{ + return __builtin_shuffle (x, (v16sf) { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }, + (v16si) { 0, 1, 2, 3, 4, 5, 6, 7, + 16, 17, 18, 19, 20, 21, 22, 23 }); +} + +v16sf +foo_v16sf_l (v16sf x) +{ + return __builtin_shuffle ((v16sf) { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }, x, + (v16si) { 16, 17, 18, 19, 20, 21, 22, 23, + 0, 15, 2, 13, 4, 11, 6, 9 }); +} + +v16si +foo_v16si (v16si x) +{ + return __builtin_shuffle (x, (v16si) { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }, + (v16si) { 0, 1, 2, 3, 4, 5, 6, 7, + 31, 30, 29, 28, 20, 21, 22, 23 }); +} + +v16si +foo_v16si_l (v16si x) +{ + return __builtin_shuffle ((v16si) { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }, x, + (v16si) { 16, 17, 18, 19, 20, 21, 22, 23, + 15, 0, 13, 2, 11, 4, 9, 6 }); +} + +v32hi +foo_v32hi (v32hi x) +{ + return __builtin_shuffle (x, (v32hi) { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 
0, 0, 0, 0 }, + (v32hi) { 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 63, 33, 61, 35, 59, 37, 57, 39, + 55, 41, 53, 43, 51, 45, 49, 47 }); +} + +v32hi +foo_v32hi_l (v32hi x) +{ + return __builtin_shuffle ((v32hi) { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }, x, + (v32hi) { 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 31, 0, 29, 2, 27, 4, 25, 6, + 23, 8, 21, 10, 19, 12, 17, 14 }); +} + +v64qi +foo_v64qi (v64qi x) +{ + return __builtin_shuffle (x, (v64qi) { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }, + (v64qi) {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 64, 127, 66, 125, 68, 123, 70, 121, + 72, 119, 74, 117, 76, 115, 78, 113, + 80, 111, 82, 109, 84, 107, 86, 105, + 88, 103, 90, 101, 92, 99, 94, 97 }); +} + +v64qi +foo_v64qi_l (v64qi x) +{ + return __builtin_shuffle ((v64qi) { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }, x, + (v64qi) { 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, + 88, 89, 90, 91, 92, 93, 94, 95, + 0, 63, 2, 61, 4, 59, 6, 57, + 8, 55, 10, 53, 12, 51, 14, 49, + 16, 47, 18, 45, 20, 43, 22, 41, + 24, 39, 26, 37, 28, 35, 30, 33 }); +} diff --git a/gcc/testsuite/gcc.target/i386/sse2-pr94680.c b/gcc/testsuite/gcc.target/i386/sse2-pr94680.c new file mode 100644 index 00000000000..7e0ff9f6bc7 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-pr94680.c @@ -0,0 +1,91 @@ +/* { dg-do compile } */ +/* { dg-options "-msse2 -mno-sse4.1 -O2" } */ +/* { dg-final { scan-assembler-times 
{(?n)(?:mov|psrldq).*%xmm[0-9]} 12 } } */ +/* { dg-final { scan-assembler-not "pxor" } } */ + +typedef float v4sf __attribute__((vector_size(16))); +typedef double v2df __attribute__ ((vector_size (16))); +typedef long long v2di __attribute__((vector_size(16))); +typedef int v4si __attribute__((vector_size(16))); +typedef short v8hi __attribute__ ((vector_size (16))); +typedef char v16qi __attribute__ ((vector_size (16))); + +v2df +foo_v2df (v2df x) +{ + return __builtin_shuffle (x, (v2df) { 0, 0 }, (v2di) {0, 2}); +} + +v2df +foo_v2df_l (v2df x) +{ + return __builtin_shuffle ((v2df) { 0, 0 }, x, (v2di) {3, 1}); +} + +v2di +foo_v2di (v2di x) +{ + return __builtin_shuffle (x, (v2di) { 0, 0 }, (v2di) {0, 3}); +} + +v2di +foo_v2di_l (v2di x) +{ + return __builtin_shuffle ((v2di) { 0, 0 }, x, (v2di) {3, 0}); +} + +v4sf +foo_v4sf (v4sf x) +{ + return __builtin_shuffle (x, (v4sf) { 0, 0, 0, 0 }, (v4si) {0, 1, 4, 5}); +} + +v4sf +foo_v4sf_l (v4sf x) +{ + return __builtin_shuffle ((v4sf) { 0, 0, 0, 0 }, x, (v4si) {4, 5, 3, 1}); +} + +v4si +foo_v4si (v4si x) +{ + return __builtin_shuffle (x, (v4si) { 0, 0, 0, 0 }, (v4si) {0, 1, 6, 7}); +} + +v4si +foo_v4si_l (v4si x) +{ + return __builtin_shuffle ((v4si) { 0, 0, 0, 0 }, x, (v4si) {4, 5, 1, 2}); +} + +v8hi +foo_v8hi (v8hi x) +{ + return __builtin_shuffle (x, (v8hi) { 0, 0, 0, 0, 0, 0, 0, 0 }, + (v8hi) { 0, 1, 2, 3, 8, 12, 10, 13 }); +} + +v8hi +foo_v8hi_l (v8hi x) +{ + return __builtin_shuffle ((v8hi) { 0, 0, 0, 0, 0, 0, 0, 0 }, x, + (v8hi) { 8, 9, 10, 11, 7, 6, 5, 4 }); +} + +v16qi +foo_v16qi (v16qi x) +{ + return __builtin_shuffle (x, (v16qi) { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }, + (v16qi) {0, 1, 2, 3, 4, 5, 6, 7, + 16, 24, 18, 26, 20, 28, 22, 30 }); +} + +v16qi +foo_v16qi_l (v16qi x) +{ + return __builtin_shuffle ((v16qi) { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }, x, + (v16qi) { 16, 17, 18, 19, 20, 21, 22, 23, + 15, 0, 13, 2, 11, 4, 9, 6 }); +} -- 2.18.1 ^ permalink raw reply [flat|nested] 9+ 
messages in thread
* Re: [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680] 2021-04-25 6:57 ` Hongtao Liu @ 2021-05-12 7:30 ` Hongtao Liu 2021-05-12 14:19 ` Jakub Jelinek 1 sibling, 0 replies; 9+ messages in thread From: Hongtao Liu @ 2021-05-12 7:30 UTC (permalink / raw) To: Jakub Jelinek; +Cc: GCC Patches ping. On Sun, Apr 25, 2021 at 2:57 PM Hongtao Liu <crazylht@gmail.com> wrote: > > On Fri, Apr 23, 2021 at 5:13 PM Jakub Jelinek <jakub@redhat.com> wrote: > > > > On Fri, Apr 23, 2021 at 12:53:58PM +0800, Hongtao Liu via Gcc-patches wrote: > > > + if (!CONST_INT_P (er)) > > > + return 0; > > > + ei = INTVAL (er); > > > + if (i < nelt2 && ei != i) > > > + return 0; > > > + if (i >= nelt2 > > > + && (ei < nelt || ei >= nelt<<1)) > > > > Formatting: > > 1) you have spaces followed by tab, remove the spaces; but, > > if (i >= nelt2 && (ei < nelt || ei >= nelt<<1)) > > fits on one line, so keep it on one line. > > 2) nelt<<1 should be nelt << 1 with spaces around the << > > > > Done. > > > > -(define_insn "*vec_concatv4si_0" > > > - [(set (match_operand:V4SI 0 "register_operand" "=v,x") > > > - (vec_concat:V4SI > > > - (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y") > > > - (match_operand:V2SI 2 "const0_operand" " C,C")))] > > > +(define_insn "*vec_concat<mode>_0" > > > + [(set (match_operand:VI124_128 0 "register_operand" "=v,x") > > > + (vec_concat:VI124_128 > > > + (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm,?!*y") > > > + (match_operand:<ssehalfvecmode> 2 "const0_operand" " C,C")))] > > > "TARGET_SSE2" > > > "@ > > > %vmovq\t{%1, %0|%0, %1} > > > @@ -22154,6 +22157,24 @@ (define_insn "avx_vec_concat<mode>" > > > (set_attr "prefix" "maybe_evex") > > > (set_attr "mode" "<sseinsnmode>")]) > > > > > > +(define_insn_and_split "*vec_concat<mode>_0" > > > > Would be better to use a different pattern name, *vec_concat<mode>_0 > > is already used in the above define_insn. 
> > Use some additional suffix after _0? > > > > Changed to "*vec_concat<mode>_0_1" > > > > + return __builtin_shuffle (x, (v32qi) { 0, 0, 0, 0, 0, 0, 0, 0, > > > + 0, 0, 0, 0, 0, 0, 0, 0, > > > + 0, 0, 0, 0, 0, 0, 0, 0, > > > + 0, 0, 0, 0, 0, 0, 0, 0 }, > > > + (v32qi) { 0, 1, 2, 3, 4, 5, 6, 7, > > > + 8, 9, 10, 11, 12, 13, 14, 15, > > > + 32, 49, 34, 58, 36, 53, 38, 39, > > > + 40, 60, 42, 43, 63, 45, 46, 47 }); > > > > In this testcase the shuffles in the part taking indexes from the zero > > vector are nicely randomized. > > > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c > > > @@ -0,0 +1,78 @@ > > > +/* { dg-do compile } */ > > > +/* { dg-options "-mavx512bw -mavx512vbmi -O2" } */ > > > +/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%ymm[0-9]} 6} } */ > > > +/* { dg-final { scan-assembler-not "pxor" } } */ > > > + > > > + > > > +typedef float v16sf __attribute__((vector_size(64))); > > > +typedef double v8df __attribute__ ((vector_size (64))); > > > +typedef long long v8di __attribute__((vector_size(64))); > > > +typedef int v16si __attribute__((vector_size(64))); > > > +typedef short v32hi __attribute__ ((vector_size (64))); > > > +typedef char v64qi __attribute__ ((vector_size (64))); > > > + > > > +v8df > > > +foo_v8df (v8df x) > > > +{ > > > + return __builtin_shuffle (x, (v8df) { 0, 0, 0, 0, 0, 0, 0, 0 }, > > > + (v8di) { 0, 1, 2, 3, 8, 9, 10, 11 }); > > > +} > > > + > > > +v8di > > > +foo_v8di (v8di x) > > > +{ > > > + return __builtin_shuffle (x, (v8di) { 0, 0, 0, 0, 0, 0, 0, 0 }, > > > + (v8di) { 0, 1, 2, 3, 8, 9, 10, 11 }); > > > +} > > > + > > > +v16sf > > > +foo_v16sf (v16sf x) > > > +{ > > > + return __builtin_shuffle (x, (v16sf) { 0, 0, 0, 0, 0, 0, 0, 0, > > > + 0, 0, 0, 0, 0, 0, 0, 0 }, > > > + (v16si) { 0, 1, 2, 3, 4, 5, 6, 7, > > > + 16, 17, 18, 19, 20, 21, 22, 23 }); > > > +} > > > + > > > +v16si > > > +foo_v16si (v16si x) > > > +{ > > > + return __builtin_shuffle (x, (v16si) { 0, 0, 0, 0, 0, 0, 
0, 0, > > > + 0, 0, 0, 0, 0, 0, 0, 0 }, > > > + (v16si) { 0, 1, 2, 3, 4, 5, 6, 7, > > > + 16, 17, 18, 19, 20, 21, 22, 23 }); > > > +} > > > + > > > +v32hi > > > +foo_v32hi (v32hi x) > > > +{ > > > + return __builtin_shuffle (x, (v32hi) { 0, 0, 0, 0, 0, 0, 0, 0, > > > + 0, 0, 0, 0, 0, 0, 0, 0, > > > + 0, 0, 0, 0, 0, 0, 0, 0, > > > + 0, 0, 0, 0, 0, 0, 0, 0 }, > > > + (v32hi) { 0, 1, 2, 3, 4, 5, 6, 7, > > > + 8, 9, 10, 11, 12, 13, 14, 15, > > > + 32, 33, 34, 35, 36, 37, 38, 39, > > > + 40,41, 42, 43, 44, 45, 46, 47 }); > > > +} > > > + > > > +v64qi > > > +foo_v64qi (v64qi x) > > > +{ > > > + return __builtin_shuffle (x, (v64qi) { 0, 0, 0, 0, 0, 0, 0, 0, > > > + 0, 0, 0, 0, 0, 0, 0, 0, > > > + 0, 0, 0, 0, 0, 0, 0, 0, > > > + 0, 0, 0, 0, 0, 0, 0, 0, > > > + 0, 0, 0, 0, 0, 0, 0, 0, > > > + 0, 0, 0, 0, 0, 0, 0, 0, > > > + 0, 0, 0, 0, 0, 0, 0, 0, > > > + 0, 0, 0, 0, 0, 0, 0, 0 }, > > > + (v64qi) {0, 1, 2, 3, 4, 5, 6, 7, > > > + 8, 9, 10, 11, 12, 13, 14, 15, > > > + 16, 17, 18, 19, 20, 21, 22, 23, > > > + 24, 25, 26, 27, 28, 29, 30, 31, > > > + 64, 65, 66, 67, 68, 69, 70, 71, > > > + 72, 73, 74, 75, 76, 77, 78, 79, > > > + 80, 81, 82, 83, 84, 85, 86, 87, > > > + 88, 89, 90, 91, 92, 93, 94, 95 }); > > > > Can't you randomize a little bit at least some of these too? > > > > Done. > > > Also, what happens with __builtin_shuffle (zero_vector, x, ...) (i.e. when > > you swap the two vectors and adjust correspondingly the permutation)? > > Will it be also recognized or do we just punt on those? > when building gimple, vec_perm(0, x, sel) is simplified to vec_perm(x, > 0, sel*)(, with adjustment of selector), since arg0 is a constant. > Not sure if rtl phase would do same simplification, anyway I add > testcases for __builtin_shuffle (zero_vector, x, ...), but not extend > pre-reload splitters to handle (vec_select: (vec_concat: const0_rtx > op1) selector). > > > > Jakub > > > > > -- > BR, > Hongtao -- BR, Hongtao ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680] 2021-04-25 6:57 ` Hongtao Liu 2021-05-12 7:30 ` Hongtao Liu @ 2021-05-12 14:19 ` Jakub Jelinek 2021-05-13 0:44 ` Hongtao Liu 1 sibling, 1 reply; 9+ messages in thread From: Jakub Jelinek @ 2021-05-12 14:19 UTC (permalink / raw) To: Hongtao Liu; +Cc: GCC Patches On Sun, Apr 25, 2021 at 02:57:08PM +0800, Hongtao Liu via Gcc-patches wrote: > gcc/ChangeLog: > > PR target/94680 > * config/i386/sse.md (ssedoublevecmode): Add attribute for > V64QI/V32HI/V16SI/V4DI. > (ssehalfvecmode): Add attribute for V2DI/V2DF. > (*vec_concatv4si_0): Extend to VI124_128. > (*vec_concat<mode>_0): New pre-reload splitter. > * config/i386/predicates.md (movq_parallel): New predicate. > > gcc/testsuite/ChangeLog: > > PR target/94680 > * gcc.target/i386/avx-pr94680.c: New test. > * gcc.target/i386/avx512f-pr94680.c: New test. > * gcc.target/i386/sse2-pr94680.c: New test. Ok, thanks. Sorry for the delay. Jakub ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680] 2021-05-12 14:19 ` Jakub Jelinek @ 2021-05-13 0:44 ` Hongtao Liu 2021-05-13 5:52 ` Hongtao Liu 0 siblings, 1 reply; 9+ messages in thread From: Hongtao Liu @ 2021-05-13 0:44 UTC (permalink / raw) To: Jakub Jelinek; +Cc: GCC Patches On Wed, May 12, 2021 at 10:19 PM Jakub Jelinek <jakub@redhat.com> wrote: > > On Sun, Apr 25, 2021 at 02:57:08PM +0800, Hongtao Liu via Gcc-patches wrote: > > gcc/ChangeLog: > > > > PR target/94680 > > * config/i386/sse.md (ssedoublevecmode): Add attribute for > > V64QI/V32HI/V16SI/V4DI. > > (ssehalfvecmode): Add attribute for V2DI/V2DF. > > (*vec_concatv4si_0): Extend to VI124_128. > > (*vec_concat<mode>_0): New pre-reload splitter. > > * config/i386/predicates.md (movq_parallel): New predicate. > > > > gcc/testsuite/ChangeLog: > > > > PR target/94680 > > * gcc.target/i386/avx-pr94680.c: New test. > > * gcc.target/i386/avx512f-pr94680.c: New test. > > * gcc.target/i386/sse2-pr94680.c: New test. > > Ok, thanks. Sorry for the delay. Thanks for the review. > > Jakub > -- BR, Hongtao ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680] 2021-05-13 0:44 ` Hongtao Liu @ 2021-05-13 5:52 ` Hongtao Liu 0 siblings, 0 replies; 9+ messages in thread From: Hongtao Liu @ 2021-05-13 5:52 UTC (permalink / raw) To: Jakub Jelinek; +Cc: GCC Patches There's a typo in the testcase, I've committed the patch as an obvious fix. Fix typo in testcase. gcc/testsuite/ChangeLog: * gcc.target/i386/avx-pr94680.c: Fix typo in testcase. diff --git a/gcc/testsuite/gcc.target/i386/avx-pr94680.c b/gcc/testsuite/gcc.target/i386/avx-pr94680.c index a89e4967f64..cb5041b6af3 100644 --- a/gcc/testsuite/gcc.target/i386/avx-pr94680.c +++ b/gcc/testsuite/gcc.target/i386/avx-pr94680.c @@ -76,7 +76,7 @@ foo_v16hi_l (v16hi x) { return __builtin_shuffle ((v16hi) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, x, - (v16hi) { 16, 17, 18, 20, 21, 22, 23, + (v16hi) { 16, 17, 18, 19, 20, 21, 22, 23, 15, 0, 13, 2, 11, 4, 9, 6 }); } -- BR, Hongtao ^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH] i386: Fix up *vec_concat<mode>_0_1 [PR101007] 2021-04-23 4:53 [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680] Hongtao Liu 2021-04-23 9:13 ` Jakub Jelinek @ 2021-06-11 8:59 ` Jakub Jelinek 2021-06-11 9:34 ` Uros Bizjak 1 sibling, 1 reply; 9+ messages in thread From: Jakub Jelinek @ 2021-06-11 8:59 UTC (permalink / raw) To: Uros Bizjak, Hongtao Liu; +Cc: GCC Patches On Fri, Apr 23, 2021 at 12:53:58PM +0800, Hongtao Liu via Gcc-patches wrote: > -(define_insn "*vec_concatv4si_0" > - [(set (match_operand:V4SI 0 "register_operand" "=v,x") > - (vec_concat:V4SI > - (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y") > - (match_operand:V2SI 2 "const0_operand" " C,C")))] > +(define_insn "*vec_concat<mode>_0" > + [(set (match_operand:VI124_128 0 "register_operand" "=v,x") > + (vec_concat:VI124_128 > + (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm,?!*y") > + (match_operand:<ssehalfvecmode> 2 "const0_operand" " C,C")))] > "TARGET_SSE2" > "@ > %vmovq\t{%1, %0|%0, %1} > @@ -22154,6 +22157,24 @@ (define_insn "avx_vec_concat<mode>" > (set_attr "prefix" "maybe_evex") > (set_attr "mode" "<sseinsnmode>")]) > > +(define_insn_and_split "*vec_concat<mode>_0" > + [(set (match_operand:V 0 "register_operand") > + (vec_select:V > + (vec_concat:<ssedoublevecmode> > + (match_operand:V 1 "nonimmediate_operand") > + (match_operand:V 2 "const0_operand")) > + (match_parallel 3 "movq_parallel" > + [(match_operand 4 "const_int_operand")])))] > + "ix86_pre_reload_split ()" > + "#" > + "&& 1" > + [(set (match_dup 0) > + (vec_concat:V (match_dup 1) (match_dup 5)))] > +{ > + operands[1] = gen_lowpart (<ssehalfvecmode>mode, operands[1]); > + operands[5] = CONST0_RTX (<ssehalfvecmode>mode); > +}) This regressed the following testcase with -msse -mno-sse2. 
The define_insn_and_split splits the permutation into *vec_concat<mode>_0 or *vec_concatv2di_0 insns which both have TARGET_SSE2 in their conditions (for the former you can see it above), but the define_insn_and_split matches always when the V mode's condition do, which for V16QI/V8HI/V4SI/V2DI/V4SF modes is always (well, when those modes are valid, which is TARGET_SSE). Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? 2021-06-11 Jakub Jelinek <jakub@redhat.com> PR target/101007 * config/i386/sse.md (*vec_concat<mode>_0_1): Require TARGET_SSE2. * gcc.target/i386/sse-pr101007.c: New test. --- gcc/config/i386/sse.md.jj 2021-06-07 09:24:57.706689972 +0200 +++ gcc/config/i386/sse.md 2021-06-10 11:14:52.407588679 +0200 @@ -22395,7 +22395,7 @@ (define_insn_and_split "*vec_concat<mode (match_operand:V 2 "const0_operand")) (match_parallel 3 "movq_parallel" [(match_operand 4 "const_int_operand")])))] - "ix86_pre_reload_split ()" + "TARGET_SSE2 && ix86_pre_reload_split ()" "#" "&& 1" [(set (match_dup 0) --- gcc/testsuite/gcc.target/i386/sse-pr101007.c.jj 2021-06-10 11:41:25.818609527 +0200 +++ gcc/testsuite/gcc.target/i386/sse-pr101007.c 2021-06-10 11:38:39.301910017 +0200 @@ -0,0 +1,14 @@ +/* PR target/101007 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse2" } */ + +typedef unsigned __attribute__((__vector_size__ (8))) U; +typedef unsigned __attribute__((__vector_size__ (16))) V; +V v; +U *p; + +void +foo (void) +{ + *p = (U) __builtin_shufflevector ((V)(0 == (V){} >= 0), v, 4, 2); +} Jakub ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] i386: Fix up *vec_concat<mode>_0_1 [PR101007] 2021-06-11 8:59 ` [PATCH] i386: Fix up *vec_concat<mode>_0_1 [PR101007] Jakub Jelinek @ 2021-06-11 9:34 ` Uros Bizjak 0 siblings, 0 replies; 9+ messages in thread From: Uros Bizjak @ 2021-06-11 9:34 UTC (permalink / raw) To: Jakub Jelinek; +Cc: Hongtao Liu, GCC Patches On Fri, Jun 11, 2021 at 10:59 AM Jakub Jelinek <jakub@redhat.com> wrote: > > On Fri, Apr 23, 2021 at 12:53:58PM +0800, Hongtao Liu via Gcc-patches wrote: > > -(define_insn "*vec_concatv4si_0" > > - [(set (match_operand:V4SI 0 "register_operand" "=v,x") > > - (vec_concat:V4SI > > - (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y") > > - (match_operand:V2SI 2 "const0_operand" " C,C")))] > > +(define_insn "*vec_concat<mode>_0" > > + [(set (match_operand:VI124_128 0 "register_operand" "=v,x") > > + (vec_concat:VI124_128 > > + (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm,?!*y") > > + (match_operand:<ssehalfvecmode> 2 "const0_operand" " C,C")))] > > "TARGET_SSE2" > > "@ > > %vmovq\t{%1, %0|%0, %1} > > @@ -22154,6 +22157,24 @@ (define_insn "avx_vec_concat<mode>" > > (set_attr "prefix" "maybe_evex") > > (set_attr "mode" "<sseinsnmode>")]) > > > > +(define_insn_and_split "*vec_concat<mode>_0" > > + [(set (match_operand:V 0 "register_operand") > > + (vec_select:V > > + (vec_concat:<ssedoublevecmode> > > + (match_operand:V 1 "nonimmediate_operand") > > + (match_operand:V 2 "const0_operand")) > > + (match_parallel 3 "movq_parallel" > > + [(match_operand 4 "const_int_operand")])))] > > + "ix86_pre_reload_split ()" > > + "#" > > + "&& 1" > > + [(set (match_dup 0) > > + (vec_concat:V (match_dup 1) (match_dup 5)))] > > +{ > > + operands[1] = gen_lowpart (<ssehalfvecmode>mode, operands[1]); > > + operands[5] = CONST0_RTX (<ssehalfvecmode>mode); > > +}) > > This regressed the following testcase with -msse -mno-sse2. 
> The define_insn_and_split splits the permutation into *vec_concat<mode>_0 > or *vec_concatv2di_0 insns which both have TARGET_SSE2 in their > conditions (for the former you can see it above), but the > define_insn_and_split matches always when the V mode's condition do, > which for V16QI/V8HI/V4SI/V2DI/V4SF modes is always (well, when those > modes are valid, which is TARGET_SSE). > > Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux, > ok for trunk? > > 2021-06-11 Jakub Jelinek <jakub@redhat.com> > > PR target/101007 > * config/i386/sse.md (*vec_concat<mode>_0_1): Require TARGET_SSE2. > > * gcc.target/i386/sse-pr101007.c: New test. OK, even as obvious patch. Thanks, Uros. > --- gcc/config/i386/sse.md.jj 2021-06-07 09:24:57.706689972 +0200 > +++ gcc/config/i386/sse.md 2021-06-10 11:14:52.407588679 +0200 > @@ -22395,7 +22395,7 @@ (define_insn_and_split "*vec_concat<mode > (match_operand:V 2 "const0_operand")) > (match_parallel 3 "movq_parallel" > [(match_operand 4 "const_int_operand")])))] > - "ix86_pre_reload_split ()" > + "TARGET_SSE2 && ix86_pre_reload_split ()" > "#" > "&& 1" > [(set (match_dup 0) > --- gcc/testsuite/gcc.target/i386/sse-pr101007.c.jj 2021-06-10 11:41:25.818609527 +0200 > +++ gcc/testsuite/gcc.target/i386/sse-pr101007.c 2021-06-10 11:38:39.301910017 +0200 > @@ -0,0 +1,14 @@ > +/* PR target/101007 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -msse -mno-sse2" } */ > + > +typedef unsigned __attribute__((__vector_size__ (8))) U; > +typedef unsigned __attribute__((__vector_size__ (16))) V; > +V v; > +U *p; > + > +void > +foo (void) > +{ > + *p = (U) __builtin_shufflevector ((V)(0 == (V){} >= 0), v, 4, 2); > +} > > Jakub > ^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2021-06-11 9:34 UTC | newest] Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2021-04-23 4:53 [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680] Hongtao Liu 2021-04-23 9:13 ` Jakub Jelinek 2021-04-25 6:57 ` Hongtao Liu 2021-05-12 7:30 ` Hongtao Liu 2021-05-12 14:19 ` Jakub Jelinek 2021-05-13 0:44 ` Hongtao Liu 2021-05-13 5:52 ` Hongtao Liu 2021-06-11 8:59 ` [PATCH] i386: Fix up *vec_concat<mode>_0_1 [PR101007] Jakub Jelinek 2021-06-11 9:34 ` Uros Bizjak
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).