* [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680]
@ 2021-04-23 4:53 Hongtao Liu
2021-04-23 9:13 ` Jakub Jelinek
2021-06-11 8:59 ` [PATCH] i386: Fix up *vec_concat<mode>_0_1 [PR101007] Jakub Jelinek
0 siblings, 2 replies; 9+ messages in thread
From: Hongtao Liu @ 2021-04-23 4:53 UTC (permalink / raw)
To: GCC Patches; +Cc: Jakub Jelinek
[-- Attachment #1: Type: text/plain, Size: 1169 bytes --]
Hi:
If the second operand of __builtin_shuffle is a constant zero vector and the
mask keeps the low half of the first operand and fills the high half from the
zero vector, the shuffle can be optimized to movq/vmovaps, i.e.:
foo128:
- vxorps %xmm1, %xmm1, %xmm1
- vmovlhps %xmm1, %xmm0, %xmm0
+ vmovq %xmm0, %xmm0
foo256:
- vxorps %xmm1, %xmm1, %xmm1
- vshuff32x4 $0, %ymm1, %ymm0, %ymm0
+ vmovaps %xmm0, %xmm0
foo512:
- vxorps %xmm1, %xmm1, %xmm1
- vshuff32x4 $68, %zmm1, %zmm0, %zmm0
+ vmovaps %ymm0, %ymm0
Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
Ok for trunk?
gcc/ChangeLog:
PR target/94680
* config/i386/sse.md (ssedoublevecmode): Add attribute for
V64QI/V32HI/V16SI/V8DI/V16SF/V8DF.
(ssehalfvecmode): Add attribute for V2DI/V2DF.
(*vec_concatv4si_0): Extend to VI124_128.
(*vec_concat<mode>_0): New pre-reload splitter.
* config/i386/predicates.md (movq_parallel): New predicate.
gcc/testsuite/ChangeLog:
PR target/94680
* gcc.target/i386/avx-pr94680.c: New test.
* gcc.target/i386/avx512f-pr94680.c: New test.
* gcc.target/i386/sse2-pr94680.c: New test.
--
BR,
Hongtao
[-- Attachment #2: 0001-i386-Optimize-__builtin_shuffle-when-it-s-used-to-ze.patch --]
[-- Type: text/x-patch, Size: 12065 bytes --]
From eec5469cdeecf0e6650e9d2963dea4117919c5d2 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Thu, 22 Apr 2021 15:33:16 +0800
Subject: [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the
upper bits of the dest. [PR target/94680]
If the second operand of __builtin_shuffle is a constant zero vector and the
mask keeps the low half of the first operand and fills the high half from the
zero vector, the shuffle can be optimized to movq/vmovaps, i.e.:
foo128:
- vxorps %xmm1, %xmm1, %xmm1
- vmovlhps %xmm1, %xmm0, %xmm0
+ vmovq %xmm0, %xmm0
foo256:
- vxorps %xmm1, %xmm1, %xmm1
- vshuff32x4 $0, %ymm1, %ymm0, %ymm0
+ vmovaps %xmm0, %xmm0
foo512:
- vxorps %xmm1, %xmm1, %xmm1
- vshuff32x4 $68, %zmm1, %zmm0, %zmm0
+ vmovaps %ymm0, %ymm0
gcc/ChangeLog:
PR target/94680
* config/i386/sse.md (ssedoublevecmode): Add attribute for
V64QI/V32HI/V16SI/V8DI/V16SF/V8DF.
(ssehalfvecmode): Add attribute for V2DI/V2DF.
(*vec_concatv4si_0): Extend to VI124_128.
(*vec_concat<mode>_0): New pre-reload splitter.
* config/i386/predicates.md (movq_parallel): New predicate.
gcc/testsuite/ChangeLog:
PR target/94680
* gcc.target/i386/avx-pr94680.c: New test.
* gcc.target/i386/avx512f-pr94680.c: New test.
* gcc.target/i386/sse2-pr94680.c: New test.
---
gcc/config/i386/predicates.md | 33 ++++++++
gcc/config/i386/sse.md | 37 +++++++--
gcc/testsuite/gcc.target/i386/avx-pr94680.c | 59 ++++++++++++++
.../gcc.target/i386/avx512f-pr94680.c | 78 +++++++++++++++++++
gcc/testsuite/gcc.target/i386/sse2-pr94680.c | 51 ++++++++++++
5 files changed, 250 insertions(+), 8 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/avx-pr94680.c
create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-pr94680.c
create mode 100644 gcc/testsuite/gcc.target/i386/sse2-pr94680.c
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index b1df8548af6..4b706003ed8 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1524,6 +1524,39 @@ (define_predicate "misaligned_operand"
(and (match_code "mem")
(match_test "MEM_ALIGN (op) < GET_MODE_BITSIZE (mode)")))
+;; Return true if OP is a parallel for an mov{d,q,dqa,ps,pd} vec_select,
+;; where one of the two operands of the vec_concat is const0_operand.
+(define_predicate "movq_parallel"
+ (match_code "parallel")
+{
+ unsigned nelt = XVECLEN (op, 0);
+ unsigned nelt2 = nelt >> 1;
+ unsigned i;
+
+ if (nelt < 2)
+ return false;
+
+ /* Validate that all of the elements are constants,
+ the lower half of the permute is the lower half of the first operand,
+ and the upper half of the permute comes from anywhere in the second operand. */
+ for (i = 0; i < nelt; ++i)
+ {
+ rtx er = XVECEXP (op, 0, i);
+ unsigned HOST_WIDE_INT ei;
+
+ if (!CONST_INT_P (er))
+ return 0;
+ ei = INTVAL (er);
+ if (i < nelt2 && ei != i)
+ return 0;
+ if (i >= nelt2
+ && (ei < nelt || ei >= nelt<<1))
+ return 0;
+ }
+
+ return 1;
+})
+
;; Return true if OP is a vzeroall operation, known to be a PARALLEL.
(define_predicate "vzeroall_operation"
(match_code "parallel")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 9d3728d1cb0..b55636a3e12 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -812,19 +812,22 @@ (define_mode_attr sseintvecmodelower
;; Mapping of vector modes to a vector mode of double size
(define_mode_attr ssedoublevecmode
- [(V32QI "V64QI") (V16HI "V32HI") (V8SI "V16SI") (V4DI "V8DI")
+ [(V64QI "V128QI") (V32HI "V64HI") (V16SI "V32SI") (V8DI "V16DI")
+ (V32QI "V64QI") (V16HI "V32HI") (V8SI "V16SI") (V4DI "V8DI")
(V16QI "V32QI") (V8HI "V16HI") (V4SI "V8SI") (V2DI "V4DI")
+ (V16SF "V32SF") (V8DF "V16DF")
(V8SF "V16SF") (V4DF "V8DF")
(V4SF "V8SF") (V2DF "V4DF")])
;; Mapping of vector modes to a vector mode of half size
+;; Instead of V1DI/V1DF, the scalar modes DI/DF are used as the half of V2DI/V2DF.
(define_mode_attr ssehalfvecmode
[(V64QI "V32QI") (V32HI "V16HI") (V16SI "V8SI") (V8DI "V4DI") (V4TI "V2TI")
(V32QI "V16QI") (V16HI "V8HI") (V8SI "V4SI") (V4DI "V2DI")
- (V16QI "V8QI") (V8HI "V4HI") (V4SI "V2SI")
+ (V16QI "V8QI") (V8HI "V4HI") (V4SI "V2SI") (V2DI "DI")
(V16SF "V8SF") (V8DF "V4DF")
(V8SF "V4SF") (V4DF "V2DF")
- (V4SF "V2SF")])
+ (V4SF "V2SF") (V2DF "DF")])
(define_mode_attr ssehalfvecmodelower
[(V64QI "v32qi") (V32HI "v16hi") (V16SI "v8si") (V8DI "v4di") (V4TI "v2ti")
@@ -15964,11 +15967,11 @@ (define_insn "*vec_concatv4si"
(set_attr "prefix" "orig,maybe_evex,orig,orig,maybe_evex")
(set_attr "mode" "TI,TI,V4SF,V2SF,V2SF")])
-(define_insn "*vec_concatv4si_0"
- [(set (match_operand:V4SI 0 "register_operand" "=v,x")
- (vec_concat:V4SI
- (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y")
- (match_operand:V2SI 2 "const0_operand" " C,C")))]
+(define_insn "*vec_concat<mode>_0"
+ [(set (match_operand:VI124_128 0 "register_operand" "=v,x")
+ (vec_concat:VI124_128
+ (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm,?!*y")
+ (match_operand:<ssehalfvecmode> 2 "const0_operand" " C,C")))]
"TARGET_SSE2"
"@
%vmovq\t{%1, %0|%0, %1}
@@ -22154,6 +22157,24 @@ (define_insn "avx_vec_concat<mode>"
(set_attr "prefix" "maybe_evex")
(set_attr "mode" "<sseinsnmode>")])
+(define_insn_and_split "*vec_concat<mode>_0"
+ [(set (match_operand:V 0 "register_operand")
+ (vec_select:V
+ (vec_concat:<ssedoublevecmode>
+ (match_operand:V 1 "nonimmediate_operand")
+ (match_operand:V 2 "const0_operand"))
+ (match_parallel 3 "movq_parallel"
+ [(match_operand 4 "const_int_operand")])))]
+ "ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (vec_concat:V (match_dup 1) (match_dup 5)))]
+{
+ operands[1] = gen_lowpart (<ssehalfvecmode>mode, operands[1]);
+ operands[5] = CONST0_RTX (<ssehalfvecmode>mode);
+})
+
(define_insn "vcvtph2ps<mask_name>"
[(set (match_operand:V4SF 0 "register_operand" "=v")
(vec_select:V4SF
diff --git a/gcc/testsuite/gcc.target/i386/avx-pr94680.c b/gcc/testsuite/gcc.target/i386/avx-pr94680.c
new file mode 100644
index 00000000000..4fe0f5bede6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-pr94680.c
@@ -0,0 +1,59 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx -mno-avx512f -O2" } */
+/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%xmm[0-9]} 6 } } */
+/* { dg-final { scan-assembler-not "pxor" } } */
+
+typedef float v8sf __attribute__((vector_size(32)));
+typedef double v4df __attribute__ ((vector_size (32)));
+typedef long long v4di __attribute__((vector_size(32)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef short v16hi __attribute__ ((vector_size (32)));
+typedef char v32qi __attribute__ ((vector_size (32)));
+
+v4df
+foo_v4df (v4df x)
+{
+ return __builtin_shuffle (x, (v4df) { 0, 0, 0, 0 }, (v4di) { 0, 1, 4, 5 });
+}
+
+v4di
+foo_v4di (v4di x)
+{
+ return __builtin_shuffle (x, (v4di) { 0, 0, 0, 0 }, (v4di) { 0, 1, 4, 7 });
+}
+
+v8sf
+foo_v8sf (v8sf x)
+{
+ return __builtin_shuffle (x, (v8sf) { 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v8si) { 0, 1, 2, 3, 8, 9, 10, 11 });
+}
+
+v8si
+foo_v8si (v8si x)
+{
+ return __builtin_shuffle (x, (v8si) { 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v8si) { 0, 1, 2, 3, 13, 12, 11, 15 });
+}
+
+v16hi
+foo_v16hi (v16hi x)
+{
+ return __builtin_shuffle (x, (v16hi) { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v16hi) { 0, 1, 2, 3, 4, 5, 6, 7,
+ 24, 17, 26, 19, 28, 21, 30, 23 });
+ }
+
+v32qi
+foo_v32qi (v32qi x)
+{
+ return __builtin_shuffle (x, (v32qi) { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v32qi) { 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 32, 49, 34, 58, 36, 53, 38, 39,
+ 40, 60, 42, 43, 63, 45, 46, 47 });
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c b/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c
new file mode 100644
index 00000000000..442b79da420
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c
@@ -0,0 +1,78 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mavx512vbmi -O2" } */
+/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%ymm[0-9]} 6} } */
+/* { dg-final { scan-assembler-not "pxor" } } */
+
+
+typedef float v16sf __attribute__((vector_size(64)));
+typedef double v8df __attribute__ ((vector_size (64)));
+typedef long long v8di __attribute__((vector_size(64)));
+typedef int v16si __attribute__((vector_size(64)));
+typedef short v32hi __attribute__ ((vector_size (64)));
+typedef char v64qi __attribute__ ((vector_size (64)));
+
+v8df
+foo_v8df (v8df x)
+{
+ return __builtin_shuffle (x, (v8df) { 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v8di) { 0, 1, 2, 3, 8, 9, 10, 11 });
+}
+
+v8di
+foo_v8di (v8di x)
+{
+ return __builtin_shuffle (x, (v8di) { 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v8di) { 0, 1, 2, 3, 8, 9, 10, 11 });
+}
+
+v16sf
+foo_v16sf (v16sf x)
+{
+ return __builtin_shuffle (x, (v16sf) { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v16si) { 0, 1, 2, 3, 4, 5, 6, 7,
+ 16, 17, 18, 19, 20, 21, 22, 23 });
+}
+
+v16si
+foo_v16si (v16si x)
+{
+ return __builtin_shuffle (x, (v16si) { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v16si) { 0, 1, 2, 3, 4, 5, 6, 7,
+ 16, 17, 18, 19, 20, 21, 22, 23 });
+}
+
+v32hi
+foo_v32hi (v32hi x)
+{
+ return __builtin_shuffle (x, (v32hi) { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v32hi) { 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40,41, 42, 43, 44, 45, 46, 47 });
+}
+
+v64qi
+foo_v64qi (v64qi x)
+{
+ return __builtin_shuffle (x, (v64qi) { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v64qi) {0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 64, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87,
+ 88, 89, 90, 91, 92, 93, 94, 95 });
+}
diff --git a/gcc/testsuite/gcc.target/i386/sse2-pr94680.c b/gcc/testsuite/gcc.target/i386/sse2-pr94680.c
new file mode 100644
index 00000000000..7f4f98e3b1b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-pr94680.c
@@ -0,0 +1,51 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -mno-sse4.1 -O2" } */
+/* { dg-final { scan-assembler-times {(?n)mov.*%xmm[0-9]} 6} } */
+/* { dg-final { scan-assembler-not "pxor" } } */
+
+typedef float v4sf __attribute__((vector_size(16)));
+typedef double v2df __attribute__ ((vector_size (16)));
+typedef long long v2di __attribute__((vector_size(16)));
+typedef int v4si __attribute__((vector_size(16)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+typedef char v16qi __attribute__ ((vector_size (16)));
+
+v2df
+foo_v2df (v2df x)
+{
+ return __builtin_shuffle (x, (v2df) { 0, 0 }, (v2di) {0, 2});
+}
+
+v2di
+foo_v2di (v2di x)
+{
+ return __builtin_shuffle (x, (v2di) { 0, 0 }, (v2di) {0, 3});
+}
+
+v4sf
+foo_v4sf (v4sf x)
+{
+ return __builtin_shuffle (x, (v4sf) { 0, 0, 0, 0 }, (v4si) {0, 1, 4, 5});
+}
+
+v4si
+foo_v4si (v4si x)
+{
+ return __builtin_shuffle (x, (v4si) { 0, 0, 0, 0 }, (v4si) {0, 1, 6, 7});
+}
+
+v8hi
+foo_v8hi (v8hi x)
+{
+ return __builtin_shuffle (x, (v8hi) { 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v8hi) { 0, 1, 2, 3, 8, 12, 10, 13 });
+}
+
+v16qi
+foo_v16qi (v16qi x)
+{
+ return __builtin_shuffle (x, (v16qi) { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v16qi) {0, 1, 2, 3, 4, 5, 6, 7,
+ 16, 24, 18, 26, 20, 28, 22, 30 });
+}
--
2.18.1
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680]
2021-04-23 4:53 [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680] Hongtao Liu
@ 2021-04-23 9:13 ` Jakub Jelinek
2021-04-25 6:57 ` Hongtao Liu
2021-06-11 8:59 ` [PATCH] i386: Fix up *vec_concat<mode>_0_1 [PR101007] Jakub Jelinek
1 sibling, 1 reply; 9+ messages in thread
From: Jakub Jelinek @ 2021-04-23 9:13 UTC (permalink / raw)
To: Hongtao Liu; +Cc: GCC Patches
On Fri, Apr 23, 2021 at 12:53:58PM +0800, Hongtao Liu via Gcc-patches wrote:
> + if (!CONST_INT_P (er))
> + return 0;
> + ei = INTVAL (er);
> + if (i < nelt2 && ei != i)
> + return 0;
> + if (i >= nelt2
> + && (ei < nelt || ei >= nelt<<1))
Formatting:
1) you have spaces followed by tab, remove the spaces; but,
if (i >= nelt2 && (ei < nelt || ei >= nelt<<1))
fits on one line, so keep it on one line.
2) nelt<<1 should be nelt << 1 with spaces around the <<
> -(define_insn "*vec_concatv4si_0"
> - [(set (match_operand:V4SI 0 "register_operand" "=v,x")
> - (vec_concat:V4SI
> - (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y")
> - (match_operand:V2SI 2 "const0_operand" " C,C")))]
> +(define_insn "*vec_concat<mode>_0"
> + [(set (match_operand:VI124_128 0 "register_operand" "=v,x")
> + (vec_concat:VI124_128
> + (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm,?!*y")
> + (match_operand:<ssehalfvecmode> 2 "const0_operand" " C,C")))]
> "TARGET_SSE2"
> "@
> %vmovq\t{%1, %0|%0, %1}
> @@ -22154,6 +22157,24 @@ (define_insn "avx_vec_concat<mode>"
> (set_attr "prefix" "maybe_evex")
> (set_attr "mode" "<sseinsnmode>")])
>
> +(define_insn_and_split "*vec_concat<mode>_0"
Would be better to use a different pattern name, *vec_concat<mode>_0
is already used in the above define_insn.
Use some additional suffix after _0?
> + return __builtin_shuffle (x, (v32qi) { 0, 0, 0, 0, 0, 0, 0, 0,
> + 0, 0, 0, 0, 0, 0, 0, 0,
> + 0, 0, 0, 0, 0, 0, 0, 0,
> + 0, 0, 0, 0, 0, 0, 0, 0 },
> + (v32qi) { 0, 1, 2, 3, 4, 5, 6, 7,
> + 8, 9, 10, 11, 12, 13, 14, 15,
> + 32, 49, 34, 58, 36, 53, 38, 39,
> + 40, 60, 42, 43, 63, 45, 46, 47 });
In this testcase the shuffles in the part taking indexes from the zero
vector are nicely randomized.
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c
> @@ -0,0 +1,78 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512bw -mavx512vbmi -O2" } */
> +/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%ymm[0-9]} 6} } */
> +/* { dg-final { scan-assembler-not "pxor" } } */
> +
> +
> +typedef float v16sf __attribute__((vector_size(64)));
> +typedef double v8df __attribute__ ((vector_size (64)));
> +typedef long long v8di __attribute__((vector_size(64)));
> +typedef int v16si __attribute__((vector_size(64)));
> +typedef short v32hi __attribute__ ((vector_size (64)));
> +typedef char v64qi __attribute__ ((vector_size (64)));
> +
> +v8df
> +foo_v8df (v8df x)
> +{
> + return __builtin_shuffle (x, (v8df) { 0, 0, 0, 0, 0, 0, 0, 0 },
> + (v8di) { 0, 1, 2, 3, 8, 9, 10, 11 });
> +}
> +
> +v8di
> +foo_v8di (v8di x)
> +{
> + return __builtin_shuffle (x, (v8di) { 0, 0, 0, 0, 0, 0, 0, 0 },
> + (v8di) { 0, 1, 2, 3, 8, 9, 10, 11 });
> +}
> +
> +v16sf
> +foo_v16sf (v16sf x)
> +{
> + return __builtin_shuffle (x, (v16sf) { 0, 0, 0, 0, 0, 0, 0, 0,
> + 0, 0, 0, 0, 0, 0, 0, 0 },
> + (v16si) { 0, 1, 2, 3, 4, 5, 6, 7,
> + 16, 17, 18, 19, 20, 21, 22, 23 });
> +}
> +
> +v16si
> +foo_v16si (v16si x)
> +{
> + return __builtin_shuffle (x, (v16si) { 0, 0, 0, 0, 0, 0, 0, 0,
> + 0, 0, 0, 0, 0, 0, 0, 0 },
> + (v16si) { 0, 1, 2, 3, 4, 5, 6, 7,
> + 16, 17, 18, 19, 20, 21, 22, 23 });
> +}
> +
> +v32hi
> +foo_v32hi (v32hi x)
> +{
> + return __builtin_shuffle (x, (v32hi) { 0, 0, 0, 0, 0, 0, 0, 0,
> + 0, 0, 0, 0, 0, 0, 0, 0,
> + 0, 0, 0, 0, 0, 0, 0, 0,
> + 0, 0, 0, 0, 0, 0, 0, 0 },
> + (v32hi) { 0, 1, 2, 3, 4, 5, 6, 7,
> + 8, 9, 10, 11, 12, 13, 14, 15,
> + 32, 33, 34, 35, 36, 37, 38, 39,
> + 40,41, 42, 43, 44, 45, 46, 47 });
> +}
> +
> +v64qi
> +foo_v64qi (v64qi x)
> +{
> + return __builtin_shuffle (x, (v64qi) { 0, 0, 0, 0, 0, 0, 0, 0,
> + 0, 0, 0, 0, 0, 0, 0, 0,
> + 0, 0, 0, 0, 0, 0, 0, 0,
> + 0, 0, 0, 0, 0, 0, 0, 0,
> + 0, 0, 0, 0, 0, 0, 0, 0,
> + 0, 0, 0, 0, 0, 0, 0, 0,
> + 0, 0, 0, 0, 0, 0, 0, 0,
> + 0, 0, 0, 0, 0, 0, 0, 0 },
> + (v64qi) {0, 1, 2, 3, 4, 5, 6, 7,
> + 8, 9, 10, 11, 12, 13, 14, 15,
> + 16, 17, 18, 19, 20, 21, 22, 23,
> + 24, 25, 26, 27, 28, 29, 30, 31,
> + 64, 65, 66, 67, 68, 69, 70, 71,
> + 72, 73, 74, 75, 76, 77, 78, 79,
> + 80, 81, 82, 83, 84, 85, 86, 87,
> + 88, 89, 90, 91, 92, 93, 94, 95 });
Can't you randomize a little bit at least some of these too?
Also, what happens with __builtin_shuffle (zero_vector, x, ...) (i.e. when
you swap the two vectors and adjust correspondingly the permutation)?
Will it be also recognized or do we just punt on those?
Jakub
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680]
2021-04-23 9:13 ` Jakub Jelinek
@ 2021-04-25 6:57 ` Hongtao Liu
2021-05-12 7:30 ` Hongtao Liu
2021-05-12 14:19 ` Jakub Jelinek
0 siblings, 2 replies; 9+ messages in thread
From: Hongtao Liu @ 2021-04-25 6:57 UTC (permalink / raw)
To: Jakub Jelinek; +Cc: GCC Patches
[-- Attachment #1: Type: text/plain, Size: 6586 bytes --]
On Fri, Apr 23, 2021 at 5:13 PM Jakub Jelinek <jakub@redhat.com> wrote:
>
> On Fri, Apr 23, 2021 at 12:53:58PM +0800, Hongtao Liu via Gcc-patches wrote:
> > + if (!CONST_INT_P (er))
> > + return 0;
> > + ei = INTVAL (er);
> > + if (i < nelt2 && ei != i)
> > + return 0;
> > + if (i >= nelt2
> > + && (ei < nelt || ei >= nelt<<1))
>
> Formatting:
> 1) you have spaces followed by tab, remove the spaces; but,
> if (i >= nelt2 && (ei < nelt || ei >= nelt<<1))
> fits on one line, so keep it on one line.
> 2) nelt<<1 should be nelt << 1 with spaces around the <<
>
Done.
> > -(define_insn "*vec_concatv4si_0"
> > - [(set (match_operand:V4SI 0 "register_operand" "=v,x")
> > - (vec_concat:V4SI
> > - (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y")
> > - (match_operand:V2SI 2 "const0_operand" " C,C")))]
> > +(define_insn "*vec_concat<mode>_0"
> > + [(set (match_operand:VI124_128 0 "register_operand" "=v,x")
> > + (vec_concat:VI124_128
> > + (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm,?!*y")
> > + (match_operand:<ssehalfvecmode> 2 "const0_operand" " C,C")))]
> > "TARGET_SSE2"
> > "@
> > %vmovq\t{%1, %0|%0, %1}
> > @@ -22154,6 +22157,24 @@ (define_insn "avx_vec_concat<mode>"
> > (set_attr "prefix" "maybe_evex")
> > (set_attr "mode" "<sseinsnmode>")])
> >
> > +(define_insn_and_split "*vec_concat<mode>_0"
>
> Would be better to use a different pattern name, *vec_concat<mode>_0
> is already used in the above define_insn.
> Use some additional suffix after _0?
>
Changed to "*vec_concat<mode>_0_1"
> > + return __builtin_shuffle (x, (v32qi) { 0, 0, 0, 0, 0, 0, 0, 0,
> > + 0, 0, 0, 0, 0, 0, 0, 0,
> > + 0, 0, 0, 0, 0, 0, 0, 0,
> > + 0, 0, 0, 0, 0, 0, 0, 0 },
> > + (v32qi) { 0, 1, 2, 3, 4, 5, 6, 7,
> > + 8, 9, 10, 11, 12, 13, 14, 15,
> > + 32, 49, 34, 58, 36, 53, 38, 39,
> > + 40, 60, 42, 43, 63, 45, 46, 47 });
>
> In this testcase the shuffles in the part taking indexes from the zero
> vector are nicely randomized.
>
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c
> > @@ -0,0 +1,78 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx512bw -mavx512vbmi -O2" } */
> > +/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%ymm[0-9]} 6} } */
> > +/* { dg-final { scan-assembler-not "pxor" } } */
> > +
> > +
> > +typedef float v16sf __attribute__((vector_size(64)));
> > +typedef double v8df __attribute__ ((vector_size (64)));
> > +typedef long long v8di __attribute__((vector_size(64)));
> > +typedef int v16si __attribute__((vector_size(64)));
> > +typedef short v32hi __attribute__ ((vector_size (64)));
> > +typedef char v64qi __attribute__ ((vector_size (64)));
> > +
> > +v8df
> > +foo_v8df (v8df x)
> > +{
> > + return __builtin_shuffle (x, (v8df) { 0, 0, 0, 0, 0, 0, 0, 0 },
> > + (v8di) { 0, 1, 2, 3, 8, 9, 10, 11 });
> > +}
> > +
> > +v8di
> > +foo_v8di (v8di x)
> > +{
> > + return __builtin_shuffle (x, (v8di) { 0, 0, 0, 0, 0, 0, 0, 0 },
> > + (v8di) { 0, 1, 2, 3, 8, 9, 10, 11 });
> > +}
> > +
> > +v16sf
> > +foo_v16sf (v16sf x)
> > +{
> > + return __builtin_shuffle (x, (v16sf) { 0, 0, 0, 0, 0, 0, 0, 0,
> > + 0, 0, 0, 0, 0, 0, 0, 0 },
> > + (v16si) { 0, 1, 2, 3, 4, 5, 6, 7,
> > + 16, 17, 18, 19, 20, 21, 22, 23 });
> > +}
> > +
> > +v16si
> > +foo_v16si (v16si x)
> > +{
> > + return __builtin_shuffle (x, (v16si) { 0, 0, 0, 0, 0, 0, 0, 0,
> > + 0, 0, 0, 0, 0, 0, 0, 0 },
> > + (v16si) { 0, 1, 2, 3, 4, 5, 6, 7,
> > + 16, 17, 18, 19, 20, 21, 22, 23 });
> > +}
> > +
> > +v32hi
> > +foo_v32hi (v32hi x)
> > +{
> > + return __builtin_shuffle (x, (v32hi) { 0, 0, 0, 0, 0, 0, 0, 0,
> > + 0, 0, 0, 0, 0, 0, 0, 0,
> > + 0, 0, 0, 0, 0, 0, 0, 0,
> > + 0, 0, 0, 0, 0, 0, 0, 0 },
> > + (v32hi) { 0, 1, 2, 3, 4, 5, 6, 7,
> > + 8, 9, 10, 11, 12, 13, 14, 15,
> > + 32, 33, 34, 35, 36, 37, 38, 39,
> > + 40,41, 42, 43, 44, 45, 46, 47 });
> > +}
> > +
> > +v64qi
> > +foo_v64qi (v64qi x)
> > +{
> > + return __builtin_shuffle (x, (v64qi) { 0, 0, 0, 0, 0, 0, 0, 0,
> > + 0, 0, 0, 0, 0, 0, 0, 0,
> > + 0, 0, 0, 0, 0, 0, 0, 0,
> > + 0, 0, 0, 0, 0, 0, 0, 0,
> > + 0, 0, 0, 0, 0, 0, 0, 0,
> > + 0, 0, 0, 0, 0, 0, 0, 0,
> > + 0, 0, 0, 0, 0, 0, 0, 0,
> > + 0, 0, 0, 0, 0, 0, 0, 0 },
> > + (v64qi) {0, 1, 2, 3, 4, 5, 6, 7,
> > + 8, 9, 10, 11, 12, 13, 14, 15,
> > + 16, 17, 18, 19, 20, 21, 22, 23,
> > + 24, 25, 26, 27, 28, 29, 30, 31,
> > + 64, 65, 66, 67, 68, 69, 70, 71,
> > + 72, 73, 74, 75, 76, 77, 78, 79,
> > + 80, 81, 82, 83, 84, 85, 86, 87,
> > + 88, 89, 90, 91, 92, 93, 94, 95 });
>
> Can't you randomize a little bit at least some of these too?
>
Done.
> Also, what happens with __builtin_shuffle (zero_vector, x, ...) (i.e. when
> you swap the two vectors and adjust correspondingly the permutation)?
> Will it be also recognized or do we just punt on those?
When building GIMPLE, vec_perm (0, x, sel) is simplified to vec_perm (x,
0, sel*) (with the selector adjusted accordingly), since arg0 is a constant.
I'm not sure whether the RTL phase does the same simplification; anyway, I
added testcases for __builtin_shuffle (zero_vector, x, ...), but did not extend
the pre-reload splitter to handle (vec_select (vec_concat const0_rtx op1)
selector).
>
> Jakub
>
--
BR,
Hongtao
[-- Attachment #2: 0001-i386-Optimize-__builtin_shuffle-when-it-s-used-to-ze.patch --]
[-- Type: text/x-patch, Size: 15877 bytes --]
From fb72db5a751708f3d13027c3c279fbf60e1e1aaf Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Thu, 22 Apr 2021 15:33:16 +0800
Subject: [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the
upper bits of the dest. [PR target/94680]
If the second operand of __builtin_shuffle is a constant zero vector and the
mask keeps the low half of the first operand and fills the high half from the
zero vector, the shuffle can be optimized to movq/vmovaps, i.e.:
foo128:
- vxorps %xmm1, %xmm1, %xmm1
- vmovlhps %xmm1, %xmm0, %xmm0
+ vmovq %xmm0, %xmm0
foo256:
- vxorps %xmm1, %xmm1, %xmm1
- vshuff32x4 $0, %ymm1, %ymm0, %ymm0
+ vmovaps %xmm0, %xmm0
foo512:
- vxorps %xmm1, %xmm1, %xmm1
- vshuff32x4 $68, %zmm1, %zmm0, %zmm0
+ vmovaps %ymm0, %ymm0
gcc/ChangeLog:
PR target/94680
* config/i386/sse.md (ssedoublevecmode): Add attribute for
V64QI/V32HI/V16SI/V8DI/V16SF/V8DF.
(ssehalfvecmode): Add attribute for V2DI/V2DF.
(*vec_concatv4si_0): Extend to VI124_128.
(*vec_concat<mode>_0_1): New pre-reload splitter.
* config/i386/predicates.md (movq_parallel): New predicate.
gcc/testsuite/ChangeLog:
PR target/94680
* gcc.target/i386/avx-pr94680.c: New test.
* gcc.target/i386/avx512f-pr94680.c: New test.
* gcc.target/i386/sse2-pr94680.c: New test.
---
gcc/config/i386/predicates.md | 32 ++++
gcc/config/i386/sse.md | 37 ++++-
gcc/testsuite/gcc.target/i386/avx-pr94680.c | 107 +++++++++++++
.../gcc.target/i386/avx512f-pr94680.c | 144 ++++++++++++++++++
gcc/testsuite/gcc.target/i386/sse2-pr94680.c | 91 +++++++++++
5 files changed, 403 insertions(+), 8 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/avx-pr94680.c
create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-pr94680.c
create mode 100644 gcc/testsuite/gcc.target/i386/sse2-pr94680.c
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index b1df8548af6..201aacd65e6 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1524,6 +1524,38 @@ (define_predicate "misaligned_operand"
(and (match_code "mem")
(match_test "MEM_ALIGN (op) < GET_MODE_BITSIZE (mode)")))
+;; Return true if OP is a parallel for an mov{d,q,dqa,ps,pd} vec_select,
+;; where one of the two operands of the vec_concat is const0_operand.
+(define_predicate "movq_parallel"
+ (match_code "parallel")
+{
+ unsigned nelt = XVECLEN (op, 0);
+ unsigned nelt2 = nelt >> 1;
+ unsigned i;
+
+ if (nelt < 2)
+ return false;
+
+ /* Validate that all of the elements are constants,
+ the lower half of the permute is the lower half of the first operand,
+ and the upper half of the permute comes from anywhere in the second operand. */
+ for (i = 0; i < nelt; ++i)
+ {
+ rtx er = XVECEXP (op, 0, i);
+ unsigned HOST_WIDE_INT ei;
+
+ if (!CONST_INT_P (er))
+ return 0;
+ ei = INTVAL (er);
+ if (i < nelt2 && ei != i)
+ return 0;
+ if (i >= nelt2 && (ei < nelt || ei >= nelt << 1))
+ return 0;
+ }
+
+ return 1;
+})
+
;; Return true if OP is a vzeroall operation, known to be a PARALLEL.
(define_predicate "vzeroall_operation"
(match_code "parallel")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 9d3728d1cb0..369ff2b9f28 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -812,19 +812,22 @@ (define_mode_attr sseintvecmodelower
;; Mapping of vector modes to a vector mode of double size
(define_mode_attr ssedoublevecmode
- [(V32QI "V64QI") (V16HI "V32HI") (V8SI "V16SI") (V4DI "V8DI")
+ [(V64QI "V128QI") (V32HI "V64HI") (V16SI "V32SI") (V8DI "V16DI")
+ (V32QI "V64QI") (V16HI "V32HI") (V8SI "V16SI") (V4DI "V8DI")
(V16QI "V32QI") (V8HI "V16HI") (V4SI "V8SI") (V2DI "V4DI")
+ (V16SF "V32SF") (V8DF "V16DF")
(V8SF "V16SF") (V4DF "V8DF")
(V4SF "V8SF") (V2DF "V4DF")])
;; Mapping of vector modes to a vector mode of half size
+;; Instead of V1DI/V1DF, the scalar modes DI/DF are used as the half of V2DI/V2DF.
(define_mode_attr ssehalfvecmode
[(V64QI "V32QI") (V32HI "V16HI") (V16SI "V8SI") (V8DI "V4DI") (V4TI "V2TI")
(V32QI "V16QI") (V16HI "V8HI") (V8SI "V4SI") (V4DI "V2DI")
- (V16QI "V8QI") (V8HI "V4HI") (V4SI "V2SI")
+ (V16QI "V8QI") (V8HI "V4HI") (V4SI "V2SI") (V2DI "DI")
(V16SF "V8SF") (V8DF "V4DF")
(V8SF "V4SF") (V4DF "V2DF")
- (V4SF "V2SF")])
+ (V4SF "V2SF") (V2DF "DF")])
(define_mode_attr ssehalfvecmodelower
[(V64QI "v32qi") (V32HI "v16hi") (V16SI "v8si") (V8DI "v4di") (V4TI "v2ti")
@@ -15964,11 +15967,11 @@ (define_insn "*vec_concatv4si"
(set_attr "prefix" "orig,maybe_evex,orig,orig,maybe_evex")
(set_attr "mode" "TI,TI,V4SF,V2SF,V2SF")])
-(define_insn "*vec_concatv4si_0"
- [(set (match_operand:V4SI 0 "register_operand" "=v,x")
- (vec_concat:V4SI
- (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y")
- (match_operand:V2SI 2 "const0_operand" " C,C")))]
+(define_insn "*vec_concat<mode>_0"
+ [(set (match_operand:VI124_128 0 "register_operand" "=v,x")
+ (vec_concat:VI124_128
+ (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm,?!*y")
+ (match_operand:<ssehalfvecmode> 2 "const0_operand" " C,C")))]
"TARGET_SSE2"
"@
%vmovq\t{%1, %0|%0, %1}
@@ -22154,6 +22157,24 @@ (define_insn "avx_vec_concat<mode>"
(set_attr "prefix" "maybe_evex")
(set_attr "mode" "<sseinsnmode>")])
+(define_insn_and_split "*vec_concat<mode>_0_1"
+ [(set (match_operand:V 0 "register_operand")
+ (vec_select:V
+ (vec_concat:<ssedoublevecmode>
+ (match_operand:V 1 "nonimmediate_operand")
+ (match_operand:V 2 "const0_operand"))
+ (match_parallel 3 "movq_parallel"
+ [(match_operand 4 "const_int_operand")])))]
+ "ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (vec_concat:V (match_dup 1) (match_dup 5)))]
+{
+ operands[1] = gen_lowpart (<ssehalfvecmode>mode, operands[1]);
+ operands[5] = CONST0_RTX (<ssehalfvecmode>mode);
+})
+
(define_insn "vcvtph2ps<mask_name>"
[(set (match_operand:V4SF 0 "register_operand" "=v")
(vec_select:V4SF
diff --git a/gcc/testsuite/gcc.target/i386/avx-pr94680.c b/gcc/testsuite/gcc.target/i386/avx-pr94680.c
new file mode 100644
index 00000000000..a89e4967f64
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-pr94680.c
@@ -0,0 +1,107 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx -mno-avx512f -O2" } */
+/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%xmm[0-9]} 12 } } */
+/* { dg-final { scan-assembler-not "pxor" } } */
+
+typedef float v8sf __attribute__((vector_size(32)));
+typedef double v4df __attribute__ ((vector_size (32)));
+typedef long long v4di __attribute__((vector_size(32)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef short v16hi __attribute__ ((vector_size (32)));
+typedef char v32qi __attribute__ ((vector_size (32)));
+
+v4df
+foo_v4df (v4df x)
+{
+ return __builtin_shuffle (x, (v4df) { 0, 0, 0, 0 }, (v4di) { 0, 1, 4, 5 });
+}
+
+v4df
+foo_v4df_l (v4df x)
+{
+ return __builtin_shuffle ((v4df) { 0, 0, 0, 0 }, x, (v4di) { 4, 5, 1, 2 });
+}
+
+v4di
+foo_v4di (v4di x)
+{
+ return __builtin_shuffle (x, (v4di) { 0, 0, 0, 0 }, (v4di) { 0, 1, 4, 7 });
+}
+
+v4di
+foo_v4di_l (v4di x)
+{
+ return __builtin_shuffle ((v4di) { 0, 0, 0, 0 }, x, (v4di) { 4, 5, 3, 1 });
+}
+
+v8sf
+foo_v8sf (v8sf x)
+{
+ return __builtin_shuffle ((v8sf) { 0, 0, 0, 0, 0, 0, 0, 0 }, x,
+ (v8si) { 8, 9, 10, 11, 0, 1, 2, 3 });
+}
+
+v8sf
+foo_v8sf_l (v8sf x)
+{
+ return __builtin_shuffle (x, (v8sf) { 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v8si) { 0, 1, 2, 3, 8, 9, 10, 11 });
+}
+
+v8si
+foo_v8si (v8si x)
+{
+ return __builtin_shuffle (x, (v8si) { 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v8si) { 0, 1, 2, 3, 13, 12, 11, 15 });
+}
+
+v8si
+foo_v8si_l (v8si x)
+{
+ return __builtin_shuffle ((v8si) { 0, 0, 0, 0, 0, 0, 0, 0 }, x,
+ (v8si) { 8, 9, 10, 11, 7, 6, 5, 4 });
+}
+
+v16hi
+foo_v16hi (v16hi x)
+{
+ return __builtin_shuffle (x, (v16hi) { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v16hi) { 0, 1, 2, 3, 4, 5, 6, 7,
+ 24, 17, 26, 19, 28, 21, 30, 23 });
+}
+
+v16hi
+foo_v16hi_l (v16hi x)
+{
+ return __builtin_shuffle ((v16hi) { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 }, x,
+ (v16hi) { 16, 17, 18, 20, 21, 22, 23,
+ 15, 0, 13, 2, 11, 4, 9, 6 });
+}
+
+v32qi
+foo_v32qi (v32qi x)
+{
+ return __builtin_shuffle (x, (v32qi) { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v32qi) { 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 32, 49, 34, 58, 36, 53, 38, 39,
+ 40, 60, 42, 43, 63, 45, 46, 47 });
+}
+
+v32qi
+foo_v32qi_l (v32qi x)
+{
+ return __builtin_shuffle ((v32qi) { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 }, x,
+ (v32qi) { 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 31, 0, 29, 2, 27, 4, 25, 6,
+ 23, 8, 21, 10, 19, 12, 17, 14 });
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c b/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c
new file mode 100644
index 00000000000..c27431aae72
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c
@@ -0,0 +1,144 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mavx512vbmi -O2" } */
+/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%ymm[0-9]} 12} } */
+/* { dg-final { scan-assembler-not "pxor" } } */
+
+
+typedef float v16sf __attribute__((vector_size(64)));
+typedef double v8df __attribute__ ((vector_size (64)));
+typedef long long v8di __attribute__((vector_size(64)));
+typedef int v16si __attribute__((vector_size(64)));
+typedef short v32hi __attribute__ ((vector_size (64)));
+typedef char v64qi __attribute__ ((vector_size (64)));
+
+v8df
+foo_v8df (v8df x)
+{
+ return __builtin_shuffle (x, (v8df) { 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v8di) { 0, 1, 2, 3, 15, 14, 10, 11 });
+}
+
+v8df
+foo_v8df_l (v8df x)
+{
+ return __builtin_shuffle ((v8df) { 0, 0, 0, 0, 0, 0, 0, 0 }, x,
+ (v8di) { 8, 9, 10, 11, 0, 1, 2, 3 });
+}
+
+v8di
+foo_v8di (v8di x)
+{
+ return __builtin_shuffle (x, (v8di) { 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v8di) { 0, 1, 2, 3, 8, 9, 10, 11 });
+}
+
+v8di
+foo_v8di_l (v8di x)
+{
+ return __builtin_shuffle ((v8di) { 0, 0, 0, 0, 0, 0, 0, 0 }, x,
+ (v8di) { 8, 9, 10, 11, 7, 6, 5, 4 });
+}
+
+v16sf
+foo_v16sf (v16sf x)
+{
+ return __builtin_shuffle (x, (v16sf) { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v16si) { 0, 1, 2, 3, 4, 5, 6, 7,
+ 16, 17, 18, 19, 20, 21, 22, 23 });
+}
+
+v16sf
+foo_v16sf_l (v16sf x)
+{
+ return __builtin_shuffle ((v16sf) { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 }, x,
+ (v16si) { 16, 17, 18, 19, 20, 21, 22, 23,
+ 0, 15, 2, 13, 4, 11, 6, 9 });
+}
+
+v16si
+foo_v16si (v16si x)
+{
+ return __builtin_shuffle (x, (v16si) { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v16si) { 0, 1, 2, 3, 4, 5, 6, 7,
+ 31, 30, 29, 28, 20, 21, 22, 23 });
+}
+
+v16si
+foo_v16si_l (v16si x)
+{
+ return __builtin_shuffle ((v16si) { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 }, x,
+ (v16si) { 16, 17, 18, 19, 20, 21, 22, 23,
+ 15, 0, 13, 2, 11, 4, 9, 6 });
+}
+
+v32hi
+foo_v32hi (v32hi x)
+{
+ return __builtin_shuffle (x, (v32hi) { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v32hi) { 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 63, 33, 61, 35, 59, 37, 57, 39,
+ 55, 41, 53, 43, 51, 45, 49, 47 });
+}
+
+v32hi
+foo_v32hi_l (v32hi x)
+{
+ return __builtin_shuffle ((v32hi) { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 }, x,
+ (v32hi) { 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 31, 0, 29, 2, 27, 4, 25, 6,
+ 23, 8, 21, 10, 19, 12, 17, 14 });
+}
+
+v64qi
+foo_v64qi (v64qi x)
+{
+ return __builtin_shuffle (x, (v64qi) { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v64qi) {0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 64, 127, 66, 125, 68, 123, 70, 121,
+ 72, 119, 74, 117, 76, 115, 78, 113,
+ 80, 111, 82, 109, 84, 107, 86, 105,
+ 88, 103, 90, 101, 92, 99, 94, 97 });
+}
+
+v64qi
+foo_v64qi_l (v64qi x)
+{
+ return __builtin_shuffle ((v64qi) { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 }, x,
+ (v64qi) { 64, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87,
+ 88, 89, 90, 91, 92, 93, 94, 95,
+ 0, 63, 2, 61, 4, 59, 6, 57,
+ 8, 55, 10, 53, 12, 51, 14, 49,
+ 16, 47, 18, 45, 20, 43, 22, 41,
+ 24, 39, 26, 37, 28, 35, 30, 33 });
+}
diff --git a/gcc/testsuite/gcc.target/i386/sse2-pr94680.c b/gcc/testsuite/gcc.target/i386/sse2-pr94680.c
new file mode 100644
index 00000000000..7e0ff9f6bc7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-pr94680.c
@@ -0,0 +1,91 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -mno-sse4.1 -O2" } */
+/* { dg-final { scan-assembler-times {(?n)(?:mov|psrldq).*%xmm[0-9]} 12 } } */
+/* { dg-final { scan-assembler-not "pxor" } } */
+
+typedef float v4sf __attribute__((vector_size(16)));
+typedef double v2df __attribute__ ((vector_size (16)));
+typedef long long v2di __attribute__((vector_size(16)));
+typedef int v4si __attribute__((vector_size(16)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+typedef char v16qi __attribute__ ((vector_size (16)));
+
+v2df
+foo_v2df (v2df x)
+{
+ return __builtin_shuffle (x, (v2df) { 0, 0 }, (v2di) {0, 2});
+}
+
+v2df
+foo_v2df_l (v2df x)
+{
+ return __builtin_shuffle ((v2df) { 0, 0 }, x, (v2di) {3, 1});
+}
+
+v2di
+foo_v2di (v2di x)
+{
+ return __builtin_shuffle (x, (v2di) { 0, 0 }, (v2di) {0, 3});
+}
+
+v2di
+foo_v2di_l (v2di x)
+{
+ return __builtin_shuffle ((v2di) { 0, 0 }, x, (v2di) {3, 0});
+}
+
+v4sf
+foo_v4sf (v4sf x)
+{
+ return __builtin_shuffle (x, (v4sf) { 0, 0, 0, 0 }, (v4si) {0, 1, 4, 5});
+}
+
+v4sf
+foo_v4sf_l (v4sf x)
+{
+ return __builtin_shuffle ((v4sf) { 0, 0, 0, 0 }, x, (v4si) {4, 5, 3, 1});
+}
+
+v4si
+foo_v4si (v4si x)
+{
+ return __builtin_shuffle (x, (v4si) { 0, 0, 0, 0 }, (v4si) {0, 1, 6, 7});
+}
+
+v4si
+foo_v4si_l (v4si x)
+{
+ return __builtin_shuffle ((v4si) { 0, 0, 0, 0 }, x, (v4si) {4, 5, 1, 2});
+}
+
+v8hi
+foo_v8hi (v8hi x)
+{
+ return __builtin_shuffle (x, (v8hi) { 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v8hi) { 0, 1, 2, 3, 8, 12, 10, 13 });
+}
+
+v8hi
+foo_v8hi_l (v8hi x)
+{
+ return __builtin_shuffle ((v8hi) { 0, 0, 0, 0, 0, 0, 0, 0 }, x,
+ (v8hi) { 8, 9, 10, 11, 7, 6, 5, 4 });
+}
+
+v16qi
+foo_v16qi (v16qi x)
+{
+ return __builtin_shuffle (x, (v16qi) { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v16qi) {0, 1, 2, 3, 4, 5, 6, 7,
+ 16, 24, 18, 26, 20, 28, 22, 30 });
+}
+
+v16qi
+foo_v16qi_l (v16qi x)
+{
+ return __builtin_shuffle ((v16qi) { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 }, x,
+ (v16qi) { 16, 17, 18, 19, 20, 21, 22, 23,
+ 15, 0, 13, 2, 11, 4, 9, 6 });
+}
--
2.18.1
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680]
2021-04-25 6:57 ` Hongtao Liu
@ 2021-05-12 7:30 ` Hongtao Liu
2021-05-12 14:19 ` Jakub Jelinek
1 sibling, 0 replies; 9+ messages in thread
From: Hongtao Liu @ 2021-05-12 7:30 UTC (permalink / raw)
To: Jakub Jelinek; +Cc: GCC Patches
ping.
On Sun, Apr 25, 2021 at 2:57 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Fri, Apr 23, 2021 at 5:13 PM Jakub Jelinek <jakub@redhat.com> wrote:
> >
> > On Fri, Apr 23, 2021 at 12:53:58PM +0800, Hongtao Liu via Gcc-patches wrote:
> > > + if (!CONST_INT_P (er))
> > > + return 0;
> > > + ei = INTVAL (er);
> > > + if (i < nelt2 && ei != i)
> > > + return 0;
> > > + if (i >= nelt2
> > > + && (ei < nelt || ei >= nelt<<1))
> >
> > Formatting:
> > 1) you have spaces followed by tab, remove the spaces; but,
> > if (i >= nelt2 && (ei < nelt || ei >= nelt<<1))
> > fits on one line, so keep it on one line.
> > 2) nelt<<1 should be nelt << 1 with spaces around the <<
> >
>
> Done.
>
> > > -(define_insn "*vec_concatv4si_0"
> > > - [(set (match_operand:V4SI 0 "register_operand" "=v,x")
> > > - (vec_concat:V4SI
> > > - (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y")
> > > - (match_operand:V2SI 2 "const0_operand" " C,C")))]
> > > +(define_insn "*vec_concat<mode>_0"
> > > + [(set (match_operand:VI124_128 0 "register_operand" "=v,x")
> > > + (vec_concat:VI124_128
> > > + (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm,?!*y")
> > > + (match_operand:<ssehalfvecmode> 2 "const0_operand" " C,C")))]
> > > "TARGET_SSE2"
> > > "@
> > > %vmovq\t{%1, %0|%0, %1}
> > > @@ -22154,6 +22157,24 @@ (define_insn "avx_vec_concat<mode>"
> > > (set_attr "prefix" "maybe_evex")
> > > (set_attr "mode" "<sseinsnmode>")])
> > >
> > > +(define_insn_and_split "*vec_concat<mode>_0"
> >
> > Would be better to use a different pattern name, *vec_concat<mode>_0
> > is already used in the above define_insn.
> > Use some additional suffix after _0?
> >
>
> Changed to "*vec_concat<mode>_0_1"
>
> > > + return __builtin_shuffle (x, (v32qi) { 0, 0, 0, 0, 0, 0, 0, 0,
> > > + 0, 0, 0, 0, 0, 0, 0, 0,
> > > + 0, 0, 0, 0, 0, 0, 0, 0,
> > > + 0, 0, 0, 0, 0, 0, 0, 0 },
> > > + (v32qi) { 0, 1, 2, 3, 4, 5, 6, 7,
> > > + 8, 9, 10, 11, 12, 13, 14, 15,
> > > + 32, 49, 34, 58, 36, 53, 38, 39,
> > > + 40, 60, 42, 43, 63, 45, 46, 47 });
> >
> > In this testcase the shuffles in the part taking indexes from the zero
> > vector are nicely randomized.
> >
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c
> > > @@ -0,0 +1,78 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-mavx512bw -mavx512vbmi -O2" } */
> > > +/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%ymm[0-9]} 6} } */
> > > +/* { dg-final { scan-assembler-not "pxor" } } */
> > > +
> > > +
> > > +typedef float v16sf __attribute__((vector_size(64)));
> > > +typedef double v8df __attribute__ ((vector_size (64)));
> > > +typedef long long v8di __attribute__((vector_size(64)));
> > > +typedef int v16si __attribute__((vector_size(64)));
> > > +typedef short v32hi __attribute__ ((vector_size (64)));
> > > +typedef char v64qi __attribute__ ((vector_size (64)));
> > > +
> > > +v8df
> > > +foo_v8df (v8df x)
> > > +{
> > > + return __builtin_shuffle (x, (v8df) { 0, 0, 0, 0, 0, 0, 0, 0 },
> > > + (v8di) { 0, 1, 2, 3, 8, 9, 10, 11 });
> > > +}
> > > +
> > > +v8di
> > > +foo_v8di (v8di x)
> > > +{
> > > + return __builtin_shuffle (x, (v8di) { 0, 0, 0, 0, 0, 0, 0, 0 },
> > > + (v8di) { 0, 1, 2, 3, 8, 9, 10, 11 });
> > > +}
> > > +
> > > +v16sf
> > > +foo_v16sf (v16sf x)
> > > +{
> > > + return __builtin_shuffle (x, (v16sf) { 0, 0, 0, 0, 0, 0, 0, 0,
> > > + 0, 0, 0, 0, 0, 0, 0, 0 },
> > > + (v16si) { 0, 1, 2, 3, 4, 5, 6, 7,
> > > + 16, 17, 18, 19, 20, 21, 22, 23 });
> > > +}
> > > +
> > > +v16si
> > > +foo_v16si (v16si x)
> > > +{
> > > + return __builtin_shuffle (x, (v16si) { 0, 0, 0, 0, 0, 0, 0, 0,
> > > + 0, 0, 0, 0, 0, 0, 0, 0 },
> > > + (v16si) { 0, 1, 2, 3, 4, 5, 6, 7,
> > > + 16, 17, 18, 19, 20, 21, 22, 23 });
> > > +}
> > > +
> > > +v32hi
> > > +foo_v32hi (v32hi x)
> > > +{
> > > + return __builtin_shuffle (x, (v32hi) { 0, 0, 0, 0, 0, 0, 0, 0,
> > > + 0, 0, 0, 0, 0, 0, 0, 0,
> > > + 0, 0, 0, 0, 0, 0, 0, 0,
> > > + 0, 0, 0, 0, 0, 0, 0, 0 },
> > > + (v32hi) { 0, 1, 2, 3, 4, 5, 6, 7,
> > > + 8, 9, 10, 11, 12, 13, 14, 15,
> > > + 32, 33, 34, 35, 36, 37, 38, 39,
> > > + 40,41, 42, 43, 44, 45, 46, 47 });
> > > +}
> > > +
> > > +v64qi
> > > +foo_v64qi (v64qi x)
> > > +{
> > > + return __builtin_shuffle (x, (v64qi) { 0, 0, 0, 0, 0, 0, 0, 0,
> > > + 0, 0, 0, 0, 0, 0, 0, 0,
> > > + 0, 0, 0, 0, 0, 0, 0, 0,
> > > + 0, 0, 0, 0, 0, 0, 0, 0,
> > > + 0, 0, 0, 0, 0, 0, 0, 0,
> > > + 0, 0, 0, 0, 0, 0, 0, 0,
> > > + 0, 0, 0, 0, 0, 0, 0, 0,
> > > + 0, 0, 0, 0, 0, 0, 0, 0 },
> > > + (v64qi) {0, 1, 2, 3, 4, 5, 6, 7,
> > > + 8, 9, 10, 11, 12, 13, 14, 15,
> > > + 16, 17, 18, 19, 20, 21, 22, 23,
> > > + 24, 25, 26, 27, 28, 29, 30, 31,
> > > + 64, 65, 66, 67, 68, 69, 70, 71,
> > > + 72, 73, 74, 75, 76, 77, 78, 79,
> > > + 80, 81, 82, 83, 84, 85, 86, 87,
> > > + 88, 89, 90, 91, 92, 93, 94, 95 });
> >
> > Can't you randomize a little bit at least some of these too?
> >
>
> Done.
>
> > Also, what happens with __builtin_shuffle (zero_vector, x, ...) (i.e. when
> > you swap the two vectors and adjust correspondingly the permutation)?
> > Will it be also recognized or do we just punt on those?
>
> When building gimple, vec_perm(0, x, sel) is simplified to vec_perm(x,
> 0, sel*) (with the selector adjusted accordingly), since arg0 is a constant.
> I'm not sure whether the RTL phase does the same simplification; anyway, I added
> testcases for __builtin_shuffle (zero_vector, x, ...), but did not extend the
> pre-reload splitters to handle (vec_select: (vec_concat: const0_rtx
> op1) selector).
>
> >
> > Jakub
> >
>
>
> --
> BR,
> Hongtao
--
BR,
Hongtao
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680]
2021-04-25 6:57 ` Hongtao Liu
2021-05-12 7:30 ` Hongtao Liu
@ 2021-05-12 14:19 ` Jakub Jelinek
2021-05-13 0:44 ` Hongtao Liu
1 sibling, 1 reply; 9+ messages in thread
From: Jakub Jelinek @ 2021-05-12 14:19 UTC (permalink / raw)
To: Hongtao Liu; +Cc: GCC Patches
On Sun, Apr 25, 2021 at 02:57:08PM +0800, Hongtao Liu via Gcc-patches wrote:
> gcc/ChangeLog:
>
> PR target/94680
> * config/i386/sse.md (ssedoublevecmode): Add attribute for
> V64QI/V32HI/V16SI/V4DI.
> (ssehalfvecmode): Add attribute for V2DI/V2DF.
> (*vec_concatv4si_0): Extend to VI124_128.
> (*vec_concat<mode>_0): New pre-reload splitter.
> * config/i386/predicates.md (movq_parallel): New predicate.
>
> gcc/testsuite/ChangeLog:
>
> PR target/94680
> * gcc.target/i386/avx-pr94680.c: New test.
> * gcc.target/i386/avx512f-pr94680.c: New test.
> * gcc.target/i386/sse2-pr94680.c: New test.
Ok, thanks. Sorry for the delay.
Jakub
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680]
2021-05-12 14:19 ` Jakub Jelinek
@ 2021-05-13 0:44 ` Hongtao Liu
2021-05-13 5:52 ` Hongtao Liu
0 siblings, 1 reply; 9+ messages in thread
From: Hongtao Liu @ 2021-05-13 0:44 UTC (permalink / raw)
To: Jakub Jelinek; +Cc: GCC Patches
On Wed, May 12, 2021 at 10:19 PM Jakub Jelinek <jakub@redhat.com> wrote:
>
> On Sun, Apr 25, 2021 at 02:57:08PM +0800, Hongtao Liu via Gcc-patches wrote:
> > gcc/ChangeLog:
> >
> > PR target/94680
> > * config/i386/sse.md (ssedoublevecmode): Add attribute for
> > V64QI/V32HI/V16SI/V4DI.
> > (ssehalfvecmode): Add attribute for V2DI/V2DF.
> > (*vec_concatv4si_0): Extend to VI124_128.
> > (*vec_concat<mode>_0): New pre-reload splitter.
> > * config/i386/predicates.md (movq_parallel): New predicate.
> >
> > gcc/testsuite/ChangeLog:
> >
> > PR target/94680
> > * gcc.target/i386/avx-pr94680.c: New test.
> > * gcc.target/i386/avx512f-pr94680.c: New test.
> > * gcc.target/i386/sse2-pr94680.c: New test.
>
> Ok, thanks. Sorry for the delay.
Thanks for the review.
>
> Jakub
>
--
BR,
Hongtao
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680]
2021-05-13 0:44 ` Hongtao Liu
@ 2021-05-13 5:52 ` Hongtao Liu
0 siblings, 0 replies; 9+ messages in thread
From: Hongtao Liu @ 2021-05-13 5:52 UTC (permalink / raw)
To: Jakub Jelinek; +Cc: GCC Patches
There's a typo in the testcase (index 19 is missing from the foo_v16hi_l selector, leaving it with only 15 elements); I've committed the patch below as an obvious fix.
Fix typo in testcase.
gcc/testsuite/ChangeLog:
* gcc.target/i386/avx-pr94680.c: Fix typo in testcase.
diff --git a/gcc/testsuite/gcc.target/i386/avx-pr94680.c
b/gcc/testsuite/gcc.target/i386/avx-pr94680.c
index a89e4967f64..cb5041b6af3 100644
--- a/gcc/testsuite/gcc.target/i386/avx-pr94680.c
+++ b/gcc/testsuite/gcc.target/i386/avx-pr94680.c
@@ -76,7 +76,7 @@ foo_v16hi_l (v16hi x)
{
return __builtin_shuffle ((v16hi) { 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0 }, x,
- (v16hi) { 16, 17, 18, 20, 21, 22, 23,
+ (v16hi) { 16, 17, 18, 19, 20, 21, 22, 23,
15, 0, 13, 2, 11, 4, 9, 6 });
}
--
BR,
Hongtao
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH] i386: Fix up *vec_concat<mode>_0_1 [PR101007]
2021-04-23 4:53 [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680] Hongtao Liu
2021-04-23 9:13 ` Jakub Jelinek
@ 2021-06-11 8:59 ` Jakub Jelinek
2021-06-11 9:34 ` Uros Bizjak
1 sibling, 1 reply; 9+ messages in thread
From: Jakub Jelinek @ 2021-06-11 8:59 UTC (permalink / raw)
To: Uros Bizjak, Hongtao Liu; +Cc: GCC Patches
On Fri, Apr 23, 2021 at 12:53:58PM +0800, Hongtao Liu via Gcc-patches wrote:
> -(define_insn "*vec_concatv4si_0"
> - [(set (match_operand:V4SI 0 "register_operand" "=v,x")
> - (vec_concat:V4SI
> - (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y")
> - (match_operand:V2SI 2 "const0_operand" " C,C")))]
> +(define_insn "*vec_concat<mode>_0"
> + [(set (match_operand:VI124_128 0 "register_operand" "=v,x")
> + (vec_concat:VI124_128
> + (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm,?!*y")
> + (match_operand:<ssehalfvecmode> 2 "const0_operand" " C,C")))]
> "TARGET_SSE2"
> "@
> %vmovq\t{%1, %0|%0, %1}
> @@ -22154,6 +22157,24 @@ (define_insn "avx_vec_concat<mode>"
> (set_attr "prefix" "maybe_evex")
> (set_attr "mode" "<sseinsnmode>")])
>
> +(define_insn_and_split "*vec_concat<mode>_0"
> + [(set (match_operand:V 0 "register_operand")
> + (vec_select:V
> + (vec_concat:<ssedoublevecmode>
> + (match_operand:V 1 "nonimmediate_operand")
> + (match_operand:V 2 "const0_operand"))
> + (match_parallel 3 "movq_parallel"
> + [(match_operand 4 "const_int_operand")])))]
> + "ix86_pre_reload_split ()"
> + "#"
> + "&& 1"
> + [(set (match_dup 0)
> + (vec_concat:V (match_dup 1) (match_dup 5)))]
> +{
> + operands[1] = gen_lowpart (<ssehalfvecmode>mode, operands[1]);
> + operands[5] = CONST0_RTX (<ssehalfvecmode>mode);
> +})
This regressed the following testcase with -msse -mno-sse2.
The define_insn_and_split splits the permutation into *vec_concat<mode>_0
or *vec_concatv2di_0 insns which both have TARGET_SSE2 in their
conditions (for the former you can see it above), but the
define_insn_and_split always matches whenever the V mode's condition does,
which for V16QI/V8HI/V4SI/V2DI/V4SF modes is always (well, whenever those
modes are valid, i.e. with TARGET_SSE).
Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux,
ok for trunk?
2021-06-11 Jakub Jelinek <jakub@redhat.com>
PR target/101007
* config/i386/sse.md (*vec_concat<mode>_0_1): Require TARGET_SSE2.
* gcc.target/i386/sse-pr101007.c: New test.
--- gcc/config/i386/sse.md.jj 2021-06-07 09:24:57.706689972 +0200
+++ gcc/config/i386/sse.md 2021-06-10 11:14:52.407588679 +0200
@@ -22395,7 +22395,7 @@ (define_insn_and_split "*vec_concat<mode
(match_operand:V 2 "const0_operand"))
(match_parallel 3 "movq_parallel"
[(match_operand 4 "const_int_operand")])))]
- "ix86_pre_reload_split ()"
+ "TARGET_SSE2 && ix86_pre_reload_split ()"
"#"
"&& 1"
[(set (match_dup 0)
--- gcc/testsuite/gcc.target/i386/sse-pr101007.c.jj 2021-06-10 11:41:25.818609527 +0200
+++ gcc/testsuite/gcc.target/i386/sse-pr101007.c 2021-06-10 11:38:39.301910017 +0200
@@ -0,0 +1,14 @@
+/* PR target/101007 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse2" } */
+
+typedef unsigned __attribute__((__vector_size__ (8))) U;
+typedef unsigned __attribute__((__vector_size__ (16))) V;
+V v;
+U *p;
+
+void
+foo (void)
+{
+ *p = (U) __builtin_shufflevector ((V)(0 == (V){} >= 0), v, 4, 2);
+}
Jakub
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] i386: Fix up *vec_concat<mode>_0_1 [PR101007]
2021-06-11 8:59 ` [PATCH] i386: Fix up *vec_concat<mode>_0_1 [PR101007] Jakub Jelinek
@ 2021-06-11 9:34 ` Uros Bizjak
0 siblings, 0 replies; 9+ messages in thread
From: Uros Bizjak @ 2021-06-11 9:34 UTC (permalink / raw)
To: Jakub Jelinek; +Cc: Hongtao Liu, GCC Patches
On Fri, Jun 11, 2021 at 10:59 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> On Fri, Apr 23, 2021 at 12:53:58PM +0800, Hongtao Liu via Gcc-patches wrote:
> > -(define_insn "*vec_concatv4si_0"
> > - [(set (match_operand:V4SI 0 "register_operand" "=v,x")
> > - (vec_concat:V4SI
> > - (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y")
> > - (match_operand:V2SI 2 "const0_operand" " C,C")))]
> > +(define_insn "*vec_concat<mode>_0"
> > + [(set (match_operand:VI124_128 0 "register_operand" "=v,x")
> > + (vec_concat:VI124_128
> > + (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm,?!*y")
> > + (match_operand:<ssehalfvecmode> 2 "const0_operand" " C,C")))]
> > "TARGET_SSE2"
> > "@
> > %vmovq\t{%1, %0|%0, %1}
> > @@ -22154,6 +22157,24 @@ (define_insn "avx_vec_concat<mode>"
> > (set_attr "prefix" "maybe_evex")
> > (set_attr "mode" "<sseinsnmode>")])
> >
> > +(define_insn_and_split "*vec_concat<mode>_0"
> > + [(set (match_operand:V 0 "register_operand")
> > + (vec_select:V
> > + (vec_concat:<ssedoublevecmode>
> > + (match_operand:V 1 "nonimmediate_operand")
> > + (match_operand:V 2 "const0_operand"))
> > + (match_parallel 3 "movq_parallel"
> > + [(match_operand 4 "const_int_operand")])))]
> > + "ix86_pre_reload_split ()"
> > + "#"
> > + "&& 1"
> > + [(set (match_dup 0)
> > + (vec_concat:V (match_dup 1) (match_dup 5)))]
> > +{
> > + operands[1] = gen_lowpart (<ssehalfvecmode>mode, operands[1]);
> > + operands[5] = CONST0_RTX (<ssehalfvecmode>mode);
> > +})
>
> This regressed the following testcase with -msse -mno-sse2.
> The define_insn_and_split splits the permutation into *vec_concat<mode>_0
> or *vec_concatv2di_0 insns which both have TARGET_SSE2 in their
> conditions (for the former you can see it above), but the
> define_insn_and_split always matches whenever the V mode's condition does,
> which for V16QI/V8HI/V4SI/V2DI/V4SF modes is always (well, whenever those
> modes are valid, i.e. with TARGET_SSE).
>
> Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux,
> ok for trunk?
>
> 2021-06-11 Jakub Jelinek <jakub@redhat.com>
>
> PR target/101007
> * config/i386/sse.md (*vec_concat<mode>_0_1): Require TARGET_SSE2.
>
> * gcc.target/i386/sse-pr101007.c: New test.
OK, even as obvious patch.
Thanks,
Uros.
> --- gcc/config/i386/sse.md.jj 2021-06-07 09:24:57.706689972 +0200
> +++ gcc/config/i386/sse.md 2021-06-10 11:14:52.407588679 +0200
> @@ -22395,7 +22395,7 @@ (define_insn_and_split "*vec_concat<mode
> (match_operand:V 2 "const0_operand"))
> (match_parallel 3 "movq_parallel"
> [(match_operand 4 "const_int_operand")])))]
> - "ix86_pre_reload_split ()"
> + "TARGET_SSE2 && ix86_pre_reload_split ()"
> "#"
> "&& 1"
> [(set (match_dup 0)
> --- gcc/testsuite/gcc.target/i386/sse-pr101007.c.jj 2021-06-10 11:41:25.818609527 +0200
> +++ gcc/testsuite/gcc.target/i386/sse-pr101007.c 2021-06-10 11:38:39.301910017 +0200
> @@ -0,0 +1,14 @@
> +/* PR target/101007 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse -mno-sse2" } */
> +
> +typedef unsigned __attribute__((__vector_size__ (8))) U;
> +typedef unsigned __attribute__((__vector_size__ (16))) V;
> +V v;
> +U *p;
> +
> +void
> +foo (void)
> +{
> + *p = (U) __builtin_shufflevector ((V)(0 == (V){} >= 0), v, 4, 2);
> +}
>
> Jakub
>
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2021-06-11 9:34 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-23 4:53 [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680] Hongtao Liu
2021-04-23 9:13 ` Jakub Jelinek
2021-04-25 6:57 ` Hongtao Liu
2021-05-12 7:30 ` Hongtao Liu
2021-05-12 14:19 ` Jakub Jelinek
2021-05-13 0:44 ` Hongtao Liu
2021-05-13 5:52 ` Hongtao Liu
2021-06-11 8:59 ` [PATCH] i386: Fix up *vec_concat<mode>_0_1 [PR101007] Jakub Jelinek
2021-06-11 9:34 ` Uros Bizjak
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).