Subject: [gcc r12-2869] Combine avx_vec_concatv16si and avx512f_zero_extendv16hiv16si2_1 to avx512f_zero_extendv16hiv16si2_2
From: hongtao Liu
Date: 2021-08-12  6:03 UTC
To: gcc-cvs

https://gcc.gnu.org/g:95e1eca43d106d821720744ac6ff1f5df41a1e78

commit r12-2869-g95e1eca43d106d821720744ac6ff1f5df41a1e78
Author: liuhongt <hongtao.liu@intel.com>
Date:   Wed Aug 11 14:00:00 2021 +0800

    Combine avx_vec_concatv16si and avx512f_zero_extendv16hiv16si2_1 to avx512f_zero_extendv16hiv16si2_2.
    
    Add a define_insn_and_split to combine avx_vec_concatv16si/2 and
    avx512f_zero_extendv16hiv16si2_1, since the latter already zero-extends
    the upper bits; do the same for the other patterns related to
    pmovzx{bw,wd,dq}.
    
    It enables optimizations like the following, removing the redundant move:
    
    -       vmovdqa %ymm0, %ymm0    # 7     [c=4 l=6]  avx_vec_concatv16si/2
            vpmovzxwd       %ymm0, %zmm0    # 22    [c=4 l=6]  avx512f_zero_extendv16hiv16si2
            ret             # 25    [c=0 l=1]  simple_return_internal
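
    For example, foo_zxwd_512 from the new test below is one trigger:
    interleaving the elements of a V16HI with zeros is exactly a
    16-bit-to-32-bit zero extension, so after this patch it should compile
    to the single vpmovzxwd shown above:

    typedef short v16hi __attribute__((vector_size (32)));
    typedef short v32hi __attribute__((vector_size (64)));

    v32hi
    foo_zxwd_512 (v16hi x)
    {
      /* Interleaving x with a zero vector zero-extends each element.  */
      return __builtin_shufflevector (x, (v16hi) {},
                                      0, 16, 1, 17, 2, 18, 3, 19,
                                      4, 20, 5, 21, 6, 22, 7, 23,
                                      8, 24, 9, 25, 10, 26, 11, 27,
                                      12, 28, 13, 29, 14, 30, 15, 31);
    }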
    
    gcc/ChangeLog:
    
            PR target/101846
            * config/i386/sse.md (*avx2_zero_extendv16qiv16hi2_2): New
            post_reload define_insn_and_split.
            (*avx512bw_zero_extendv32qiv32hi2_2): Ditto.
            (*sse4_1_zero_extendv8qiv8hi2_4): Ditto.
            (*avx512f_zero_extendv16hiv16si2_2): Ditto.
            (*avx2_zero_extendv8hiv8si2_2): Ditto.
            (*sse4_1_zero_extendv4hiv4si2_4): Ditto.
            (*avx512f_zero_extendv8siv8di2_2): Ditto.
            (*avx2_zero_extendv4siv4di2_2): Ditto.
            (*sse4_1_zero_extendv2siv2di2_4): Ditto.
            (VI248_256, VI248_512, VI148_512, VI148_256, VI148_128): New
            mode iterators.
    
    gcc/testsuite/ChangeLog:
    
            PR target/101846
            * gcc.target/i386/pr101846-1.c: New test.

Diff:
---
 gcc/config/i386/sse.md                     | 219 +++++++++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr101846-1.c |  95 +++++++++++++
 2 files changed, 314 insertions(+)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 3957c86c3df..3a7bbaec7af 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -681,7 +681,12 @@
 (define_mode_iterator VI124_128 [V16QI V8HI V4SI])
 (define_mode_iterator VI24_128 [V8HI V4SI])
 (define_mode_iterator VI248_128 [V8HI V4SI V2DI])
+(define_mode_iterator VI248_256 [V16HI V8SI V4DI])
+(define_mode_iterator VI248_512 [V32HI V16SI V8DI])
 (define_mode_iterator VI48_128 [V4SI V2DI])
+(define_mode_iterator VI148_512 [V64QI V16SI V8DI])
+(define_mode_iterator VI148_256 [V32QI V8SI V4DI])
+(define_mode_iterator VI148_128 [V16QI V4SI V2DI])
 
 ;; Various 256bit and 512 vector integer mode combinations
 (define_mode_iterator VI124_256 [V32QI V16HI V8SI])
@@ -18603,6 +18608,26 @@
   operands[1] = lowpart_subreg (V16QImode, operands[1], V32QImode);
 })
 
+(define_insn_and_split "*avx2_zero_extendv16qiv16hi2_2"
+  [(set (match_operand:V32QI 0 "register_operand" "=v")
+	(vec_select:V32QI
+	  (vec_concat:V64QI
+	    (subreg:V32QI
+	      (vec_concat:VI248_256
+		(match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm")
+		(match_operand:<ssehalfvecmode> 2 "const0_operand" "C")) 0)
+	    (match_operand:V32QI 3 "const0_operand" "C"))
+	  (match_parallel 4 "pmovzx_parallel"
+	    [(match_operand 5 "const_int_operand" "n")])))]
+  "TARGET_AVX2"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V16HI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V16HImode, operands[0], V32QImode);
+  operands[1] = lowpart_subreg (V16QImode, operands[1], <ssehalfvecmode>mode);
+})
+
 (define_expand "<insn>v16qiv16hi2"
   [(set (match_operand:V16HI 0 "register_operand")
 	(any_extend:V16HI
@@ -18637,6 +18662,26 @@
   operands[1] = lowpart_subreg (V32QImode, operands[1], V64QImode);
 })
 
+(define_insn_and_split "*avx512bw_zero_extendv32qiv32hi2_2"
+  [(set (match_operand:V64QI 0 "register_operand" "=v")
+	(vec_select:V64QI
+	  (vec_concat:V128QI
+	    (subreg:V64QI
+	      (vec_concat:VI248_512
+		(match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm")
+		(match_operand:<ssehalfvecmode> 2 "const0_operand" "C")) 0)
+	    (match_operand:V64QI 3 "const0_operand" "C"))
+	  (match_parallel 4 "pmovzx_parallel"
+	    [(match_operand 5 "const_int_operand" "n")])))]
+  "TARGET_AVX512BW"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V32HI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V32HImode, operands[0], V64QImode);
+  operands[1] = lowpart_subreg (V32QImode, operands[1], <ssehalfvecmode>mode);
+})
+
 (define_expand "<insn>v32qiv32hi2"
   [(set (match_operand:V32HI 0 "register_operand")
 	(any_extend:V32HI
@@ -18723,6 +18768,41 @@
 }
   [(set_attr "isa" "noavx,noavx,avx")])
 
+(define_insn_and_split "*sse4_1_zero_extendv8qiv8hi2_4"
+  [(set (match_operand:V16QI 0 "register_operand" "=Yr,*x,Yw")
+	(vec_select:V16QI
+	  (vec_concat:V32QI
+	    (subreg:V16QI
+	      (vec_concat:VI248_128
+		(match_operand:<ssehalfvecmode> 1 "vector_operand" "YrBm,*xBm,Ywm")
+		(match_operand:<ssehalfvecmode> 2 "const0_operand" "C,C,C")) 0)
+	    (match_operand:V16QI 3 "const0_operand" "C,C,C"))
+	  (match_parallel 4 "pmovzx_parallel"
+	    [(match_operand 5 "const_int_operand" "n,n,n")])))]
+  "TARGET_SSE4_1"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+	(zero_extend:V8HI
+	  (vec_select:V8QI
+	    (match_dup 1)
+	    (parallel [(const_int 0) (const_int 1)
+		       (const_int 2) (const_int 3)
+		       (const_int 4) (const_int 5)
+		       (const_int 6) (const_int 7)]))))]
+{
+  operands[0] = lowpart_subreg (V8HImode, operands[0], V16QImode);
+  if (MEM_P (operands[1]))
+    {
+      operands[1] = lowpart_subreg (V8QImode, operands[1], <ssehalfvecmode>mode);
+      operands[1] = gen_rtx_ZERO_EXTEND (V8HImode, operands[1]);
+      emit_insn (gen_rtx_SET (operands[0], operands[1]));
+      DONE;
+    }
+  operands[1] = lowpart_subreg (V16QImode, operands[1], <ssehalfvecmode>mode);
+}
+  [(set_attr "isa" "noavx,noavx,avx")])
+
 (define_expand "<insn>v8qiv8hi2"
   [(set (match_operand:V8HI 0 "register_operand")
 	(any_extend:V8HI
@@ -18913,6 +18993,26 @@
   operands[1] = lowpart_subreg (V16HImode, operands[1], V32HImode);
 })
 
+(define_insn_and_split "*avx512f_zero_extendv16hiv16si2_2"
+  [(set (match_operand:V32HI 0 "register_operand" "=v")
+	(vec_select:V32HI
+	  (vec_concat:V64HI
+	    (subreg:V32HI
+	      (vec_concat:VI148_512
+	        (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm")
+		(match_operand:<ssehalfvecmode> 2 "const0_operand" "C")) 0)
+	    (match_operand:V32HI 3 "const0_operand" "C"))
+	  (match_parallel 4 "pmovzx_parallel"
+	    [(match_operand 5 "const_int_operand" "n")])))]
+  "TARGET_AVX512F"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V16SI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V16SImode, operands[0], V32HImode);
+  operands[1] = lowpart_subreg (V16HImode, operands[1], <ssehalfvecmode>mode);
+})
+
 (define_insn "avx2_<code>v8hiv8si2<mask_name>"
   [(set (match_operand:V8SI 0 "register_operand" "=v")
 	(any_extend:V8SI
@@ -18947,6 +19047,27 @@
   operands[1] = lowpart_subreg (V8HImode, operands[1], V16HImode);
 })
 
+(define_insn_and_split "*avx2_zero_extendv8hiv8si2_2"
+  [(set (match_operand:V16HI 0 "register_operand" "=v")
+	(vec_select:V16HI
+	  (vec_concat:V32HI
+	    (subreg:V16HI
+	      (vec_concat:VI148_256
+		(match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm")
+		(match_operand:<ssehalfvecmode> 2 "const0_operand" "C")) 0)
+	    (match_operand:V16HI 3 "const0_operand" "C"))
+	  (match_parallel 4 "pmovzx_parallel"
+	    [(match_operand 5 "const_int_operand" "n")])))]
+  "TARGET_AVX2"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V8SI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V8SImode, operands[0], V16HImode);
+  operands[1] = lowpart_subreg (V8HImode, operands[1], <ssehalfvecmode>mode);
+})
+
+
 (define_insn "sse4_1_<code>v4hiv4si2<mask_name>"
   [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v")
 	(any_extend:V4SI
@@ -19036,6 +19157,39 @@
 }
   [(set_attr "isa" "noavx,noavx,avx")])
 
+(define_insn_and_split "*sse4_1_zero_extendv4hiv4si2_4"
+  [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,v")
+	(vec_select:V8HI
+	  (vec_concat:V16HI
+	    (subreg:V8HI
+	      (vec_concat:VI148_128
+		(match_operand:<ssehalfvecmode> 1 "vector_operand" "YrBm,*xBm,vm")
+		(match_operand:<ssehalfvecmode> 2 "const0_operand" "C,C,C")) 0)
+	    (match_operand:V8HI 3 "const0_operand" "C,C,C"))
+	  (match_parallel 4 "pmovzx_parallel"
+	    [(match_operand 5 "const_int_operand" "n,n,n")])))]
+  "TARGET_SSE4_1"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+	(zero_extend:V4SI
+	  (vec_select:V4HI
+	    (match_dup 1)
+	    (parallel [(const_int 0) (const_int 1)
+		       (const_int 2) (const_int 3)]))))]
+{
+  operands[0] = lowpart_subreg (V4SImode, operands[0], V8HImode);
+  if (MEM_P (operands[1]))
+    {
+      operands[1] = lowpart_subreg (V4HImode, operands[1], <ssehalfvecmode>mode);
+      operands[1] = gen_rtx_ZERO_EXTEND (V4SImode, operands[1]);
+      emit_insn (gen_rtx_SET (operands[0], operands[1]));
+      DONE;
+    }
+  operands[1] = lowpart_subreg (V8HImode, operands[1], <ssehalfvecmode>mode);
+}
+  [(set_attr "isa" "noavx,noavx,avx")])
+
 (define_insn "avx512f_<code>v8qiv8di2<mask_name>"
   [(set (match_operand:V8DI 0 "register_operand" "=v")
 	(any_extend:V8DI
@@ -19346,6 +19500,24 @@
   operands[1] = lowpart_subreg (V8SImode, operands[1], V16SImode);
 })
 
+(define_insn_and_split "*avx512f_zero_extendv8siv8di2_2"
+  [(set (match_operand:V16SI 0 "register_operand" "=v")
+	(vec_select:V16SI
+	  (vec_concat:V32SI
+	    (vec_concat:V16SI
+	      (match_operand:V8SI 1 "nonimmediate_operand" "vm")
+	      (match_operand:V8SI 2 "const0_operand" "C"))
+	    (match_operand:V16SI 3 "const0_operand" "C"))
+	  (match_parallel 4 "pmovzx_parallel"
+	    [(match_operand 5 "const_int_operand" "n")])))]
+  "TARGET_AVX512F"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V8DI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V8DImode, operands[0], V16SImode);
+})
+
 (define_expand "<insn>v8siv8di2"
   [(set (match_operand:V8DI 0 "register_operand" "=v")
 	(any_extend:V8DI
@@ -19380,6 +19552,24 @@
   operands[1] = lowpart_subreg (V4SImode, operands[1], V8SImode);
 })
 
+(define_insn_and_split "*avx2_zero_extendv4siv4di2_2"
+  [(set (match_operand:V8SI 0 "register_operand" "=v")
+	(vec_select:V8SI
+	  (vec_concat:V16SI
+	    (vec_concat:V8SI
+	      (match_operand:V4SI 1 "nonimmediate_operand" "vm")
+	      (match_operand:V4SI 2 "const0_operand" "C"))
+	    (match_operand:V8SI 3 "const0_operand" "C"))
+	  (match_parallel 4 "pmovzx_parallel"
+	    [(match_operand 5 "const_int_operand" "n")])))]
+  "TARGET_AVX2"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V4DI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V4DImode, operands[0], V8SImode);
+})
+
 (define_expand "<insn>v4siv4di2"
   [(set (match_operand:V4DI 0 "register_operand")
 	(any_extend:V4DI
@@ -19456,6 +19646,35 @@
 }
   [(set_attr "isa" "noavx,noavx,avx")])
 
+(define_insn_and_split "*sse4_1_zero_extendv2siv2di2_4"
+  [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v")
+	(vec_select:V4SI
+	  (vec_concat:V8SI
+	    (vec_concat:V4SI
+	      (match_operand:V2SI 1 "vector_operand" "YrBm, *xBm, vm")
+	      (match_operand:V2SI 2 "const0_operand" "C,C,C"))
+	    (match_operand:V4SI 3 "const0_operand" "C,C,C"))
+	  (match_parallel 4 "pmovzx_parallel"
+	    [(match_operand 5 "const_int_operand" "n,n,n")])))]
+  "TARGET_SSE4_1"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+	(zero_extend:V2DI
+	  (vec_select:V2SI (match_dup 1)
+			   (parallel [(const_int 0) (const_int 1)]))))]
+{
+  operands[0] = lowpart_subreg (V2DImode, operands[0], V4SImode);
+  if (MEM_P (operands[1]))
+    {
+      operands[1] = gen_rtx_ZERO_EXTEND (V2DImode, operands[1]);
+      emit_insn (gen_rtx_SET (operands[0], operands[1]));
+      DONE;
+    }
+  operands[1] = lowpart_subreg (V4SImode, operands[1], V2SImode);
+}
+  [(set_attr "isa" "noavx,noavx,avx")])
+
 (define_expand "<insn>v2siv2di2"
   [(set (match_operand:V2DI 0 "register_operand")
 	(any_extend:V2DI
diff --git a/gcc/testsuite/gcc.target/i386/pr101846-1.c b/gcc/testsuite/gcc.target/i386/pr101846-1.c
new file mode 100644
index 00000000000..40d95bde6fd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101846-1.c
@@ -0,0 +1,95 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mavx512vl -mavx512dq -O2" } */
+/* { dg-final { scan-assembler-not "vmov" } } */
+/* { dg-final { scan-assembler-times "vpmovzxbw" "3" } } */
+/* { dg-final { scan-assembler-times "vpmovzxwd" "3" } } */
+/* { dg-final { scan-assembler-times "vpmovzxdq" "3" } } */
+
+typedef short v4hi __attribute__((vector_size (8)));
+typedef short v8hi __attribute__((vector_size (16)));
+typedef short v16hi __attribute__((vector_size (32)));
+typedef short v32hi __attribute__((vector_size (64)));
+typedef char v8qi __attribute__((vector_size (8)));
+typedef char v16qi __attribute__((vector_size (16)));
+typedef char v32qi __attribute__((vector_size (32)));
+typedef char v64qi __attribute__((vector_size (64)));
+typedef int v2si __attribute__((vector_size (8)));
+typedef int v4si __attribute__((vector_size (16)));
+typedef int v8si __attribute__((vector_size (32)));
+typedef int v16si __attribute__((vector_size (64)));
+
+v32hi
+foo_zxwd_512 (v16hi x)
+{
+  return __builtin_shufflevector (x, (v16hi) {},
+				  0, 16, 1, 17, 2, 18, 3, 19,
+				  4, 20, 5, 21, 6, 22, 7, 23,
+				  8, 24, 9, 25, 10, 26, 11, 27,
+				  12, 28, 13, 29, 14, 30, 15, 31);
+}
+
+v16hi
+foo_zxwd_256 (v8hi x)
+{
+  return __builtin_shufflevector (x, (v8hi) {},
+				  0, 8, 1, 9, 2, 10, 3, 11,
+				  4, 12, 5, 13, 6, 14, 7, 15);
+}
+
+v8hi
+foo_zxwd_128 (v4hi x)
+{
+  return __builtin_shufflevector (x, (v4hi) {}, 0, 4, 1, 5, 2, 6, 3, 7);
+}
+
+v16si
+foo_zxdq_512 (v8si x)
+{
+  return __builtin_shufflevector (x, (v8si) {},
+				  0, 8, 1, 9, 2, 10, 3, 11,
+				  4, 12, 5, 13, 6, 14, 7, 15);
+}
+
+v8si
+foo_zxdq_256 (v4si x)
+{
+  return __builtin_shufflevector (x, (v4si) {}, 0, 4, 1, 5, 2, 6, 3, 7);
+}
+
+v4si
+foo_zxdq_128 (v2si x)
+{
+  return __builtin_shufflevector (x, (v2si) {}, 0, 2, 1, 3);
+}
+
+v64qi
+foo_zxbw_512 (v32qi x)
+{
+  return __builtin_shufflevector (x, (v32qi) {},
+				  0, 32, 1, 33, 2, 34, 3, 35,
+				  4, 36, 5, 37, 6, 38, 7, 39,
+				  8, 40, 9, 41, 10, 42, 11, 43,
+				  12, 44, 13, 45, 14, 46, 15, 47,
+				  16, 48, 17, 49, 18, 50, 19, 51,
+				  20, 52, 21, 53, 22, 54, 23, 55,
+				  24, 56, 25, 57, 26, 58, 27, 59,
+				  28, 60, 29, 61, 30, 62, 31, 63);
+}
+
+v32qi
+foo_zxbw_256 (v16qi x)
+{
+  return __builtin_shufflevector (x, (v16qi) {},
+				  0, 16, 1, 17, 2, 18, 3, 19,
+				  4, 20, 5, 21, 6, 22, 7, 23,
+				  8, 24, 9, 25, 10, 26, 11, 27,
+				  12, 28, 13, 29, 14, 30, 15, 31);
+}
+
+v16qi
+foo_zxbw_128 (v8qi x)
+{
+  return __builtin_shufflevector (x, (v8qi) {},
+				  0, 8, 1, 9, 2, 10, 3, 11,
+				  4, 12, 5, 13, 6, 14, 7, 15);
+}
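
As a quick check, compiling the new test with the options from its dg-options
line (gcc -S -O2 -mavx512bw -mavx512vl -mavx512dq pr101846-1.c) should show
each foo_* function reduced to a single vpmovzx instruction with no vmov;
e.g. for foo_zxwd_512 the expected body, per the example in the commit
message above, is:

	vpmovzxwd	%ymm0, %zmm0
	ret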

