public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
From: Hongtao Liu <crazylht@gmail.com>
To: GCC Patches <gcc-patches@gcc.gnu.org>
Cc: Jakub Jelinek <jakub@redhat.com>
Subject: [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680]
Date: Fri, 23 Apr 2021 12:53:58 +0800	[thread overview]
Message-ID: <CAMZc-byCJGFwdui=Zj7Gkcub9=QaUACOAn1YtozbcPR5JLr0DQ@mail.gmail.com> (raw)

[-- Attachment #1: Type: text/plain, Size: 1169 bytes --]

Hi:
  If the second operand of __builtin_shuffle is const vector 0, and with
specific mask, it can be optimized to movq/vmovps.

.i.e.
foo128:
-       vxorps  %xmm1, %xmm1, %xmm1
-       vmovlhps        %xmm1, %xmm0, %xmm0
+       vmovq   %xmm0, %xmm0

 foo256:
-       vxorps  %xmm1, %xmm1, %xmm1
-       vshuff32x4      $0, %ymm1, %ymm0, %ymm0
+       vmovaps %xmm0, %xmm0

 foo512:
-       vxorps  %xmm1, %xmm1, %xmm1
-       vshuff32x4      $68, %zmm1, %zmm0, %zmm0
+       vmovaps %ymm0, %ymm0

  Bootstrapped and regtested on x86-64_iinux-gnu{-m32,}.
  Ok for trunk?

gcc/ChangeLog:

        PR target/94680
        * config/i386/sse.md (ssedoublevecmode): Add attribute for
        V64QI/V32HI/V16SI/V4DI.
        (ssehalfvecmode): Add attribute for V2DI/V2DF.
        (*vec_concatv4si_0): Extend to VI124_128.
        (*vec_concat<mode>_0): New pre-reload splitter.
        * config/i386/predicates.md (movq_parallel): New predicate.

gcc/testsuite/ChangeLog:

        PR target/94680
        * gcc.target/i386/avx-pr94680.c: New test.
        * gcc.target/i386/avx512f-pr94680.c: New test.
        * gcc.target/i386/sse2-pr94680.c: New test.


-- 
BR,
Hongtao

[-- Attachment #2: 0001-i386-Optimize-__builtin_shuffle-when-it-s-used-to-ze.patch --]
[-- Type: text/x-patch, Size: 12065 bytes --]

From eec5469cdeecf0e6650e9d2963dea4117919c5d2 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Thu, 22 Apr 2021 15:33:16 +0800
Subject: [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the
 upper bits of the dest. [PR target/94680]

If the second operand of __builtin_shuffle is const vector 0, and with
specific mask, it can be optimized to movq/vmovps.

.i.e.
foo128:
-       vxorps  %xmm1, %xmm1, %xmm1
-       vmovlhps        %xmm1, %xmm0, %xmm0
+       vmovq   %xmm0, %xmm0

 foo256:
-       vxorps  %xmm1, %xmm1, %xmm1
-       vshuff32x4      $0, %ymm1, %ymm0, %ymm0
+       vmovaps %xmm0, %xmm0

 foo512:
-       vxorps  %xmm1, %xmm1, %xmm1
-       vshuff32x4      $68, %zmm1, %zmm0, %zmm0
+       vmovaps %ymm0, %ymm0

gcc/ChangeLog:

	PR target/94680
	* config/i386/sse.md (ssedoublevecmode): Add attribute for
	V64QI/V32HI/V16SI/V4DI.
	(ssehalfvecmode): Add attribute for V2DI/V2DF.
	(*vec_concatv4si_0): Extend to VI124_128.
	(*vec_concat<mode>_0): New pre-reload splitter.
	* config/i386/predicates.md (movq_parallel): New predicate.

gcc/testsuite/ChangeLog:

	PR target/94680
	* gcc.target/i386/avx-pr94680.c: New test.
	* gcc.target/i386/avx512f-pr94680.c: New test.
	* gcc.target/i386/sse2-pr94680.c: New test.
---
 gcc/config/i386/predicates.md                 | 33 ++++++++
 gcc/config/i386/sse.md                        | 37 +++++++--
 gcc/testsuite/gcc.target/i386/avx-pr94680.c   | 59 ++++++++++++++
 .../gcc.target/i386/avx512f-pr94680.c         | 78 +++++++++++++++++++
 gcc/testsuite/gcc.target/i386/sse2-pr94680.c  | 51 ++++++++++++
 5 files changed, 250 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx-pr94680.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-pr94680.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-pr94680.c

diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index b1df8548af6..4b706003ed8 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1524,6 +1524,39 @@ (define_predicate "misaligned_operand"
   (and (match_code "mem")
        (match_test "MEM_ALIGN (op) < GET_MODE_BITSIZE (mode)")))
 
+;; Return true if OP is a parallel for an mov{d,q,dqa,ps,pd} vec_select,
+;; where one of the two operands of the vec_concat is const0_operand.
+(define_predicate "movq_parallel"
+  (match_code "parallel")
+{
+  unsigned nelt = XVECLEN (op, 0);
+  unsigned nelt2 = nelt >> 1;
+  unsigned i;
+
+  if (nelt < 2)
+    return false;
+
+  /* Validate that all of the elements are constants,
+     lower halves of permute are lower halves of the first operand,
+     upper halves of permute come from any of the second operand.  */
+  for (i = 0; i < nelt; ++i)
+    {
+      rtx er = XVECEXP (op, 0, i);
+      unsigned HOST_WIDE_INT ei;
+
+      if (!CONST_INT_P (er))
+	return 0;
+      ei = INTVAL (er);
+      if (i < nelt2 && ei != i)
+	return 0;
+      if (i >= nelt2
+      	 && (ei < nelt || ei >= nelt<<1))
+	return 0;
+    }
+
+  return 1;
+})
+
 ;; Return true if OP is a vzeroall operation, known to be a PARALLEL.
 (define_predicate "vzeroall_operation"
   (match_code "parallel")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 9d3728d1cb0..b55636a3e12 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -812,19 +812,22 @@ (define_mode_attr sseintvecmodelower
 
 ;; Mapping of vector modes to a vector mode of double size
 (define_mode_attr ssedoublevecmode
-  [(V32QI "V64QI") (V16HI "V32HI") (V8SI "V16SI") (V4DI "V8DI")
+  [(V64QI "V128QI") (V32HI "V64HI") (V16SI "V32SI") (V8DI "V16DI")
+   (V32QI "V64QI") (V16HI "V32HI") (V8SI "V16SI") (V4DI "V8DI")
    (V16QI "V32QI") (V8HI "V16HI") (V4SI "V8SI") (V2DI "V4DI")
+   (V16SF "V32SF") (V8DF "V16DF")
    (V8SF "V16SF") (V4DF "V8DF")
    (V4SF "V8SF") (V2DF "V4DF")])
 
 ;; Mapping of vector modes to a vector mode of half size
+;; instead of V1DI/V1DF, DI/DF are used for V2DI/V2DF although they are scalar.
 (define_mode_attr ssehalfvecmode
   [(V64QI "V32QI") (V32HI "V16HI") (V16SI "V8SI") (V8DI "V4DI") (V4TI "V2TI")
    (V32QI "V16QI") (V16HI  "V8HI") (V8SI  "V4SI") (V4DI "V2DI")
-   (V16QI  "V8QI") (V8HI   "V4HI") (V4SI  "V2SI")
+   (V16QI  "V8QI") (V8HI   "V4HI") (V4SI  "V2SI") (V2DI "DI")
    (V16SF "V8SF") (V8DF "V4DF")
    (V8SF  "V4SF") (V4DF "V2DF")
-   (V4SF  "V2SF")])
+   (V4SF  "V2SF") (V2DF "DF")])
 
 (define_mode_attr ssehalfvecmodelower
   [(V64QI "v32qi") (V32HI "v16hi") (V16SI "v8si") (V8DI "v4di") (V4TI "v2ti")
@@ -15964,11 +15967,11 @@ (define_insn "*vec_concatv4si"
    (set_attr "prefix" "orig,maybe_evex,orig,orig,maybe_evex")
    (set_attr "mode" "TI,TI,V4SF,V2SF,V2SF")])
 
-(define_insn "*vec_concatv4si_0"
-  [(set (match_operand:V4SI 0 "register_operand"       "=v,x")
-	(vec_concat:V4SI
-	  (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y")
-	  (match_operand:V2SI 2 "const0_operand"       " C,C")))]
+(define_insn "*vec_concat<mode>_0"
+  [(set (match_operand:VI124_128 0 "register_operand"       "=v,x")
+	(vec_concat:VI124_128
+	  (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm,?!*y")
+	  (match_operand:<ssehalfvecmode> 2 "const0_operand"       " C,C")))]
   "TARGET_SSE2"
   "@
    %vmovq\t{%1, %0|%0, %1}
@@ -22154,6 +22157,24 @@ (define_insn "avx_vec_concat<mode>"
    (set_attr "prefix" "maybe_evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn_and_split "*vec_concat<mode>_0"
+  [(set (match_operand:V 0 "register_operand")
+	(vec_select:V
+	  (vec_concat:<ssedoublevecmode>
+	    (match_operand:V 1 "nonimmediate_operand")
+	    (match_operand:V 2 "const0_operand"))
+	  (match_parallel 3 "movq_parallel"
+	    [(match_operand 4 "const_int_operand")])))]
+  "ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(vec_concat:V (match_dup 1) (match_dup 5)))]
+{
+  operands[1] = gen_lowpart (<ssehalfvecmode>mode, operands[1]);
+  operands[5] = CONST0_RTX (<ssehalfvecmode>mode);
+})
+
 (define_insn "vcvtph2ps<mask_name>"
   [(set (match_operand:V4SF 0 "register_operand" "=v")
 	(vec_select:V4SF
diff --git a/gcc/testsuite/gcc.target/i386/avx-pr94680.c b/gcc/testsuite/gcc.target/i386/avx-pr94680.c
new file mode 100644
index 00000000000..4fe0f5bede6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-pr94680.c
@@ -0,0 +1,59 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx -mno-avx512f -O2" } */
+/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%xmm[0-9]} 6 } } */
+/* { dg-final { scan-assembler-not "pxor" } } */
+
+typedef float v8sf __attribute__((vector_size(32)));
+typedef double v4df __attribute__ ((vector_size (32)));
+typedef long long v4di __attribute__((vector_size(32)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef short v16hi __attribute__ ((vector_size (32)));
+typedef char v32qi __attribute__ ((vector_size (32)));
+
+v4df
+foo_v4df (v4df x)
+{
+  return __builtin_shuffle (x, (v4df) { 0, 0, 0, 0 }, (v4di) { 0, 1, 4, 5 });
+}
+
+v4di
+foo_v4di (v4di x)
+{
+  return __builtin_shuffle (x, (v4di) { 0, 0, 0, 0 }, (v4di) { 0, 1, 4, 7 });
+}
+
+v8sf
+foo_v8sf (v8sf x)
+{
+  return __builtin_shuffle (x, (v8sf) { 0, 0, 0, 0, 0, 0, 0, 0 },
+			    (v8si) { 0, 1, 2, 3, 8, 9, 10, 11 });
+}
+
+v8si
+foo_v8si (v8si x)
+{
+  return __builtin_shuffle (x, (v8si) { 0, 0, 0, 0, 0, 0, 0, 0 },
+			    (v8si) { 0, 1, 2, 3, 13, 12, 11, 15 });
+}
+
+v16hi
+foo_v16hi (v16hi x)
+{
+  return __builtin_shuffle (x, (v16hi)  { 0, 0, 0, 0, 0, 0, 0, 0,
+					  0, 0, 0, 0, 0, 0, 0, 0 },
+			       (v16hi) { 0, 1, 2, 3, 4, 5, 6, 7,
+					 24, 17, 26, 19, 28, 21, 30, 23 });
+  }
+
+v32qi
+foo_v32qi (v32qi x)
+{
+  return __builtin_shuffle (x, (v32qi) { 0, 0, 0, 0, 0, 0, 0, 0,
+					 0, 0, 0, 0, 0, 0, 0, 0,
+					 0, 0, 0, 0, 0, 0, 0, 0,
+					 0, 0, 0, 0, 0, 0, 0, 0 },
+			       (v32qi) { 0, 1, 2, 3, 4, 5, 6, 7,
+					 8, 9, 10, 11, 12, 13, 14, 15,
+					 32, 49, 34, 58, 36, 53, 38, 39,
+					 40, 60, 42, 43, 63, 45, 46, 47 });
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c b/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c
new file mode 100644
index 00000000000..442b79da420
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c
@@ -0,0 +1,78 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mavx512vbmi -O2" } */
+/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%ymm[0-9]} 6} } */
+/* { dg-final { scan-assembler-not "pxor" } } */
+
+
+typedef float v16sf __attribute__((vector_size(64)));
+typedef double v8df __attribute__ ((vector_size (64)));
+typedef long long v8di __attribute__((vector_size(64)));
+typedef int v16si __attribute__((vector_size(64)));
+typedef short v32hi __attribute__ ((vector_size (64)));
+typedef char v64qi __attribute__ ((vector_size (64)));
+
+v8df
+foo_v8df (v8df x)
+{
+  return __builtin_shuffle (x, (v8df) { 0, 0, 0, 0, 0, 0, 0, 0 },
+			    (v8di) { 0, 1, 2, 3, 8, 9, 10, 11 });
+}
+
+v8di
+foo_v8di (v8di x)
+{
+  return __builtin_shuffle (x, (v8di) { 0, 0, 0, 0, 0, 0, 0, 0 },
+			    (v8di) { 0, 1, 2, 3, 8, 9, 10, 11 });
+}
+
+v16sf
+foo_v16sf (v16sf x)
+{
+  return __builtin_shuffle (x, (v16sf)  { 0, 0, 0, 0, 0, 0, 0, 0,
+					   0, 0, 0, 0, 0, 0, 0, 0 },
+			       (v16si) { 0, 1, 2, 3, 4, 5, 6, 7,
+					 16, 17, 18, 19, 20, 21, 22, 23 });
+}
+
+v16si
+foo_v16si (v16si x)
+{
+    return __builtin_shuffle (x, (v16si)  { 0, 0, 0, 0, 0, 0, 0, 0,
+					   0, 0, 0, 0, 0, 0, 0, 0 },
+			       (v16si) { 0, 1, 2, 3, 4, 5, 6, 7,
+					 16, 17, 18, 19, 20, 21, 22, 23 });
+}
+
+v32hi
+foo_v32hi (v32hi x)
+{
+  return __builtin_shuffle (x, (v32hi) { 0, 0, 0, 0, 0, 0, 0, 0,
+					 0, 0, 0, 0, 0, 0, 0, 0,
+					 0, 0, 0, 0, 0, 0, 0, 0,
+					 0, 0, 0, 0, 0, 0, 0, 0 },
+			       (v32hi) { 0, 1, 2, 3, 4, 5, 6, 7,
+					 8, 9, 10, 11, 12, 13, 14, 15,
+					 32, 33, 34, 35, 36, 37, 38, 39,
+					 40,41, 42, 43, 44, 45, 46, 47 });
+}
+
+v64qi
+foo_v64qi (v64qi x)
+{
+  return __builtin_shuffle (x, (v64qi) { 0, 0, 0, 0, 0, 0, 0, 0,
+					 0, 0, 0, 0, 0, 0, 0, 0,
+					 0, 0, 0, 0, 0, 0, 0, 0,
+					 0, 0, 0, 0, 0, 0, 0, 0,
+					 0, 0, 0, 0, 0, 0, 0, 0,
+					 0, 0, 0, 0, 0, 0, 0, 0,
+					 0, 0, 0, 0, 0, 0, 0, 0,
+					 0, 0, 0, 0, 0, 0, 0, 0 },
+			       (v64qi) {0, 1, 2, 3, 4, 5, 6, 7,
+					  8, 9, 10, 11, 12, 13, 14, 15,
+					  16, 17, 18, 19, 20, 21, 22, 23,
+					  24, 25, 26, 27, 28, 29, 30, 31,
+					  64, 65, 66, 67, 68, 69, 70, 71,
+					  72, 73, 74, 75, 76, 77, 78, 79,
+					  80, 81, 82, 83, 84, 85, 86, 87,
+					  88, 89, 90, 91, 92, 93, 94, 95 });
+}
diff --git a/gcc/testsuite/gcc.target/i386/sse2-pr94680.c b/gcc/testsuite/gcc.target/i386/sse2-pr94680.c
new file mode 100644
index 00000000000..7f4f98e3b1b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-pr94680.c
@@ -0,0 +1,51 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -mno-sse4.1 -O2" } */
+/* { dg-final { scan-assembler-times {(?n)mov.*%xmm[0-9]} 6} } */
+/* { dg-final { scan-assembler-not "pxor" } } */
+
+typedef float v4sf __attribute__((vector_size(16)));
+typedef double v2df __attribute__ ((vector_size (16)));
+typedef long long v2di __attribute__((vector_size(16)));
+typedef int v4si __attribute__((vector_size(16)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+typedef char v16qi __attribute__ ((vector_size (16)));
+
+v2df
+foo_v2df (v2df x)
+{
+  return __builtin_shuffle (x, (v2df) { 0, 0 }, (v2di) {0, 2});
+}
+
+v2di
+foo_v2di (v2di x)
+{
+  return __builtin_shuffle (x, (v2di) { 0, 0 }, (v2di) {0, 3});
+}
+
+v4sf
+foo_v4sf (v4sf x)
+{
+  return __builtin_shuffle (x, (v4sf) { 0, 0, 0, 0 }, (v4si) {0, 1, 4, 5});
+}
+
+v4si
+foo_v4si (v4si x)
+{
+  return __builtin_shuffle (x, (v4si) { 0, 0, 0, 0 }, (v4si) {0, 1, 6, 7});
+}
+
+v8hi
+foo_v8hi (v8hi x)
+{
+  return __builtin_shuffle (x, (v8hi) { 0, 0, 0, 0, 0, 0, 0, 0 },
+			       (v8hi) { 0, 1, 2, 3, 8, 12, 10, 13 });
+}
+
+v16qi
+foo_v16qi (v16qi x)
+{
+  return __builtin_shuffle (x, (v16qi) { 0, 0, 0, 0, 0, 0, 0, 0,
+					 0, 0, 0, 0, 0, 0, 0, 0 },
+			       (v16qi) {0, 1, 2, 3, 4, 5, 6, 7,
+					16, 24, 18, 26, 20, 28, 22, 30 });
+}
-- 
2.18.1


             reply	other threads:[~2021-04-23  4:49 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-04-23  4:53 Hongtao Liu [this message]
2021-04-23  9:13 ` Jakub Jelinek
2021-04-25  6:57   ` Hongtao Liu
2021-05-12  7:30     ` Hongtao Liu
2021-05-12 14:19     ` Jakub Jelinek
2021-05-13  0:44       ` Hongtao Liu
2021-05-13  5:52         ` Hongtao Liu
2021-06-11  8:59 ` [PATCH] i386: Fix up *vec_concat<mode>_0_1 [PR101007] Jakub Jelinek
2021-06-11  9:34   ` Uros Bizjak

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='CAMZc-byCJGFwdui=Zj7Gkcub9=QaUACOAn1YtozbcPR5JLr0DQ@mail.gmail.com' \
    --to=crazylht@gmail.com \
    --cc=gcc-patches@gcc.gnu.org \
    --cc=jakub@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).