public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
From: liuhongt <hongtao.liu@intel.com>
To: gcc-patches@gcc.gnu.org
Cc: crazylht@gmail.com, hjl.tools@gmail.com
Subject: [PATCH] [x86] Support 2-instruction vector shuffle for V4SI/V4SF in ix86_expand_vec_perm_const_1.
Date: Mon, 26 Sep 2022 11:14:34 +0800	[thread overview]
Message-ID: <20220926031434.47605-1-hongtao.liu@intel.com> (raw)
In-Reply-To: <Yy1X2PQntkgAZ/t7@tucnak>

>Missing space before (
Changed.
>> +      /* shufps.  */
>> +      ok = expand_vselect_vconcat(tmp, d->op0, d->op1,
>> +                               perm1, d->nelt, false);
>
>Ditto.
Changed.
>
>> +      /* When lone_idx is not 0, it must from second op(count == 1).  */
>> +      gcc_assert ((lone_idx == 0 && count == 3)
>> +               || (lone_idx != 0 && count == 1));
>
>Perhaps write it more simply as
>      gcc_assert (count == (lone_idx ? 1 : 3));
>?
Changed.
>
>> +      /* shufps.  */
>> +      ok = expand_vselect_vconcat(tmp, d->op0, d->op1,
>> +                               perm1, d->nelt, false);
>
>Missing space before (
>
Changed.
>> +      gcc_assert (ok);
>> +
>> +      /* Refine lone and pair index to original order.  */
>> +      perm1[shift] = lone_idx << 1;
>> +      perm1[shift + 1] = pair_idx << 1;
>> +
>> +      /* Select the remaining 2 elements in another vector.  */
>> +      for (i = 2 - shift; i < 4 - shift; ++i)
>> +     perm1[i] = (lone_idx == 1) ? (d->perm[i] + 4) : d->perm[i];
>
>All the ()s in the above line aren't needed.
>
Changed.
>> +      /* shufps.  */
>> +      ok = expand_vselect_vconcat(d->target, tmp, d->op1,
>> +                               perm1, d->nelt, false);
>
>Again, missing space
>
>Otherwise LGTM
Thanks, here's the updated patch I'm going to check in.

2022-09-23  Hongtao Liu  <hongtao.liu@intel.com>
	    Liwei Xu  <liwei.xu@intel.com>

gcc/ChangeLog:

	PR target/53346
	* config/i386/i386-expand.cc (expand_vec_perm_shufps_shufps):
	New function.
	(ix86_expand_vec_perm_const_1): Insert
	expand_vec_perm_shufps_shufps at the end of 2-instruction
	expand sequence.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr53346-1.c: New test.
	* gcc.target/i386/pr53346-2.c: New test.
	* gcc.target/i386/pr53346-3.c: New test.
	* gcc.target/i386/pr53346-4.c: New test.
---
 gcc/config/i386/i386-expand.cc            | 116 ++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr53346-1.c |  70 +++++++++++++
 gcc/testsuite/gcc.target/i386/pr53346-2.c |  59 +++++++++++
 gcc/testsuite/gcc.target/i386/pr53346-3.c |  69 +++++++++++++
 gcc/testsuite/gcc.target/i386/pr53346-4.c |  59 +++++++++++
 5 files changed, 373 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr53346-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr53346-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr53346-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr53346-4.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 5334363e235..6baff6d0e61 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -19604,6 +19604,119 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
   return false;
 }
 
+/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
+   in terms of a pair of shufps + shufps/pshufd instructions.  */
+static bool
+expand_vec_perm_shufps_shufps (struct expand_vec_perm_d *d)
+{
+  unsigned char perm1[4];
+  machine_mode vmode = d->vmode;
+  bool ok;
+  unsigned i, j, k, count = 0;
+
+  if (d->one_operand_p
+      || (vmode != V4SImode && vmode != V4SFmode))
+    return false;
+
+  if (d->testing_p)
+    return true;
+
+  for (i = 0; i < 4; ++i)
+    count += d->perm[i] > 3 ? 1 : 0;
+
+  gcc_assert (count & 3);
+
+  rtx tmp = gen_reg_rtx (vmode);
+  /* 2 from op0 and 2 from op1.  */
+  if (count == 2)
+    {
+      unsigned char perm2[4];
+      for (i = 0, j = 0, k = 2; i < 4; ++i)
+	if (d->perm[i] & 4)
+	  {
+	    perm1[k++] = d->perm[i];
+	    perm2[i] = k - 1;
+	  }
+	else
+	  {
+	    perm1[j++] = d->perm[i];
+	    perm2[i] = j - 1;
+	  }
+
+      /* shufps.  */
+      ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
+				  perm1, d->nelt, false);
+      gcc_assert (ok);
+      if (vmode == V4SImode && TARGET_SSE2)
+      /* pshufd.  */
+	ok = expand_vselect (d->target, tmp,
+			     perm2, d->nelt, false);
+      else
+	{
+	  /* shufps.  */
+	  perm2[2] += 4;
+	  perm2[3] += 4;
+	  ok = expand_vselect_vconcat (d->target, tmp, tmp,
+				       perm2, d->nelt, false);
+	}
+      gcc_assert (ok);
+    }
+  /* 3 from one op and 1 from another.  */
+  else
+    {
+      unsigned pair_idx = 8, lone_idx = 8, shift;
+
+      /* Find the lone index.  */
+      for (i = 0; i < 4; ++i)
+	if ((d->perm[i] > 3 && count == 1)
+	    || (d->perm[i] < 4 && count == 3))
+	  lone_idx = i;
+
+      /* When lone_idx is not 0, it must from second op(count == 1).  */
+      gcc_assert (count == (lone_idx ? 1 : 3));
+
+      /* Find the pair index that sits in the same half as the lone index.  */
+      shift = lone_idx & 2;
+      pair_idx = 1 - lone_idx + 2 * shift;
+
+      /* First permute lone index and pair index into the same vector as
+	 [ lone, lone, pair, pair ].  */
+      perm1[1] = perm1[0]
+	= (count == 3) ? d->perm[lone_idx] : d->perm[lone_idx] - 4;
+      perm1[3] = perm1[2]
+	= (count == 3) ? d->perm[pair_idx] : d->perm[pair_idx] + 4;
+
+      /* Always put the vector that contains the lone index first.  */
+      if (count == 1)
+	std::swap (d->op0, d->op1);
+
+      /* shufps.  */
+      ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
+				   perm1, d->nelt, false);
+      gcc_assert (ok);
+
+      /* Refine lone and pair index to original order.  */
+      perm1[shift] = lone_idx << 1;
+      perm1[shift + 1] = pair_idx << 1;
+
+      /* Select the remaining 2 elements in another vector.  */
+      for (i = 2 - shift; i < 4 - shift; ++i)
+	perm1[i] = lone_idx == 1 ? d->perm[i] + 4 : d->perm[i];
+
+      /* Adjust to original selector.  */
+      if (lone_idx > 1)
+	std::swap (tmp, d->op1);
+
+      /* shufps.  */
+      ok = expand_vselect_vconcat (d->target, tmp, d->op1,
+				   perm1, d->nelt, false);
+
+      gcc_assert (ok);
+    }
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
    in terms of a pair of pshuflw + pshufhw instructions.  */
 
@@ -22152,6 +22265,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
   if (expand_vec_perm_2perm_pblendv (d, true))
     return true;
 
+  if (expand_vec_perm_shufps_shufps (d))
+    return true;
+
   /* Try sequences of three instructions.  */
 
   if (expand_vec_perm_even_odd_pack (d))
diff --git a/gcc/testsuite/gcc.target/i386/pr53346-1.c b/gcc/testsuite/gcc.target/i386/pr53346-1.c
new file mode 100644
index 00000000000..6d230da632c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr53346-1.c
@@ -0,0 +1,70 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -O2 -mno-sse3" } */
+/* { dg-final { scan-assembler-times "shufps" 15 } } */
+/* { dg-final { scan-assembler-times "pshufd" 2 } } */
+
+typedef int v4si __attribute__((vector_size(16)));
+
+v4si
+__attribute__((noipa))
+foo (v4si a, v4si b)
+{
+  return __builtin_shufflevector (a, b, 1, 2, 5, 3);
+}
+
+v4si
+__attribute__((noipa))
+foo1 (v4si a, v4si b)
+{
+  return __builtin_shufflevector (a, b, 1, 5, 2, 3);
+}
+
+v4si
+__attribute__((noipa))
+foo2 (v4si a, v4si b)
+{
+  return __builtin_shufflevector (a, b, 1, 2, 3, 5);
+}
+
+v4si
+__attribute__((noipa))
+foo3 (v4si a, v4si b)
+{
+  return __builtin_shufflevector (a, b, 1, 4, 5, 6);
+}
+
+v4si
+__attribute__((noipa))
+foo4 (v4si a, v4si b)
+{
+  return __builtin_shufflevector (a, b, 3, 6, 7, 5);
+}
+
+v4si
+__attribute__((noipa))
+foo5 (v4si a, v4si b)
+{
+  return __builtin_shufflevector (a, b, 2, 4, 7, 6);
+}
+
+v4si
+__attribute__((noipa))
+foo6 (v4si a, v4si b)
+{
+  return __builtin_shufflevector (a, b, 2, 4, 3, 6);
+}
+
+v4si
+__attribute__((noipa))
+foo7 (v4si a, v4si b)
+{
+  return __builtin_shufflevector (a, b, 2, 3, 4, 6);
+}
+
+v4si
+__attribute__((noipa))
+foo8 (v4si a, v4si b)
+{
+  return __builtin_shufflevector (a, b, 2, 4, 6, 3);
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/pr53346-2.c b/gcc/testsuite/gcc.target/i386/pr53346-2.c
new file mode 100644
index 00000000000..0c6c7b35e01
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr53346-2.c
@@ -0,0 +1,59 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-require-effective-target sse2 } */
+
+#include "sse2-check.h"
+#include "pr53346-1.c"
+
+static void
+sse2_test ()
+{
+  v4si a = __extension__(v4si) { 0, 1, 2, 3 };
+  v4si b = __extension__(v4si) { 4, 5, 6, 7 };
+  v4si exp = __extension__(v4si) { 1, 2, 5, 3 };
+  v4si dest;
+  dest = foo (a, b);
+  if (__builtin_memcmp (&dest, &exp, 16))
+    __builtin_abort ();
+
+  exp = __extension__ (v4si) { 1, 5, 2, 3 };
+  dest = foo1 (a, b);
+  if (__builtin_memcmp (&dest, &exp, 16))
+    __builtin_abort ();
+
+  exp = __extension__ (v4si) { 1, 2, 3, 5 };
+  dest = foo2 (a, b);
+  if (__builtin_memcmp (&dest, &exp, 16))
+    __builtin_abort ();
+
+  exp = __extension__ (v4si) { 1, 4, 5, 6 };
+  dest = foo3 (a, b);
+  if (__builtin_memcmp (&dest, &exp, 16))
+    __builtin_abort ();
+
+  exp = __extension__ (v4si) { 3, 6, 7, 5 };
+  dest = foo4 (a, b);
+  if (__builtin_memcmp (&dest, &exp, 16))
+    __builtin_abort ();
+
+  exp = __extension__ (v4si) { 2, 4, 7, 6 };
+  dest = foo5 (a, b);
+  if (__builtin_memcmp (&dest, &exp, 16))
+    __builtin_abort ();
+
+  exp = __extension__ (v4si) { 2, 4, 3, 6 };
+  dest = foo6 (a, b);
+  if (__builtin_memcmp (&dest, &exp, 16))
+    __builtin_abort ();
+
+  exp = __extension__ (v4si) { 2, 3, 4, 6 };
+  dest = foo7 (a, b);
+  if (__builtin_memcmp (&dest, &exp, 16))
+    __builtin_abort ();
+
+  exp = __extension__ (v4si) { 2, 4, 6, 3 };
+  dest = foo8 (a, b);
+  if (__builtin_memcmp (&dest, &exp, 16))
+    __builtin_abort ();
+
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr53346-3.c b/gcc/testsuite/gcc.target/i386/pr53346-3.c
new file mode 100644
index 00000000000..0b204f6f210
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr53346-3.c
@@ -0,0 +1,69 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -O2 -mno-sse3" } */
+/* { dg-final { scan-assembler-times "shufps" 17 } } */
+
+typedef float v4sf __attribute__((vector_size(16)));
+
+v4sf
+__attribute__((noipa))
+foo (v4sf a, v4sf b)
+{
+  return __builtin_shufflevector (a, b, 1, 2, 5, 3);
+}
+
+v4sf
+__attribute__((noipa))
+foo1 (v4sf a, v4sf b)
+{
+  return __builtin_shufflevector (a, b, 1, 5, 2, 3);
+}
+
+v4sf
+__attribute__((noipa))
+foo2 (v4sf a, v4sf b)
+{
+  return __builtin_shufflevector (a, b, 1, 2, 3, 5);
+}
+
+v4sf
+__attribute__((noipa))
+foo3 (v4sf a, v4sf b)
+{
+  return __builtin_shufflevector (a, b, 1, 4, 5, 6);
+}
+
+v4sf
+__attribute__((noipa))
+foo4 (v4sf a, v4sf b)
+{
+  return __builtin_shufflevector (a, b, 3, 6, 7, 5);
+}
+
+v4sf
+__attribute__((noipa))
+foo5 (v4sf a, v4sf b)
+{
+  return __builtin_shufflevector (a, b, 2, 4, 7, 6);
+}
+
+v4sf
+__attribute__((noipa))
+foo6 (v4sf a, v4sf b)
+{
+  return __builtin_shufflevector (a, b, 2, 4, 3, 6);
+}
+
+v4sf
+__attribute__((noipa))
+foo7 (v4sf a, v4sf b)
+{
+  return __builtin_shufflevector (a, b, 2, 3, 4, 6);
+}
+
+v4sf
+__attribute__((noipa))
+foo8 (v4sf a, v4sf b)
+{
+  return __builtin_shufflevector (a, b, 2, 4, 6, 3);
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/pr53346-4.c b/gcc/testsuite/gcc.target/i386/pr53346-4.c
new file mode 100644
index 00000000000..9e4e45bd584
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr53346-4.c
@@ -0,0 +1,59 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-require-effective-target sse2 } */
+
+#include "sse2-check.h"
+#include "pr53346-3.c"
+
+static void
+sse2_test ()
+{
+  v4sf a = __extension__(v4sf) { 0, 1, 2, 3 };
+  v4sf b = __extension__(v4sf) { 4, 5, 6, 7 };
+  v4sf exp = __extension__(v4sf) { 1, 2, 5, 3 };
+  v4sf dest;
+  dest = foo (a, b);
+  if (__builtin_memcmp (&dest, &exp, 16))
+    __builtin_abort ();
+
+  exp = __extension__ (v4sf) { 1, 5, 2, 3 };
+  dest = foo1 (a, b);
+  if (__builtin_memcmp (&dest, &exp, 16))
+    __builtin_abort ();
+
+  exp = __extension__ (v4sf) { 1, 2, 3, 5 };
+  dest = foo2 (a, b);
+  if (__builtin_memcmp (&dest, &exp, 16))
+    __builtin_abort ();
+
+  exp = __extension__ (v4sf) { 1, 4, 5, 6 };
+  dest = foo3 (a, b);
+  if (__builtin_memcmp (&dest, &exp, 16))
+    __builtin_abort ();
+
+  exp = __extension__ (v4sf) { 3, 6, 7, 5 };
+  dest = foo4 (a, b);
+  if (__builtin_memcmp (&dest, &exp, 16))
+    __builtin_abort ();
+
+  exp = __extension__ (v4sf) { 2, 4, 7, 6 };
+  dest = foo5 (a, b);
+  if (__builtin_memcmp (&dest, &exp, 16))
+    __builtin_abort ();
+
+  exp = __extension__ (v4sf) { 2, 4, 3, 6 };
+  dest = foo6 (a, b);
+  if (__builtin_memcmp (&dest, &exp, 16))
+    __builtin_abort ();
+
+  exp = __extension__ (v4sf) { 2, 3, 4, 6 };
+  dest = foo7 (a, b);
+  if (__builtin_memcmp (&dest, &exp, 16))
+    __builtin_abort ();
+
+  exp = __extension__ (v4sf) { 2, 4, 6, 3 };
+  dest = foo8 (a, b);
+  if (__builtin_memcmp (&dest, &exp, 16))
+    __builtin_abort ();
+
+}
-- 
2.18.1


      reply	other threads:[~2022-09-26  3:16 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-09-23  6:42 liuhongt
2022-09-23  6:53 ` Jakub Jelinek
2022-09-26  3:14   ` liuhongt [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220926031434.47605-1-hongtao.liu@intel.com \
    --to=hongtao.liu@intel.com \
    --cc=crazylht@gmail.com \
    --cc=gcc-patches@gcc.gnu.org \
    --cc=hjl.tools@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).