[PATCH] [x86] Optimize 64-bit vector permutation with punpcklqdq + 128-bit vector pshuf.

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH] [x86] Optimize 64-bit vector permutation with punpcklqdq + 128-bit vector pshuf.
@ 2024-04-28  6:03 liuhongt
  0 siblings, 0 replies; only message in thread
From: liuhongt @ 2024-04-28  6:03 UTC (permalink / raw)
  To: gcc-patches; +Cc: crazylht, hjl.tools

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
Ready push to trunk.

gcc/ChangeLog:

	PR target/113090
	* config/i386/i386-expand.cc
	(expand_vec_perm_punpckldq_pshuf): New function.
	(ix86_expand_vec_perm_const_1): Try
	expand_vec_perm_punpckldq_pshuf for sequence of 2
	instructions.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr113090.c: New test.
---
 gcc/config/i386/i386-expand.cc           | 71 ++++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr113090.c | 25 +++++++++
 2 files changed, 96 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr113090.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 8bb8f21e686..fd49d866004 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -20813,6 +20813,74 @@ expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Try to permute 2 64-bit vectors by punpckldq + 128-bit vector shuffle.  */
+static bool
+expand_vec_perm_punpckldq_pshuf (struct expand_vec_perm_d *d)
+{
+  if (GET_MODE_BITSIZE (d->vmode) != 64
+      || !TARGET_MMX_WITH_SSE
+      || d->one_operand_p)
+    return false;
+
+  machine_mode widen_vmode;
+  switch (d->vmode)
+    {
+    /* pshufd.  */
+    case E_V2SImode:
+      widen_vmode = V4SImode;
+      break;
+
+    /* pshufd.  */
+    case E_V2SFmode:
+      widen_vmode = V4SFmode;
+      break;
+
+    case E_V4HImode:
+      widen_vmode = V8HImode;
+      /* pshufb.  */
+      if (!TARGET_SSSE3)
+	return false;
+      break;
+
+    case E_V8QImode:
+      /* pshufb.  */
+      widen_vmode = V16QImode;
+      if (!TARGET_SSSE3)
+	return false;
+      break;
+
+    default:
+      return false;
+    }
+
+  if (d->testing_p)
+    return true;
+
+  struct expand_vec_perm_d dperm;
+  dperm.target = gen_reg_rtx (widen_vmode);
+  rtx op0 = gen_reg_rtx (widen_vmode);
+  emit_move_insn (op0, gen_rtx_VEC_CONCAT (widen_vmode, d->op0, d->op1));
+  dperm.op0 = op0;
+  dperm.op1 = op0;
+  dperm.vmode = widen_vmode;
+  unsigned nelt = GET_MODE_NUNITS (widen_vmode);
+  dperm.nelt = nelt;
+  dperm.one_operand_p = true;
+  dperm.testing_p = false;
+
+  for (unsigned i = 0; i != nelt / 2; i++)
+    {
+      dperm.perm[i] = d->perm[i];
+      dperm.perm[i + nelt / 2] = d->perm[i];
+    }
+
+  gcc_assert (expand_vec_perm_1 (&dperm));
+  emit_move_insn (d->target, lowpart_subreg (d->vmode,
+					     dperm.target,
+					     dperm.vmode));
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
    the permutation using the SSSE3 palignr instruction.  This succeeds
    when all of the elements in PERM fit within one vector and we merely
@@ -23325,6 +23393,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
   if (expand_vec_perm_shufps_shufps (d))
     return true;
 
+  if (expand_vec_perm_punpckldq_pshuf (d))
+    return true;
+
   /* Try sequences of three instructions.  */
 
   if (expand_vec_perm_even_odd_pack (d))
diff --git a/gcc/testsuite/gcc.target/i386/pr113090.c b/gcc/testsuite/gcc.target/i386/pr113090.c
new file mode 100644
index 00000000000..0f0b7cc0084
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr113090.c
@@ -0,0 +1,25 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -msse4.1" } */
+/* { dg-final { scan-assembler-times "pshufd" 3 } } */
+
+typedef int v2si __attribute__((vector_size(8)));
+typedef short v4hi __attribute__((vector_size(8)));
+typedef char v8qi __attribute__((vector_size(8)));
+
+v2si
+foo (v2si a, v2si b)
+{
+    return __builtin_shufflevector (a, b, 1, 2);
+}
+
+v4hi
+foo1 (v4hi a, v4hi b)
+{
+  return __builtin_shufflevector (a, b, 2, 3, 4, 5);
+}
+
+v8qi
+foo2 (v8qi a, v8qi b)
+{
+  return __builtin_shufflevector (a, b, 4, 5, 6, 7, 8, 9, 10, 11);
+}
-- 
2.31.1


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2024-04-28  6:05 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-04-28  6:03 [PATCH] [x86] Optimize 64-bit vector permutation with punpcklqdq + 128-bit vector pshuf liuhongt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).