* [PATCH] [x86] Optimize 64-bit vector permutation with punpcklqdq + 128-bit vector pshuf.
@ 2024-04-28 6:03 liuhongt
0 siblings, 0 replies; only message in thread
From: liuhongt @ 2024-04-28 6:03 UTC (permalink / raw)
To: gcc-patches; +Cc: crazylht, hjl.tools
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
Ready to push to trunk.
gcc/ChangeLog:
PR target/113090
* config/i386/i386-expand.cc
(expand_vec_perm_punpckldq_pshuf): New function.
(ix86_expand_vec_perm_const_1): Try
expand_vec_perm_punpckldq_pshuf for sequence of 2
instructions.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr113090.c: New test.
---
gcc/config/i386/i386-expand.cc | 71 ++++++++++++++++++++++++
gcc/testsuite/gcc.target/i386/pr113090.c | 25 +++++++++
2 files changed, 96 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/i386/pr113090.c
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 8bb8f21e686..fd49d866004 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -20813,6 +20813,74 @@ expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
return true;
}
+/* Try to permute two 64-bit vectors by concatenating them (punpcklqdq) and
+   shuffling the 128-bit result.  Return true if D is handled this way.  */
+static bool
+expand_vec_perm_punpckldq_pshuf (struct expand_vec_perm_d *d)
+{
+  if (GET_MODE_BITSIZE (d->vmode) != 64
+      || !TARGET_MMX_WITH_SSE
+      || d->one_operand_p)
+    return false;
+
+  machine_mode widen_vmode;
+  switch (d->vmode)
+    {
+    /* pshufd.  */
+    case E_V2SImode:
+      widen_vmode = V4SImode;
+      break;
+    /* pshufd.  */
+    case E_V2SFmode:
+      widen_vmode = V4SFmode;
+      break;
+    /* pshufb.  */
+    case E_V4HImode:
+      widen_vmode = V8HImode;
+      if (!TARGET_SSSE3)
+        return false;
+      break;
+    /* pshufb.  */
+    case E_V8QImode:
+      widen_vmode = V16QImode;
+      if (!TARGET_SSSE3)
+        return false;
+      break;
+    default:
+      return false;
+    }
+
+  if (d->testing_p)
+    return true;
+
+  struct expand_vec_perm_d dperm;
+  dperm.target = gen_reg_rtx (widen_vmode);
+  rtx op0 = gen_reg_rtx (widen_vmode);
+  emit_move_insn (op0, gen_rtx_VEC_CONCAT (widen_vmode, d->op0, d->op1));
+  dperm.op0 = op0;
+  dperm.op1 = op0;
+  dperm.vmode = widen_vmode;
+  unsigned nelt = GET_MODE_NUNITS (widen_vmode);
+  dperm.nelt = nelt;
+  dperm.one_operand_p = true;
+  dperm.testing_p = false;
+
+  /* D's indices already address the concatenation of op0 and op1; the upper
+     half of the wide result is discarded, so just repeat the selection.  */
+  for (unsigned i = 0; i != nelt / 2; i++)
+    {
+      dperm.perm[i] = d->perm[i];
+      dperm.perm[i + nelt / 2] = d->perm[i];
+    }
+
+  /* Call outside gcc_assert so it runs even when checking is disabled.  */
+  bool ok = expand_vec_perm_1 (&dperm);
+  gcc_assert (ok);
+  emit_move_insn (d->target,
+                  lowpart_subreg (d->vmode, dperm.target, dperm.vmode));
+  return true;
+}
+
/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
the permutation using the SSSE3 palignr instruction. This succeeds
when all of the elements in PERM fit within one vector and we merely
@@ -23325,6 +23393,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_shufps_shufps (d))
return true;
+ if (expand_vec_perm_punpckldq_pshuf (d))
+ return true;
+
/* Try sequences of three instructions. */
if (expand_vec_perm_even_odd_pack (d))
diff --git a/gcc/testsuite/gcc.target/i386/pr113090.c b/gcc/testsuite/gcc.target/i386/pr113090.c
new file mode 100644
index 00000000000..0f0b7cc0084
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr113090.c
@@ -0,0 +1,25 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -msse4.1" } */
+/* { dg-final { scan-assembler-times "pshufd" 3 } } */
+
+typedef int v2si __attribute__((vector_size(8)));
+typedef short v4hi __attribute__((vector_size(8)));
+typedef char v8qi __attribute__((vector_size(8)));
+
+v2si
+foo (v2si a, v2si b)
+{
+ return __builtin_shufflevector (a, b, 1, 2); /* Selects { a[1], b[0] }.  */
+}
+
+v4hi
+foo1 (v4hi a, v4hi b)
+{
+ return __builtin_shufflevector (a, b, 2, 3, 4, 5); /* { a[2..3], b[0..1] }.  */
+}
+
+v8qi
+foo2 (v8qi a, v8qi b)
+{
+ return __builtin_shufflevector (a, b, 4, 5, 6, 7, 8, 9, 10, 11); /* { a[4..7], b[0..3] }.  */
+}
--
2.31.1
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2024-04-28 6:05 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-04-28 6:03 [PATCH] [x86] Optimize 64-bit vector permutation with punpcklqdq + 128-bit vector pshuf liuhongt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).