public inbox for gcc-cvs@sourceware.org help / color / mirror / Atom feed
From: hongtao Liu <liuhongt@gcc.gnu.org> To: gcc-cvs@gcc.gnu.org Subject: [gcc r13-2843] Support 2-instruction vector shuffle for V4SI/V4SF in ix86_expand_vec_perm_const_1. Date: Mon, 26 Sep 2022 03:22:06 +0000 (GMT) [thread overview] Message-ID: <20220926032206.305E83858CDA@sourceware.org> (raw) https://gcc.gnu.org/g:3db8e9c2422d924a958336fd0871b24cce3e65d1 commit r13-2843-g3db8e9c2422d924a958336fd0871b24cce3e65d1 Author: liuhongt <hongtao.liu@intel.com> Date: Wed Sep 21 14:56:08 2022 +0800 Support 2-instruction vector shuffle for V4SI/V4SF in ix86_expand_vec_perm_const_1. 2022-09-23 Hongtao Liu <hongtao.liu@intel.com> Liwei Xu <liwei.xu@intel.com> gcc/ChangeLog: PR target/53346 * config/i386/i386-expand.cc (expand_vec_perm_shufps_shufps): New function. (ix86_expand_vec_perm_const_1): Insert expand_vec_perm_shufps_shufps at the end of 2-instruction expand sequence. gcc/testsuite/ChangeLog: * gcc.target/i386/pr53346-1.c: New test. * gcc.target/i386/pr53346-2.c: New test. * gcc.target/i386/pr53346-3.c: New test. * gcc.target/i386/pr53346-4.c: New test. Diff: --- gcc/config/i386/i386-expand.cc | 116 ++++++++++++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr53346-1.c | 70 ++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr53346-2.c | 59 +++++++++++++++ gcc/testsuite/gcc.target/i386/pr53346-3.c | 69 ++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr53346-4.c | 59 +++++++++++++++ 5 files changed, 373 insertions(+) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 5334363e235..6baff6d0e61 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -19604,6 +19604,119 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) return false; } +/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D + in terms of a pair of shufps+ shufps/pshufd instructions. 
*/ +static bool +expand_vec_perm_shufps_shufps (struct expand_vec_perm_d *d) +{ + unsigned char perm1[4]; + machine_mode vmode = d->vmode; + bool ok; + unsigned i, j, k, count = 0; + + if (d->one_operand_p + || (vmode != V4SImode && vmode != V4SFmode)) + return false; + + if (d->testing_p) + return true; + + for (i = 0; i < 4; ++i) + count += d->perm[i] > 3 ? 1 : 0; + + gcc_assert (count & 3); + + rtx tmp = gen_reg_rtx (vmode); + /* 2 from op0 and 2 from op1. */ + if (count == 2) + { + unsigned char perm2[4]; + for (i = 0, j = 0, k = 2; i < 4; ++i) + if (d->perm[i] & 4) + { + perm1[k++] = d->perm[i]; + perm2[i] = k - 1; + } + else + { + perm1[j++] = d->perm[i]; + perm2[i] = j - 1; + } + + /* shufps. */ + ok = expand_vselect_vconcat (tmp, d->op0, d->op1, + perm1, d->nelt, false); + gcc_assert (ok); + if (vmode == V4SImode && TARGET_SSE2) + /* pshufd. */ + ok = expand_vselect (d->target, tmp, + perm2, d->nelt, false); + else + { + /* shufps. */ + perm2[2] += 4; + perm2[3] += 4; + ok = expand_vselect_vconcat (d->target, tmp, tmp, + perm2, d->nelt, false); + } + gcc_assert (ok); + } + /* 3 from one op and 1 from another. */ + else + { + unsigned pair_idx = 8, lone_idx = 8, shift; + + /* Find the lone index. */ + for (i = 0; i < 4; ++i) + if ((d->perm[i] > 3 && count == 1) + || (d->perm[i] < 4 && count == 3)) + lone_idx = i; + + /* When lone_idx is not 0, it must from second op(count == 1). */ + gcc_assert (count == (lone_idx ? 1 : 3)); + + /* Find the pair index that sits in the same half as the lone index. */ + shift = lone_idx & 2; + pair_idx = 1 - lone_idx + 2 * shift; + + /* First permutate lone index and pair index into the same vector as + [ lone, lone, pair, pair ]. */ + perm1[1] = perm1[0] + = (count == 3) ? d->perm[lone_idx] : d->perm[lone_idx] - 4; + perm1[3] = perm1[2] + = (count == 3) ? d->perm[pair_idx] : d->perm[pair_idx] + 4; + + /* Alway put the vector contains lone indx at the first. */ + if (count == 1) + std::swap (d->op0, d->op1); + + /* shufps. 
*/ + ok = expand_vselect_vconcat (tmp, d->op0, d->op1, + perm1, d->nelt, false); + gcc_assert (ok); + + /* Refine lone and pair index to original order. */ + perm1[shift] = lone_idx << 1; + perm1[shift + 1] = pair_idx << 1; + + /* Select the remaining 2 elements in another vector. */ + for (i = 2 - shift; i < 4 - shift; ++i) + perm1[i] = lone_idx == 1 ? d->perm[i] + 4 : d->perm[i]; + + /* Adjust to original selector. */ + if (lone_idx > 1) + std::swap (tmp, d->op1); + + /* shufps. */ + ok = expand_vselect_vconcat (d->target, tmp, d->op1, + perm1, d->nelt, false); + + gcc_assert (ok); + } + + return true; +} + /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D in terms of a pair of pshuflw + pshufhw instructions. */ @@ -22152,6 +22265,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) if (expand_vec_perm_2perm_pblendv (d, true)) return true; + if (expand_vec_perm_shufps_shufps (d)) + return true; + /* Try sequences of three instructions. */ if (expand_vec_perm_even_odd_pack (d)) diff --git a/gcc/testsuite/gcc.target/i386/pr53346-1.c b/gcc/testsuite/gcc.target/i386/pr53346-1.c new file mode 100644 index 00000000000..6d230da632c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr53346-1.c @@ -0,0 +1,70 @@ +/* { dg-do compile } */ +/* { dg-options "-msse2 -O2 -mno-sse3" } */ +/* { dg-final { scan-assembler-times "shufps" 15 } } */ +/* { dg-final { scan-assembler-times "pshufd" 2 } } */ + +typedef int v4si __attribute__((vector_size(16))); + +v4si +__attribute__((noipa)) +foo (v4si a, v4si b) +{ + return __builtin_shufflevector (a, b, 1, 2, 5, 3); +} + +v4si +__attribute__((noipa)) +foo1 (v4si a, v4si b) +{ + return __builtin_shufflevector (a, b, 1, 5, 2, 3); +} + +v4si +__attribute__((noipa)) +foo2 (v4si a, v4si b) +{ + return __builtin_shufflevector (a, b, 1, 2, 3, 5); +} + +v4si +__attribute__((noipa)) +foo3 (v4si a, v4si b) +{ + return __builtin_shufflevector (a, b, 1, 4, 5, 6); +} + +v4si +__attribute__((noipa)) +foo4 (v4si a, v4si b) 
+{ + return __builtin_shufflevector (a, b, 3, 6, 7, 5); +} + +v4si +__attribute__((noipa)) +foo5 (v4si a, v4si b) +{ + return __builtin_shufflevector (a, b, 2, 4, 7, 6); +} + +v4si +__attribute__((noipa)) +foo6 (v4si a, v4si b) +{ + return __builtin_shufflevector (a, b, 2, 4, 3, 6); +} + +v4si +__attribute__((noipa)) +foo7 (v4si a, v4si b) +{ + return __builtin_shufflevector (a, b, 2, 3, 4, 6); +} + +v4si +__attribute__((noipa)) +foo8 (v4si a, v4si b) +{ + return __builtin_shufflevector (a, b, 2, 4, 6, 3); +} + diff --git a/gcc/testsuite/gcc.target/i386/pr53346-2.c b/gcc/testsuite/gcc.target/i386/pr53346-2.c new file mode 100644 index 00000000000..0c6c7b35e01 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr53346-2.c @@ -0,0 +1,59 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -msse2" } */ +/* { dg-require-effective-target sse2 } */ + +#include "sse2-check.h" +#include "pr53346-1.c" + +static void +sse2_test () +{ + v4si a = __extension__(v4si) { 0, 1, 2, 3 }; + v4si b = __extension__(v4si) { 4, 5, 6, 7 }; + v4si exp = __extension__(v4si) { 1, 2, 5, 3 }; + v4si dest; + dest = foo (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4si) { 1, 5, 2, 3 }; + dest = foo1 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4si) { 1, 2, 3, 5 }; + dest = foo2 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4si) { 1, 4, 5, 6 }; + dest = foo3 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4si) { 3, 6, 7, 5 }; + dest = foo4 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4si) { 2, 4, 7, 6 }; + dest = foo5 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4si) { 2, 4, 3, 6 }; + dest = foo6 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4si) 
{ 2, 3, 4, 6 }; + dest = foo7 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4si) { 2, 4, 6, 3 }; + dest = foo8 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + +} diff --git a/gcc/testsuite/gcc.target/i386/pr53346-3.c b/gcc/testsuite/gcc.target/i386/pr53346-3.c new file mode 100644 index 00000000000..0b204f6f210 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr53346-3.c @@ -0,0 +1,69 @@ +/* { dg-do compile } */ +/* { dg-options "-msse2 -O2 -mno-sse3" } */ +/* { dg-final { scan-assembler-times "shufps" 17 } } */ + +typedef float v4sf __attribute__((vector_size(16))); + +v4sf +__attribute__((noipa)) +foo (v4sf a, v4sf b) +{ + return __builtin_shufflevector (a, b, 1, 2, 5, 3); +} + +v4sf +__attribute__((noipa)) +foo1 (v4sf a, v4sf b) +{ + return __builtin_shufflevector (a, b, 1, 5, 2, 3); +} + +v4sf +__attribute__((noipa)) +foo2 (v4sf a, v4sf b) +{ + return __builtin_shufflevector (a, b, 1, 2, 3, 5); +} + +v4sf +__attribute__((noipa)) +foo3 (v4sf a, v4sf b) +{ + return __builtin_shufflevector (a, b, 1, 4, 5, 6); +} + +v4sf +__attribute__((noipa)) +foo4 (v4sf a, v4sf b) +{ + return __builtin_shufflevector (a, b, 3, 6, 7, 5); +} + +v4sf +__attribute__((noipa)) +foo5 (v4sf a, v4sf b) +{ + return __builtin_shufflevector (a, b, 2, 4, 7, 6); +} + +v4sf +__attribute__((noipa)) +foo6 (v4sf a, v4sf b) +{ + return __builtin_shufflevector (a, b, 2, 4, 3, 6); +} + +v4sf +__attribute__((noipa)) +foo7 (v4sf a, v4sf b) +{ + return __builtin_shufflevector (a, b, 2, 3, 4, 6); +} + +v4sf +__attribute__((noipa)) +foo8 (v4sf a, v4sf b) +{ + return __builtin_shufflevector (a, b, 2, 4, 6, 3); +} + diff --git a/gcc/testsuite/gcc.target/i386/pr53346-4.c b/gcc/testsuite/gcc.target/i386/pr53346-4.c new file mode 100644 index 00000000000..9e4e45bd584 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr53346-4.c @@ -0,0 +1,59 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -msse2" } */ +/* { dg-require-effective-target 
sse2 } */ + +#include "sse2-check.h" +#include "pr53346-3.c" + +static void +sse2_test () +{ + v4sf a = __extension__(v4sf) { 0, 1, 2, 3 }; + v4sf b = __extension__(v4sf) { 4, 5, 6, 7 }; + v4sf exp = __extension__(v4sf) { 1, 2, 5, 3 }; + v4sf dest; + dest = foo (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4sf) { 1, 5, 2, 3 }; + dest = foo1 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4sf) { 1, 2, 3, 5 }; + dest = foo2 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4sf) { 1, 4, 5, 6 }; + dest = foo3 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4sf) { 3, 6, 7, 5 }; + dest = foo4 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4sf) { 2, 4, 7, 6 }; + dest = foo5 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4sf) { 2, 4, 3, 6 }; + dest = foo6 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4sf) { 2, 3, 4, 6 }; + dest = foo7 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4sf) { 2, 4, 6, 3 }; + dest = foo8 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + +}
reply other threads:[~2022-09-26 3:22 UTC|newest] Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20220926032206.305E83858CDA@sourceware.org \ --to=liuhongt@gcc.gnu.org \ --cc=gcc-cvs@gcc.gnu.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).