From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 2078) id 305E83858CDA; Mon, 26 Sep 2022 03:22:06 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 305E83858CDA DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1664162526; bh=yzSBRSoKtiTKzFldD7Nlvak0fVPimbeFAXLw0hBsJZw=; h=From:To:Subject:Date:From; b=X7fTPq4X42NRIM6tovBavizPipUPK45//Z/y+Y9IaIBP0ghjn5KeZHPSBZ/Z/dChH keFw6XZUMAnMhhiZQcm48AQYbTXD0pwb5W+dIahmhTVUxsm0Hfu6eyL8BaGG7MlJzB 02nwOuenrvH3uqTR6ibo0YiRivlxNpJ5O8ga8qNI= MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Content-Type: text/plain; charset="utf-8" From: hongtao Liu To: gcc-cvs@gcc.gnu.org Subject: [gcc r13-2843] Support 2-instruction vector shuffle for V4SI/V4SF in ix86_expand_vec_perm_const_1. X-Act-Checkin: gcc X-Git-Author: liuhongt X-Git-Refname: refs/heads/master X-Git-Oldrev: de613c6295ea50d75167eaf89f41074a69298108 X-Git-Newrev: 3db8e9c2422d924a958336fd0871b24cce3e65d1 Message-Id: <20220926032206.305E83858CDA@sourceware.org> Date: Mon, 26 Sep 2022 03:22:06 +0000 (GMT) List-Id: https://gcc.gnu.org/g:3db8e9c2422d924a958336fd0871b24cce3e65d1 commit r13-2843-g3db8e9c2422d924a958336fd0871b24cce3e65d1 Author: liuhongt Date: Wed Sep 21 14:56:08 2022 +0800 Support 2-instruction vector shuffle for V4SI/V4SF in ix86_expand_vec_perm_const_1. 2022-09-23 Hongtao Liu Liwei Xu gcc/ChangeLog: PR target/53346 * config/i386/i386-expand.cc (expand_vec_perm_shufps_shufps): New function. (ix86_expand_vec_perm_const_1): Insert expand_vec_perm_shufps_shufps at the end of 2-instruction expand sequence. gcc/testsuite/ChangeLog: * gcc.target/i386/pr53346-1.c: New test. * gcc.target/i386/pr53346-2.c: New test. * gcc.target/i386/pr53346-3.c: New test. * gcc.target/i386/pr53346-4.c: New test. 
Diff: --- gcc/config/i386/i386-expand.cc | 116 ++++++++++++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr53346-1.c | 70 ++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr53346-2.c | 59 +++++++++++++++ gcc/testsuite/gcc.target/i386/pr53346-3.c | 69 ++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr53346-4.c | 59 +++++++++++++++ 5 files changed, 373 insertions(+) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 5334363e235..6baff6d0e61 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -19604,6 +19604,119 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) return false; } +/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D + in terms of a shufps followed by a second shufps (or, in the 2+2 case for V4SImode with SSE2, by a pshufd). Returns false for one-operand permutations and for modes other than V4SImode/V4SFmode; when d->testing_p is set it returns true before any expansion is attempted. The 3+1 path may overwrite d->op0 and d->op1. */ +static bool +expand_vec_perm_shufps_shufps (struct expand_vec_perm_d *d) +{ + unsigned char perm1[4]; + machine_mode vmode = d->vmode; + bool ok; + unsigned i, j, k, count = 0; + + if (d->one_operand_p + || (vmode != V4SImode && vmode != V4SFmode)) + return false; + + if (d->testing_p) + return true; + + for (i = 0; i < 4; ++i) + count += d->perm[i] > 3 ? 1 : 0; + + gcc_assert (count & 3); + + rtx tmp = gen_reg_rtx (vmode); + /* Two elements from op0 and two from op1: perm1 gathers the op0 picks into slots 0-1 and the op1 picks into slots 2-3 of tmp, while perm2 records the slot of tmp that each result element ended up in. */ + if (count == 2) + { + unsigned char perm2[4]; + for (i = 0, j = 0, k = 2; i < 4; ++i) + if (d->perm[i] & 4) + { + perm1[k++] = d->perm[i]; + perm2[i] = k - 1; + } + else + { + perm1[j++] = d->perm[i]; + perm2[i] = j - 1; + } + + /* shufps. */ + ok = expand_vselect_vconcat (tmp, d->op0, d->op1, + perm1, d->nelt, false); + gcc_assert (ok); + if (vmode == V4SImode && TARGET_SSE2) + /* pshufd. */ + ok = expand_vselect (d->target, tmp, + perm2, d->nelt, false); + else + { + /* shufps of tmp with itself; the upper two selectors are biased by 4 so that they refer to the second copy of tmp. */ + perm2[2] += 4; + perm2[3] += 4; + ok = expand_vselect_vconcat (d->target, tmp, tmp, + perm2, d->nelt, false); + } + gcc_assert (ok); + } + /* 3 from one op and 1 from another. 
*/ + else + { + unsigned pair_idx = 8, lone_idx = 8, shift; + + /* Find the position of the lone element, i.e. the only one taken from its operand. */ + for (i = 0; i < 4; ++i) + if ((d->perm[i] > 3 && count == 1) + || (d->perm[i] < 4 && count == 3)) + lone_idx = i; + + /* When lone_idx is not 0, the lone element must come from the second operand (count == 1); in other words d->perm[0] has to select from op0. NOTE(review): nothing in this function establishes that; presumably callers canonicalize D first. Confirm, otherwise this assert can fire. */ + gcc_assert (count == (lone_idx ? 1 : 3)); + + /* Find the pair index that sits in the same half as the lone index. */ + shift = lone_idx & 2; + pair_idx = 1 - lone_idx + 2 * shift; + + /* First permute the lone and pair elements into one vector as + [ lone, lone, pair, pair ]. */ + perm1[1] = perm1[0] + = (count == 3) ? d->perm[lone_idx] : d->perm[lone_idx] - 4; + perm1[3] = perm1[2] + = (count == 3) ? d->perm[pair_idx] : d->perm[pair_idx] + 4; + + /* Always put the operand that contains the lone element first. */ + if (count == 1) + std::swap (d->op0, d->op1); + + /* shufps. */ + ok = expand_vselect_vconcat (tmp, d->op0, d->op1, + perm1, d->nelt, false); + gcc_assert (ok); + + /* Selectors that move the lone and pair elements from tmp back to their original positions. */ + perm1[shift] = lone_idx << 1; + perm1[shift + 1] = pair_idx << 1; + + /* Select the remaining two elements from the other operand. */ + for (i = 2 - shift; i < 4 - shift; ++i) + perm1[i] = lone_idx == 1 ? d->perm[i] + 4 : d->perm[i]; + + /* When the lone element belongs in the upper half, tmp has to be the second input of the final shufps, so exchange it with d->op1. */ + if (lone_idx > 1) + std::swap (tmp, d->op1); + + /* shufps. */ + ok = expand_vselect_vconcat (d->target, tmp, d->op1, + perm1, d->nelt, false); + + gcc_assert (ok); + } + + return true; +} + /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D in terms of a pair of pshuflw + pshufhw instructions. */ @@ -22152,6 +22265,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) if (expand_vec_perm_2perm_pblendv (d, true)) return true; + if (expand_vec_perm_shufps_shufps (d)) + return true; + /* Try sequences of three instructions. 
*/ + if (expand_vec_perm_even_odd_pack (d)) diff --git a/gcc/testsuite/gcc.target/i386/pr53346-1.c b/gcc/testsuite/gcc.target/i386/pr53346-1.c new file mode 100644 index 00000000000..6d230da632c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr53346-1.c @@ -0,0 +1,70 @@ +/* { dg-do compile } */ +/* { dg-options "-msse2 -O2 -mno-sse3" } */ +/* { dg-final { scan-assembler-times "shufps" 15 } } */ +/* { dg-final { scan-assembler-times "pshufd" 2 } } */ + +typedef int v4si __attribute__((vector_size(16))); + +v4si +__attribute__((noipa)) +foo (v4si a, v4si b) +{ + return __builtin_shufflevector (a, b, 1, 2, 5, 3); /* 3 from a, 1 from b. */ +} + +v4si +__attribute__((noipa)) +foo1 (v4si a, v4si b) +{ + return __builtin_shufflevector (a, b, 1, 5, 2, 3); /* 3 from a, 1 from b. */ +} + +v4si +__attribute__((noipa)) +foo2 (v4si a, v4si b) +{ + return __builtin_shufflevector (a, b, 1, 2, 3, 5); /* 3 from a, 1 from b. */ +} + +v4si +__attribute__((noipa)) +foo3 (v4si a, v4si b) +{ + return __builtin_shufflevector (a, b, 1, 4, 5, 6); /* 1 from a, 3 from b. */ +} + +v4si +__attribute__((noipa)) +foo4 (v4si a, v4si b) +{ + return __builtin_shufflevector (a, b, 3, 6, 7, 5); /* 1 from a, 3 from b. */ +} + +v4si +__attribute__((noipa)) +foo5 (v4si a, v4si b) +{ + return __builtin_shufflevector (a, b, 2, 4, 7, 6); /* 1 from a, 3 from b. */ +} + +v4si +__attribute__((noipa)) +foo6 (v4si a, v4si b) +{ + return __builtin_shufflevector (a, b, 2, 4, 3, 6); /* 2 from a, 2 from b. */ +} + +v4si +__attribute__((noipa)) +foo7 (v4si a, v4si b) +{ + return __builtin_shufflevector (a, b, 2, 3, 4, 6); /* 2 from a, 2 from b. */ +} + +v4si +__attribute__((noipa)) +foo8 (v4si a, v4si b) +{ + return __builtin_shufflevector (a, b, 2, 4, 6, 3); /* 2 from a, 2 from b. */ +} + diff --git a/gcc/testsuite/gcc.target/i386/pr53346-2.c b/gcc/testsuite/gcc.target/i386/pr53346-2.c new file mode 100644 index 00000000000..0c6c7b35e01 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr53346-2.c @@ -0,0 +1,59 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -msse2" } */ +/* { dg-require-effective-target sse2 } */ + +#include "sse2-check.h" +#include "pr53346-1.c" + +static void +sse2_test () +{ + v4si a = __extension__(v4si) { 0, 1, 2, 3 }; + 
v4si b = __extension__(v4si) { 4, 5, 6, 7 }; /* a and b hold the values 0-7 in concatenation order, so each expected vector below equals the shuffle mask of the function it checks. */ + v4si exp = __extension__(v4si) { 1, 2, 5, 3 }; + v4si dest; + dest = foo (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4si) { 1, 5, 2, 3 }; + dest = foo1 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4si) { 1, 2, 3, 5 }; + dest = foo2 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4si) { 1, 4, 5, 6 }; + dest = foo3 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4si) { 3, 6, 7, 5 }; + dest = foo4 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4si) { 2, 4, 7, 6 }; + dest = foo5 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4si) { 2, 4, 3, 6 }; + dest = foo6 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4si) { 2, 3, 4, 6 }; + dest = foo7 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4si) { 2, 4, 6, 3 }; + dest = foo8 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + +} diff --git a/gcc/testsuite/gcc.target/i386/pr53346-3.c b/gcc/testsuite/gcc.target/i386/pr53346-3.c new file mode 100644 index 00000000000..0b204f6f210 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr53346-3.c @@ -0,0 +1,69 @@ +/* { dg-do compile } */ +/* { dg-options "-msse2 -O2 -mno-sse3" } */ +/* { dg-final { scan-assembler-times "shufps" 17 } } */ + +typedef float v4sf __attribute__((vector_size(16))); + +v4sf +__attribute__((noipa)) +foo (v4sf a, v4sf b) +{ + return __builtin_shufflevector (a, b, 1, 2, 5, 3); /* 3 from a, 1 from b. */ +} + +v4sf +__attribute__((noipa)) +foo1 (v4sf a, v4sf b) +{ + return __builtin_shufflevector (a, b, 1, 5, 2, 3); /* 3 from a, 1 from b. */ +} + +v4sf +__attribute__((noipa)) +foo2 (v4sf a, v4sf b) +{ + return __builtin_shufflevector (a, b, 
1, 2, 3, 5); /* 3 from a, 1 from b. */ +} + +v4sf +__attribute__((noipa)) +foo3 (v4sf a, v4sf b) +{ + return __builtin_shufflevector (a, b, 1, 4, 5, 6); /* 1 from a, 3 from b. */ +} + +v4sf +__attribute__((noipa)) +foo4 (v4sf a, v4sf b) +{ + return __builtin_shufflevector (a, b, 3, 6, 7, 5); /* 1 from a, 3 from b. */ +} + +v4sf +__attribute__((noipa)) +foo5 (v4sf a, v4sf b) +{ + return __builtin_shufflevector (a, b, 2, 4, 7, 6); /* 1 from a, 3 from b. */ +} + +v4sf +__attribute__((noipa)) +foo6 (v4sf a, v4sf b) +{ + return __builtin_shufflevector (a, b, 2, 4, 3, 6); /* 2 from a, 2 from b. */ +} + +v4sf +__attribute__((noipa)) +foo7 (v4sf a, v4sf b) +{ + return __builtin_shufflevector (a, b, 2, 3, 4, 6); /* 2 from a, 2 from b. */ +} + +v4sf +__attribute__((noipa)) +foo8 (v4sf a, v4sf b) +{ + return __builtin_shufflevector (a, b, 2, 4, 6, 3); /* 2 from a, 2 from b. */ +} + diff --git a/gcc/testsuite/gcc.target/i386/pr53346-4.c b/gcc/testsuite/gcc.target/i386/pr53346-4.c new file mode 100644 index 00000000000..9e4e45bd584 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr53346-4.c @@ -0,0 +1,59 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -msse2" } */ +/* { dg-require-effective-target sse2 } */ + +#include "sse2-check.h" +#include "pr53346-3.c" + +static void +sse2_test () +{ + v4sf a = __extension__(v4sf) { 0, 1, 2, 3 }; + v4sf b = __extension__(v4sf) { 4, 5, 6, 7 }; /* a and b hold the values 0-7 in concatenation order, so each expected vector below equals the shuffle mask of the function it checks. */ + v4sf exp = __extension__(v4sf) { 1, 2, 5, 3 }; + v4sf dest; + dest = foo (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4sf) { 1, 5, 2, 3 }; + dest = foo1 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4sf) { 1, 2, 3, 5 }; + dest = foo2 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4sf) { 1, 4, 5, 6 }; + dest = foo3 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4sf) { 3, 6, 7, 5 }; + dest = foo4 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4sf) { 2, 4, 7, 6 }; + dest = foo5 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) 
+ __builtin_abort (); + + exp = __extension__ (v4sf) { 2, 4, 3, 6 }; + dest = foo6 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4sf) { 2, 3, 4, 6 }; + dest = foo7 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + + exp = __extension__ (v4sf) { 2, 4, 6, 3 }; + dest = foo8 (a, b); + if (__builtin_memcmp (&dest, &exp, 16)) + __builtin_abort (); + +}