From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 1363) id BD199383B413; Fri, 11 Jun 2021 10:32:50 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org BD199383B413 MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Content-Type: text/plain; charset="utf-8" From: Uros Bizjak To: gcc-cvs@gcc.gnu.org Subject: [gcc r12-1384] i386: Try to avoid variable permutation instruction [PR101021] X-Act-Checkin: gcc X-Git-Author: Uros Bizjak X-Git-Refname: refs/heads/master X-Git-Oldrev: 8bf728aecc4fea46b4490e950b9ae229f90597b0 X-Git-Newrev: 1fa991d1d74cb1ce96c48ede70ae0be7a9683ce3 Message-Id: <20210611103250.BD199383B413@sourceware.org> Date: Fri, 11 Jun 2021 10:32:50 +0000 (GMT) X-BeenThere: gcc-cvs@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Gcc-cvs mailing list List-Unsubscribe: , List-Archive: List-Help: List-Subscribe: , X-List-Received-Date: Fri, 11 Jun 2021 10:32:50 -0000 https://gcc.gnu.org/g:1fa991d1d74cb1ce96c48ede70ae0be7a9683ce3 commit r12-1384-g1fa991d1d74cb1ce96c48ede70ae0be7a9683ce3 Author: Uros Bizjak Date: Fri Jun 11 12:31:42 2021 +0200 i386: Try to avoid variable permutation instruction [PR101021] Some permutations can be implemented without the costly PSHUFB instruction, e.g.: { 8,9,10,11,12,13,14,15, 0,1,2,3,4,5,6,7 } with PALIGNR, { 0,1,2,3, 4,5,6,7, 4,5,6,7, 12,13,14,15 } with PSHUFD, { 0,1, 2,3, 2,3, 6,7, 8,9,10,11,12,13,14,15 } with PSHUFLW and { 0,1,2,3,4,5,6,7, 8,9, 10,11, 10,11, 14,15 } with PSHUFHW. All these instructions take a constant shuffle control mask and do not need to load a shuffle mask from memory into a temporary XMM register. 2021-06-11 Uroš Bizjak gcc/ PR target/101021 * config/i386/i386-expand.c (expand_vec_perm_pshufb): Return false if the permutation can be implemented with constant permutation instruction in wider mode. (canonicalize_vector_int_perm): Move above expand_vec_perm_pshufb. Handle V8QImode and V4HImode. 
gcc/testsuite/ PR target/101021 * gcc.target/i386/pr101021-1.c: New test. * gcc.target/i386/pr101021-2.c: Ditto. Diff: --- gcc/config/i386/i386-expand.c | 109 +++++++++++++++-------------- gcc/testsuite/gcc.target/i386/pr101021-1.c | 35 +++++++++ gcc/testsuite/gcc.target/i386/pr101021-2.c | 21 ++++++ 3 files changed, 114 insertions(+), 51 deletions(-) diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 9ee5257adf9..2fa3a18dc6a 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -17354,6 +17354,59 @@ expand_vec_perm_vpermil (struct expand_vec_perm_d *d) return true; } +/* For V*[QHS]Imode permutations, check if the same permutation + can't be performed in a 2x, 4x or 8x wider inner mode. */ + +static bool +canonicalize_vector_int_perm (const struct expand_vec_perm_d *d, + struct expand_vec_perm_d *nd) +{ + int i; + machine_mode mode = VOIDmode; + + switch (d->vmode) + { + case E_V8QImode: mode = V4HImode; break; + case E_V16QImode: mode = V8HImode; break; + case E_V32QImode: mode = V16HImode; break; + case E_V64QImode: mode = V32HImode; break; + case E_V4HImode: mode = V2SImode; break; + case E_V8HImode: mode = V4SImode; break; + case E_V16HImode: mode = V8SImode; break; + case E_V32HImode: mode = V16SImode; break; + case E_V4SImode: mode = V2DImode; break; + case E_V8SImode: mode = V4DImode; break; + case E_V16SImode: mode = V8DImode; break; + default: return false; + } + for (i = 0; i < d->nelt; i += 2) + if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1) + return false; + nd->vmode = mode; + nd->nelt = d->nelt / 2; + for (i = 0; i < nd->nelt; i++) + nd->perm[i] = d->perm[2 * i] / 2; + if (GET_MODE_INNER (mode) != DImode) + canonicalize_vector_int_perm (nd, nd); + if (nd != d) + { + nd->one_operand_p = d->one_operand_p; + nd->testing_p = d->testing_p; + if (d->op0 == d->op1) + nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0); + else + { + nd->op0 = gen_lowpart (nd->vmode, d->op0); + nd->op1 = 
gen_lowpart (nd->vmode, d->op1); + } + if (d->testing_p) + nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1); + else + nd->target = gen_reg_rtx (nd->vmode); + } + return true; +} + /* Return true if permutation D can be performed as VMODE permutation instead. */ @@ -17391,6 +17444,7 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) unsigned i, nelt, eltsz, mask; unsigned char perm[64]; machine_mode vmode = V16QImode; + struct expand_vec_perm_d nd; rtx rperm[64], vperm, target, op0, op1; nelt = d->nelt; @@ -17539,6 +17593,10 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) return false; } + /* Try to avoid variable permutation instruction. */ + if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd)) + return false; + if (d->testing_p) return true; @@ -17617,57 +17675,6 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) return true; } -/* For V*[QHS]Imode permutations, check if the same permutation - can't be performed in a 2x, 4x or 8x wider inner mode. 
*/ - -static bool -canonicalize_vector_int_perm (const struct expand_vec_perm_d *d, - struct expand_vec_perm_d *nd) -{ - int i; - machine_mode mode = VOIDmode; - - switch (d->vmode) - { - case E_V16QImode: mode = V8HImode; break; - case E_V32QImode: mode = V16HImode; break; - case E_V64QImode: mode = V32HImode; break; - case E_V8HImode: mode = V4SImode; break; - case E_V16HImode: mode = V8SImode; break; - case E_V32HImode: mode = V16SImode; break; - case E_V4SImode: mode = V2DImode; break; - case E_V8SImode: mode = V4DImode; break; - case E_V16SImode: mode = V8DImode; break; - default: return false; - } - for (i = 0; i < d->nelt; i += 2) - if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1) - return false; - nd->vmode = mode; - nd->nelt = d->nelt / 2; - for (i = 0; i < nd->nelt; i++) - nd->perm[i] = d->perm[2 * i] / 2; - if (GET_MODE_INNER (mode) != DImode) - canonicalize_vector_int_perm (nd, nd); - if (nd != d) - { - nd->one_operand_p = d->one_operand_p; - nd->testing_p = d->testing_p; - if (d->op0 == d->op1) - nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0); - else - { - nd->op0 = gen_lowpart (nd->vmode, d->op0); - nd->op1 = gen_lowpart (nd->vmode, d->op1); - } - if (d->testing_p) - nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1); - else - nd->target = gen_reg_rtx (nd->vmode); - } - return true; -} - /* Try to expand one-operand permutation with constant mask. 
*/ static bool diff --git a/gcc/testsuite/gcc.target/i386/pr101021-1.c b/gcc/testsuite/gcc.target/i386/pr101021-1.c new file mode 100644 index 00000000000..f4649c00338 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101021-1.c @@ -0,0 +1,35 @@ +/* PR target/101021 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-not "vpshufb" } } */ + +typedef char S; +typedef S V __attribute__((vector_size(16 * sizeof(S)))); + +V t1 (V x) +{ + return __builtin_shuffle (x, (V) { 8,9,10,11,12,13,14,15, 0,1,2,3,4,5,6,7 }); +} + +/* { dg-final { scan-assembler "vpalignr" } } */ + +V t2 (V x) +{ + return __builtin_shuffle (x, (V) { 0,1,2,3, 4,5,6,7, 4,5,6,7, 12,13,14,15 }); +} + +/* { dg-final { scan-assembler "vpshufd" } } */ + +V t3 (V x) +{ + return __builtin_shuffle (x, (V) { 0,1, 2,3, 2,3, 6,7, 8,9,10,11,12,13,14,15 }); +} + +/* { dg-final { scan-assembler "vpshuflw" } } */ + +V t4 (V x) +{ + return __builtin_shuffle (x, (V) { 0,1,2,3,4,5,6,7, 8,9, 10,11, 10,11, 14,15 }); +} + +/* { dg-final { scan-assembler "vpshufhw" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr101021-2.c b/gcc/testsuite/gcc.target/i386/pr101021-2.c new file mode 100644 index 00000000000..1e046f7d990 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101021-2.c @@ -0,0 +1,21 @@ +/* PR target/101021 */ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-not "vpshufb" } } */ + +typedef char S; +typedef S V __attribute__((vector_size(8 * sizeof(S)))); + +V t1 (V x) +{ + return __builtin_shuffle (x, (V) { 4,5,6,7, 0,1,2,3 }); +} + +/* { dg-final { scan-assembler "vpshufd" } } */ + +V t2 (V x) +{ + return __builtin_shuffle (x, (V) { 0,1, 2,3, 2,3, 6,7 }); +} + +/* { dg-final { scan-assembler "vpshuflw" } } */