From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 2078) id 2E74B388E80E; Mon, 9 May 2022 13:23:28 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 2E74B388E80E MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Content-Type: text/plain; charset="utf-8" From: hongtao Liu To: gcc-cvs@gcc.gnu.org Subject: [gcc r13-210] Implement permutation with pslldq + psrldq + por when pshufb is not available. X-Act-Checkin: gcc X-Git-Author: liuhongt X-Git-Refname: refs/heads/master X-Git-Oldrev: addedd48e805edcf555c4fef80e531cd7dbf0c45 X-Git-Newrev: fcda0efccad41eba9134c1bd9d024a93d93fb82f Message-Id: <20220509132328.2E74B388E80E@sourceware.org> Date: Mon, 9 May 2022 13:23:28 +0000 (GMT) X-BeenThere: gcc-cvs@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Gcc-cvs mailing list List-Unsubscribe: , List-Archive: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 09 May 2022 13:23:28 -0000 https://gcc.gnu.org/g:fcda0efccad41eba9134c1bd9d024a93d93fb82f commit r13-210-gfcda0efccad41eba9134c1bd9d024a93d93fb82f Author: liuhongt Date: Wed Apr 27 16:24:44 2022 +0800 Implement permutation with pslldq + psrldq + por when pshufb is not available. pand/pandn may be used to clear upper/lower bits of the operands; in that case the permutation takes 4-5 instructions, which is still better than scalar code. gcc/ChangeLog: PR target/105354 * config/i386/i386-expand.cc (expand_vec_perm_pslldq_psrldq_por): New function. (ix86_expand_vec_perm_const_1): Try expand_vec_perm_pslldq_psrldq_por for both 3-instruction and 4/5-instruction sequences. gcc/testsuite/ChangeLog: * gcc.target/i386/pr105354-1.c: New test. * gcc.target/i386/pr105354-2.c: New test. 
Diff: --- gcc/config/i386/i386-expand.cc | 107 ++++++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr105354-1.c | 130 +++++++++++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr105354-2.c | 110 ++++++++++++++++++++++++ 3 files changed, 347 insertions(+) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index bc806ffa283..0fd3028c205 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -20941,6 +20941,106 @@ expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d) return true; } +/* Implement permutation with pslldq + psrldq + por when pshufb is not + available. */ +static bool +expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d *d, bool pandn) +{ + unsigned i, nelt = d->nelt; + unsigned start1, end1 = -1; + machine_mode vmode = d->vmode, imode; + int start2 = -1; + bool clear_op0, clear_op1; + unsigned inner_size; + rtx op0, op1, dop1; + rtx (*gen_vec_shr) (rtx, rtx, rtx); + rtx (*gen_vec_shl) (rtx, rtx, rtx); + + /* pshufd can be used for V4SI/V2DI under TARGET_SSE2. */ + if (!TARGET_SSE2 || (vmode != E_V16QImode && vmode != E_V8HImode)) + return false; + + start1 = d->perm[0]; + for (i = 1; i < nelt; i++) + { + if (d->perm[i] != d->perm[i-1] + 1) + { + if (start2 == -1) + { + start2 = d->perm[i]; + end1 = d->perm[i-1]; + } + else + return false; + } + else if (d->perm[i] >= nelt + && start2 == -1) + { + start2 = d->perm[i]; + end1 = d->perm[i-1]; + } + } + + clear_op0 = end1 != nelt - 1; + clear_op1 = start2 % nelt != 0; + /* pandn/pand is needed to clear upper/lower bits of op0/op1. */ + if (!pandn && (clear_op0 || clear_op1)) + return false; + + if (d->testing_p) + return true; + + gen_vec_shr = vmode == E_V16QImode ? gen_vec_shr_v16qi : gen_vec_shr_v8hi; + gen_vec_shl = vmode == E_V16QImode ? 
gen_vec_shl_v16qi : gen_vec_shl_v8hi; + imode = GET_MODE_INNER (vmode); + inner_size = GET_MODE_BITSIZE (imode); + op0 = gen_reg_rtx (vmode); + op1 = gen_reg_rtx (vmode); + + if (start1) + emit_insn (gen_vec_shr (op0, d->op0, GEN_INT (start1 * inner_size))); + else + emit_move_insn (op0, d->op0); + + dop1 = d->op1; + if (d->one_operand_p) + dop1 = d->op0; + + int shl_offset = end1 - start1 + 1 - start2 % nelt; + if (shl_offset) + emit_insn (gen_vec_shl (op1, dop1, GEN_INT (shl_offset * inner_size))); + else + emit_move_insn (op1, dop1); + + /* Clear lower/upper bits for op0/op1. */ + if (clear_op0 || clear_op1) + { + rtx vec[16]; + rtx const_vec; + rtx clear; + for (i = 0; i != nelt; i++) + { + if (i < (end1 - start1 + 1)) + vec[i] = gen_int_mode ((HOST_WIDE_INT_1U << inner_size) - 1, imode); + else + vec[i] = CONST0_RTX (imode); + } + const_vec = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, vec)); + const_vec = validize_mem (force_const_mem (vmode, const_vec)); + clear = force_reg (vmode, const_vec); + + if (clear_op0) + emit_move_insn (op0, gen_rtx_AND (vmode, op0, clear)); + if (clear_op1) + emit_move_insn (op1, gen_rtx_AND (vmode, + gen_rtx_NOT (vmode, clear), + op1)); + } + + emit_move_insn (d->target, gen_rtx_IOR (vmode, op0, op1)); + return true; +} + /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI operands with two "and" and "pack" or two "shift" and "pack" insns. @@ -21853,6 +21953,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) if (expand_vec_perm_pshufb2 (d)) return true; + if (expand_vec_perm_pslldq_psrldq_por (d, false)) + return true; + if (expand_vec_perm_interleave3 (d)) return true; @@ -21891,6 +21994,10 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) if (expand_vec_perm_even_odd (d)) return true; + /* Generate four or five instructions. 
*/ + if (expand_vec_perm_pslldq_psrldq_por (d, true)) + return true; + /* Even longer sequences. */ if (expand_vec_perm_vpshufb4_vpermq2 (d)) return true; diff --git a/gcc/testsuite/gcc.target/i386/pr105354-1.c b/gcc/testsuite/gcc.target/i386/pr105354-1.c new file mode 100644 index 00000000000..8d91ded7420 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr105354-1.c @@ -0,0 +1,130 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse2 -mno-ssse3" } */ +/* { dg-final { scan-assembler-times {(?n)psrldq[\t ]+} 16 } } */ +/* { dg-final { scan-assembler-times {(?n)pslldq[\t ]+} 16 } } */ +/* { dg-final { scan-assembler-times {(?n)por[\t ]+} 16 } } */ +/* { dg-final { scan-assembler-times {(?n)pandn[\t ]+} 8 } } */ +/* { dg-final { scan-assembler-times {(?n)pand[\t ]+} 8 } } */ + +typedef short v8hi __attribute__((vector_size (16))); +typedef char v16qi __attribute__((vector_size (16))); + +v16qi +__attribute__((noipa)) +foo (v16qi a, v16qi b) +{ + return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20); +} + +v16qi +__attribute__((noipa)) +foo1 (v16qi a, v16qi b) +{ + return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 18, 19, 20, 21, 22); +} + +v16qi +__attribute__((noipa)) +foo2 (v16qi a, v16qi b) +{ + return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 16, 17, 18, 19, 20, 21); +} + +v16qi +__attribute__((noipa)) +foo3 (v16qi a, v16qi b) +{ + return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 17, 18, 19, 20, 21, 22); +} + +v8hi +__attribute__((noipa)) +foo4 (v8hi a, v8hi b) +{ + return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12); +} + +v8hi +__attribute__((noipa)) +foo5 (v8hi a, v8hi b) +{ + return __builtin_shufflevector (a, b, 5, 6, 7, 9, 10, 11, 12, 13); +} + +v8hi +__attribute__((noipa)) +foo6 (v8hi a, v8hi b) +{ + return __builtin_shufflevector (a, b, 5, 6, 8, 9, 10, 11, 12, 13); +} + +v8hi +__attribute__((noipa)) +foo7 (v8hi a, 
v8hi b) +{ + return __builtin_shufflevector (a, b, 5, 6, 9, 10, 11, 12, 13, 14); +} + +v16qi +__attribute__((noipa)) +foo8 (v16qi a) +{ + return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20); +} + +v16qi +__attribute__((noipa)) +foo9 (v16qi a) +{ + return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 18, 19, 20, 21, 22); +} + +v16qi +__attribute__((noipa)) +foo10 (v16qi a) +{ + return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 16, 17, 18, 19, 20, 21); +} + +v16qi +__attribute__((noipa)) +foo11 (v16qi a) +{ + return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 17, 18, 19, 20, 21, 22); +} + +v8hi +__attribute__((noipa)) +foo12 (v8hi a) +{ + return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12); +} + +v8hi +__attribute__((noipa)) +foo13 (v8hi a) +{ + return __builtin_shufflevector (a, a, 5, 6, 7, 9, 10, 11, 12, 13); +} + +v8hi +__attribute__((noipa)) +foo14 (v8hi a) +{ + return __builtin_shufflevector (a, a, 5, 6, 8, 9, 10, 11, 12, 13); +} + +v8hi +__attribute__((noipa)) +foo15 (v8hi a) +{ + return __builtin_shufflevector (a, a, 5, 6, 9, 10, 11, 12, 13, 14); +} diff --git a/gcc/testsuite/gcc.target/i386/pr105354-2.c b/gcc/testsuite/gcc.target/i386/pr105354-2.c new file mode 100644 index 00000000000..b78b62e1e7e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr105354-2.c @@ -0,0 +1,110 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -msse2 -mno-ssse3" } */ +/* { dg-require-effective-target sse2 } */ + +#include "sse2-check.h" + +#include "pr105354-1.c" +void +sse2_test (void) +{ + union128i_b a, b, res_ab, exp_ab; + union128i_w c, d, res_cd, exp_cd; + + for (int i = 0; i != 16;i++) + { + a.a[i] = i; + b.a[i] = i + 16; + res_ab.a[i] = 0; + exp_ab.a[i] = -1; + if (i <= 8) + { + c.a[i] = i; + d.a[i] = i + 8; + res_cd.a[i] = 0; + exp_cd.a[i] = -1; + } + } + + res_ab.x = (__m128i)foo ((v16qi)a.x, (v16qi)b.x); + exp_ab.x = __extension__(__m128i) 
(v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 }; + if (check_union128i_b (exp_ab, res_ab.a)) + abort (); + + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22 }; + res_ab.x = (__m128i)foo1 ((v16qi)a.x, (v16qi)b.x); + if (check_union128i_b (exp_ab, res_ab.a)) + abort(); + + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21 }; + res_ab.x = (__m128i)foo2 ((v16qi)a.x, (v16qi)b.x); + if (check_union128i_b (exp_ab, res_ab.a)) + abort(); + + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 17, 18, 19, 20, 21, 22 }; + res_ab.x = (__m128i)foo3 ((v16qi)a.x, (v16qi)b.x); + if (check_union128i_b (exp_ab, res_ab.a)) + abort(); + + res_ab.x = (__m128i)foo8 ((v16qi)a.x); + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4 }; + if (check_union128i_b (exp_ab, res_ab.a)) + abort (); + + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 2, 3, 4, 5, 6 }; + res_ab.x = (__m128i)foo9 ((v16qi)a.x); + if (check_union128i_b (exp_ab, res_ab.a)) + abort(); + + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 4, 5 }; + res_ab.x = (__m128i)foo10 ((v16qi)a.x); + if (check_union128i_b (exp_ab, res_ab.a)) + abort(); + + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1, 2, 3, 4, 5, 6 }; + res_ab.x = (__m128i)foo11 ((v16qi)a.x); + if (check_union128i_b (exp_ab, res_ab.a)) + abort(); + + res_cd.x = (__m128i)foo4 ((v8hi)c.x, (v8hi)d.x); + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 7, 8, 9, 10, 11, 12 }; + if (check_union128i_w (exp_cd, res_cd.a)) + abort (); + + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 7, 9, 10, 11, 12, 13 }; + res_cd.x = (__m128i)foo5 ((v8hi)c.x, (v8hi)d.x); + if (check_union128i_w (exp_cd, res_cd.a)) + abort(); + + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 
8, 9, 10, 11, 12, 13 }; + res_cd.x = (__m128i)foo6 ((v8hi)c.x, (v8hi)d.x); + if (check_union128i_w (exp_cd, res_cd.a)) + abort(); + + res_cd.x = (__m128i)foo7 ((v8hi)c.x, (v8hi)d.x); + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 9, 10, 11, 12, 13, 14 }; + if (check_union128i_w (exp_cd, res_cd.a)) + abort (); + + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 7, 0, 1, 2, 3, 4 }; + res_cd.x = (__m128i)foo12 ((v8hi)c.x); + if (check_union128i_w (exp_cd, res_cd.a)) + abort(); + + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 7, 1, 2, 3, 4, 5 }; + res_cd.x = (__m128i)foo13 ((v8hi)c.x); + if (check_union128i_w (exp_cd, res_cd.a)) + abort(); + + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 0, 1, 2, 3, 4, 5 }; + res_cd.x = (__m128i)foo14 ((v8hi)c.x); + if (check_union128i_w (exp_cd, res_cd.a)) + abort(); + + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 1, 2, 3, 4, 5, 6 }; + res_cd.x = (__m128i)foo15 ((v8hi)c.x); + if (check_union128i_w (exp_cd, res_cd.a)) + abort(); + +} +