* [PATCH] Improve constant vec_perm expansion on i?86 (PR target/68655)
@ 2015-12-03 20:52 Jakub Jelinek
2015-12-04 7:50 ` Uros Bizjak
0 siblings, 1 reply; 2+ messages in thread
From: Jakub Jelinek @ 2015-12-03 20:52 UTC (permalink / raw)
To: Uros Bizjak; +Cc: gcc-patches
Hi!
As discussed in the PR, for some permutations we can get better code
if we try to expand them as if they were permutations in a mode with the
same vector size, but a wider vector element. The first attempt to do this
always had mixed results, lots of improvements, lots of pessimizations;
this one, at least on gcc.dg/vshuf* with
{-msse2,-msse4,-mavx,-mavx2,-mavx512f,-mavx512bw}, shows only
improvements - it tries the original permutation as a single insn,
if that doesn't work it tries the wider one as a single insn, and then
as a complete fallback, if we don't have any expansion whatsoever, it tries
the wider one too.
Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
2015-12-03 Jakub Jelinek <jakub@redhat.com>
PR target/68655
* config/i386/i386.c (canonicalize_vector_int_perm): New function.
(expand_vec_perm_1): Use it and recurse if everything else
failed. Use nd.perm instead of perm2.
(expand_vec_perm_even_odd_1): If testing_p, use gen_raw_REG
instead of gen_lowpart for the target.
(ix86_expand_vec_perm_const_1): Use canonicalize_vector_int_perm
and recurse if everything else failed.
* gcc.dg/torture/vshuf-4.inc (TESTS): Add one extra test.
* gcc.dg/torture/vshuf-8.inc (TESTS): Add two extra tests.
--- gcc/config/i386/i386.c.jj 2015-12-02 20:27:00.000000000 +0100
+++ gcc/config/i386/i386.c 2015-12-03 15:03:13.415764986 +0100
@@ -49365,6 +49365,57 @@ expand_vec_perm_pshufb (struct expand_ve
return true;
}
+/* For V*[QHS]Imode permutations, check whether the same permutation
+ can be performed in a 2x, 4x or 8x wider inner mode. */
+
+static bool
+canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
+ struct expand_vec_perm_d *nd)
+{
+ int i;
+ enum machine_mode mode = VOIDmode;
+
+ switch (d->vmode)
+ {
+ case V16QImode: mode = V8HImode; break;
+ case V32QImode: mode = V16HImode; break;
+ case V64QImode: mode = V32HImode; break;
+ case V8HImode: mode = V4SImode; break;
+ case V16HImode: mode = V8SImode; break;
+ case V32HImode: mode = V16SImode; break;
+ case V4SImode: mode = V2DImode; break;
+ case V8SImode: mode = V4DImode; break;
+ case V16SImode: mode = V8DImode; break;
+ default: return false;
+ }
+ for (i = 0; i < d->nelt; i += 2)
+ if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
+ return false;
+ nd->vmode = mode;
+ nd->nelt = d->nelt / 2;
+ for (i = 0; i < nd->nelt; i++)
+ nd->perm[i] = d->perm[2 * i] / 2;
+ if (GET_MODE_INNER (mode) != DImode)
+ canonicalize_vector_int_perm (nd, nd);
+ if (nd != d)
+ {
+ nd->one_operand_p = d->one_operand_p;
+ nd->testing_p = d->testing_p;
+ if (d->op0 == d->op1)
+ nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
+ else
+ {
+ nd->op0 = gen_lowpart (nd->vmode, d->op0);
+ nd->op1 = gen_lowpart (nd->vmode, d->op1);
+ }
+ if (d->testing_p)
+ nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
+ else
+ nd->target = gen_reg_rtx (nd->vmode);
+ }
+ return true;
+}
+
/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
in a single instruction. */
@@ -49372,7 +49423,7 @@ static bool
expand_vec_perm_1 (struct expand_vec_perm_d *d)
{
unsigned i, nelt = d->nelt;
- unsigned char perm2[MAX_VECT_LEN];
+ struct expand_vec_perm_d nd;
/* Check plain VEC_SELECT first, because AVX has instructions that could
match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
@@ -49385,10 +49436,10 @@ expand_vec_perm_1 (struct expand_vec_per
for (i = 0; i < nelt; i++)
{
- perm2[i] = d->perm[i] & mask;
- if (perm2[i] != i)
+ nd.perm[i] = d->perm[i] & mask;
+ if (nd.perm[i] != i)
identity_perm = false;
- if (perm2[i])
+ if (nd.perm[i])
broadcast_perm = false;
}
@@ -49457,7 +49508,7 @@ expand_vec_perm_1 (struct expand_vec_per
}
}
- if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
+ if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
return true;
/* There are plenty of patterns in sse.md that are written for
@@ -49468,10 +49519,10 @@ expand_vec_perm_1 (struct expand_vec_per
every other permutation operand. */
for (i = 0; i < nelt; i += 2)
{
- perm2[i] = d->perm[i] & mask;
- perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
+ nd.perm[i] = d->perm[i] & mask;
+ nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
}
- if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
+ if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
d->testing_p))
return true;
@@ -49480,13 +49531,13 @@ expand_vec_perm_1 (struct expand_vec_per
{
for (i = 0; i < nelt; i += 4)
{
- perm2[i + 0] = d->perm[i + 0] & mask;
- perm2[i + 1] = d->perm[i + 1] & mask;
- perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
- perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
+ nd.perm[i + 0] = d->perm[i + 0] & mask;
+ nd.perm[i + 1] = d->perm[i + 1] & mask;
+ nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
+ nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
}
- if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
+ if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
d->testing_p))
return true;
}
@@ -49507,10 +49558,10 @@ expand_vec_perm_1 (struct expand_vec_per
e -= nelt;
else
e += nelt;
- perm2[i] = e;
+ nd.perm[i] = e;
}
- if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
+ if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
d->testing_p))
return true;
}
@@ -49536,6 +49587,14 @@ expand_vec_perm_1 (struct expand_vec_per
if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
return true;
+ /* See if we can get the same permutation in different vector integer
+ mode. */
+ if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
+ {
+ if (!d->testing_p)
+ emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
+ return true;
+ }
return false;
}
@@ -50968,7 +51027,7 @@ expand_vec_perm_even_odd_1 (struct expan
struct expand_vec_perm_d d_copy = *d;
d_copy.vmode = V4DFmode;
if (d->testing_p)
- d_copy.target = gen_lowpart (V4DFmode, d->target);
+ d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
else
d_copy.target = gen_reg_rtx (V4DFmode);
d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
@@ -51007,7 +51066,7 @@ expand_vec_perm_even_odd_1 (struct expan
struct expand_vec_perm_d d_copy = *d;
d_copy.vmode = V8SFmode;
if (d->testing_p)
- d_copy.target = gen_lowpart (V8SFmode, d->target);
+ d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
else
d_copy.target = gen_reg_rtx (V8SFmode);
d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
@@ -51451,6 +51510,16 @@ ix86_expand_vec_perm_const_1 (struct exp
if (expand_vec_perm_vpshufb4_vpermq2 (d))
return true;
+ /* See if we can get the same permutation in different vector integer
+ mode. */
+ struct expand_vec_perm_d nd;
+ if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
+ {
+ if (!d->testing_p)
+ emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
+ return true;
+ }
+
return false;
}
--- gcc/testsuite/gcc.dg/torture/vshuf-4.inc.jj 2014-10-01 22:39:47.000000000 +0200
+++ gcc/testsuite/gcc.dg/torture/vshuf-4.inc 2015-12-03 15:44:29.252181928 +0100
@@ -24,7 +24,8 @@ T (20, 0, 4, 1, 5) \
T (21, 2, 6, 3, 7) \
T (22, 1, 2, 3, 0) \
T (23, 2, 1, 0, 3) \
-T (24, 2, 5, 6, 3)
+T (24, 2, 5, 6, 3) \
+T (25, 0, 1, 4, 5)
#define EXPTESTS \
T (116, 1, 2, 4, 3) \
T (117, 7, 3, 3, 0) \
--- gcc/testsuite/gcc.dg/torture/vshuf-8.inc.jj 2012-03-20 08:51:25.000000000 +0100
+++ gcc/testsuite/gcc.dg/torture/vshuf-8.inc 2015-12-03 15:45:13.280567206 +0100
@@ -23,7 +23,9 @@ T (19, 7, 6, 5, 4, 3, 2, 1, 0) \
T (20, 0, 8, 1, 9, 2, 10, 3, 11) \
T (21, 4, 12, 5, 13, 6, 14, 7, 15) \
T (22, 1, 2, 3, 4, 5, 6, 7, 0) \
-T (23, 6, 5, 4, 3, 2, 1, 0, 7)
+T (23, 6, 5, 4, 3, 2, 1, 0, 7) \
+T (24, 0, 1, 2, 3, 8, 9, 10, 11) \
+T (25, 0, 1, 2, 3, 12, 13, 14, 15)
#define EXPTESTS \
T (116, 9, 3, 9, 4, 7, 0, 0, 6) \
T (117, 4, 14, 12, 8, 9, 6, 0, 10) \
Jakub
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [PATCH] Improve constant vec_perm expansion on i?86 (PR target/68655)
2015-12-03 20:52 [PATCH] Improve constant vec_perm expansion on i?86 (PR target/68655) Jakub Jelinek
@ 2015-12-04 7:50 ` Uros Bizjak
0 siblings, 0 replies; 2+ messages in thread
From: Uros Bizjak @ 2015-12-04 7:50 UTC (permalink / raw)
To: Jakub Jelinek; +Cc: gcc-patches
On Thu, Dec 3, 2015 at 9:52 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> Hi!
>
> As discussed in the PR, for some permutation we can get better code
> if we try to expand it as if it was a permutation in a mode with the
> same vector size, but wider vector element. The first attempt to do this
> always had mixed results, lots of improvements, lots of pessimizations,
> this one at least on gcc.dg/vshuf*
> {-msse2,-msse4,-mavx,-mavx2,-mavx512f,-mavx512bw} shows only
> improvements - it tries the original permutation for single insn,
> if that doesn't work tries the wider one single insn, and then
> as complete fallback, if we don't have any expansion whatsoever, tries
> the wider one too.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2015-12-03 Jakub Jelinek <jakub@redhat.com>
>
> PR target/68655
> * config/i386/i386.c (canonicalize_vector_int_perm): New function.
> (expand_vec_perm_1): Use it and recurse if everything else
> failed. Use nd.perm instead of perm2.
> (expand_vec_perm_even_odd_1): If testing_p, use gen_raw_REG
> instead of gen_lowpart for the target.
> (ix86_expand_vec_perm_const_1): Use canonicalize_vector_int_perm
> and recurse if everything else failed.
>
> * gcc.dg/torture/vshuf-4.inc (TESTS): Add one extra test.
> * gcc.dg/torture/vshuf-8.inc (TESTS): Add two extra tests.
OK for mainline.
Thanks,
Uros.
> --- gcc/config/i386/i386.c.jj 2015-12-02 20:27:00.000000000 +0100
> +++ gcc/config/i386/i386.c 2015-12-03 15:03:13.415764986 +0100
> @@ -49365,6 +49365,57 @@ expand_vec_perm_pshufb (struct expand_ve
> return true;
> }
>
> +/* For V*[QHS]Imode permutations, check if the same permutation
> + can't be performed in a 2x, 4x or 8x wider inner mode. */
> +
> +static bool
> +canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
> + struct expand_vec_perm_d *nd)
> +{
> + int i;
> + enum machine_mode mode = VOIDmode;
> +
> + switch (d->vmode)
> + {
> + case V16QImode: mode = V8HImode; break;
> + case V32QImode: mode = V16HImode; break;
> + case V64QImode: mode = V32HImode; break;
> + case V8HImode: mode = V4SImode; break;
> + case V16HImode: mode = V8SImode; break;
> + case V32HImode: mode = V16SImode; break;
> + case V4SImode: mode = V2DImode; break;
> + case V8SImode: mode = V4DImode; break;
> + case V16SImode: mode = V8DImode; break;
> + default: return false;
> + }
> + for (i = 0; i < d->nelt; i += 2)
> + if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
> + return false;
> + nd->vmode = mode;
> + nd->nelt = d->nelt / 2;
> + for (i = 0; i < nd->nelt; i++)
> + nd->perm[i] = d->perm[2 * i] / 2;
> + if (GET_MODE_INNER (mode) != DImode)
> + canonicalize_vector_int_perm (nd, nd);
> + if (nd != d)
> + {
> + nd->one_operand_p = d->one_operand_p;
> + nd->testing_p = d->testing_p;
> + if (d->op0 == d->op1)
> + nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
> + else
> + {
> + nd->op0 = gen_lowpart (nd->vmode, d->op0);
> + nd->op1 = gen_lowpart (nd->vmode, d->op1);
> + }
> + if (d->testing_p)
> + nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
> + else
> + nd->target = gen_reg_rtx (nd->vmode);
> + }
> + return true;
> +}
> +
> /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
> in a single instruction. */
>
> @@ -49372,7 +49423,7 @@ static bool
> expand_vec_perm_1 (struct expand_vec_perm_d *d)
> {
> unsigned i, nelt = d->nelt;
> - unsigned char perm2[MAX_VECT_LEN];
> + struct expand_vec_perm_d nd;
>
> /* Check plain VEC_SELECT first, because AVX has instructions that could
> match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
> @@ -49385,10 +49436,10 @@ expand_vec_perm_1 (struct expand_vec_per
>
> for (i = 0; i < nelt; i++)
> {
> - perm2[i] = d->perm[i] & mask;
> - if (perm2[i] != i)
> + nd.perm[i] = d->perm[i] & mask;
> + if (nd.perm[i] != i)
> identity_perm = false;
> - if (perm2[i])
> + if (nd.perm[i])
> broadcast_perm = false;
> }
>
> @@ -49457,7 +49508,7 @@ expand_vec_perm_1 (struct expand_vec_per
> }
> }
>
> - if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
> + if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
> return true;
>
> /* There are plenty of patterns in sse.md that are written for
> @@ -49468,10 +49519,10 @@ expand_vec_perm_1 (struct expand_vec_per
> every other permutation operand. */
> for (i = 0; i < nelt; i += 2)
> {
> - perm2[i] = d->perm[i] & mask;
> - perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
> + nd.perm[i] = d->perm[i] & mask;
> + nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
> }
> - if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
> + if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
> d->testing_p))
> return true;
>
> @@ -49480,13 +49531,13 @@ expand_vec_perm_1 (struct expand_vec_per
> {
> for (i = 0; i < nelt; i += 4)
> {
> - perm2[i + 0] = d->perm[i + 0] & mask;
> - perm2[i + 1] = d->perm[i + 1] & mask;
> - perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
> - perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
> + nd.perm[i + 0] = d->perm[i + 0] & mask;
> + nd.perm[i + 1] = d->perm[i + 1] & mask;
> + nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
> + nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
> }
>
> - if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
> + if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
> d->testing_p))
> return true;
> }
> @@ -49507,10 +49558,10 @@ expand_vec_perm_1 (struct expand_vec_per
> e -= nelt;
> else
> e += nelt;
> - perm2[i] = e;
> + nd.perm[i] = e;
> }
>
> - if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
> + if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
> d->testing_p))
> return true;
> }
> @@ -49536,6 +49587,14 @@ expand_vec_perm_1 (struct expand_vec_per
> if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
> return true;
>
> + /* See if we can get the same permutation in different vector integer
> + mode. */
> + if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
> + {
> + if (!d->testing_p)
> + emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
> + return true;
> + }
> return false;
> }
>
> @@ -50968,7 +51027,7 @@ expand_vec_perm_even_odd_1 (struct expan
> struct expand_vec_perm_d d_copy = *d;
> d_copy.vmode = V4DFmode;
> if (d->testing_p)
> - d_copy.target = gen_lowpart (V4DFmode, d->target);
> + d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
> else
> d_copy.target = gen_reg_rtx (V4DFmode);
> d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
> @@ -51007,7 +51066,7 @@ expand_vec_perm_even_odd_1 (struct expan
> struct expand_vec_perm_d d_copy = *d;
> d_copy.vmode = V8SFmode;
> if (d->testing_p)
> - d_copy.target = gen_lowpart (V8SFmode, d->target);
> + d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
> else
> d_copy.target = gen_reg_rtx (V8SFmode);
> d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
> @@ -51451,6 +51510,16 @@ ix86_expand_vec_perm_const_1 (struct exp
> if (expand_vec_perm_vpshufb4_vpermq2 (d))
> return true;
>
> + /* See if we can get the same permutation in different vector integer
> + mode. */
> + struct expand_vec_perm_d nd;
> + if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
> + {
> + if (!d->testing_p)
> + emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
> + return true;
> + }
> +
> return false;
> }
>
> --- gcc/testsuite/gcc.dg/torture/vshuf-4.inc.jj 2014-10-01 22:39:47.000000000 +0200
> +++ gcc/testsuite/gcc.dg/torture/vshuf-4.inc 2015-12-03 15:44:29.252181928 +0100
> @@ -24,7 +24,8 @@ T (20, 0, 4, 1, 5) \
> T (21, 2, 6, 3, 7) \
> T (22, 1, 2, 3, 0) \
> T (23, 2, 1, 0, 3) \
> -T (24, 2, 5, 6, 3)
> +T (24, 2, 5, 6, 3) \
> +T (25, 0, 1, 4, 5)
> #define EXPTESTS \
> T (116, 1, 2, 4, 3) \
> T (117, 7, 3, 3, 0) \
> --- gcc/testsuite/gcc.dg/torture/vshuf-8.inc.jj 2012-03-20 08:51:25.000000000 +0100
> +++ gcc/testsuite/gcc.dg/torture/vshuf-8.inc 2015-12-03 15:45:13.280567206 +0100
> @@ -23,7 +23,9 @@ T (19, 7, 6, 5, 4, 3, 2, 1, 0) \
> T (20, 0, 8, 1, 9, 2, 10, 3, 11) \
> T (21, 4, 12, 5, 13, 6, 14, 7, 15) \
> T (22, 1, 2, 3, 4, 5, 6, 7, 0) \
> -T (23, 6, 5, 4, 3, 2, 1, 0, 7)
> +T (23, 6, 5, 4, 3, 2, 1, 0, 7) \
> +T (24, 0, 1, 2, 3, 8, 9, 10, 11) \
> +T (25, 0, 1, 2, 3, 12, 13, 14, 15)
> #define EXPTESTS \
> T (116, 9, 3, 9, 4, 7, 0, 0, 6) \
> T (117, 4, 14, 12, 8, 9, 6, 0, 10) \
>
> Jakub
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2015-12-04 7:50 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-12-03 20:52 [PATCH] Improve constant vec_perm expansion on i?86 (PR target/68655) Jakub Jelinek
2015-12-04 7:50 ` Uros Bizjak
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).