[PATCH] Improve -mavx -mno-avx2 32-byte vector permutations (PR target/91560)

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH] Improve -mavx -mno-avx2 32-byte vector permutations (PR target/91560)
@ 2019-08-29  9:03 Jakub Jelinek
  2019-08-29 10:01 ` Uros Bizjak
  0 siblings, 1 reply; 2+ messages in thread
From: Jakub Jelinek @ 2019-08-29  9:03 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: gcc-patches

Hi!

The following patch improves 32-byte vector permutations, especially
V8SFmode ones, for the AVX (non-AVX2) ISA, where we punted way too often
even when we could handle them.
On the
typedef float __v8sf __attribute__((vector_size (32)));
typedef double __v4df __attribute__((vector_size (32)));
typedef int __v8si __attribute__((vector_size (32)));
typedef long long __v4di __attribute__((vector_size (32)));
#ifdef __clang__
#define S(x, y, t, ...) __builtin_shufflevector (x, y, __VA_ARGS__)
#else
#define S(x, y, t, ...) __builtin_shuffle (x, y, (t) { __VA_ARGS__ })
#endif

__v8sf f1 (__v8sf x, __v8sf y) { return S (x, y, __v8si, 0, 8, 9, 10, 11, 12, 13, 14 ); }
__v8sf f2 (__v8sf x, __v8sf y) { return S (x, y, __v8si, 0, 1, 8, 9, 10, 11, 12, 13 ); }
testcase we used to emit terrible code (8 BIT_FIELD_REFs + composition
back), while LLVM emits:
        vpermilps       $144, %xmm1, %xmm2 # xmm2 = xmm1[0,0,1,2]
        vextractf128    $1, %ymm1, %xmm3
        vblendps        $8, %xmm1, %xmm3, %xmm1 # xmm1 = xmm3[0,1,2],xmm1[3]
        vpermilps       $147, %xmm1, %xmm1 # xmm1 = xmm1[3,0,1,2]
        vinsertf128     $1, %xmm1, %ymm2, %ymm1
        vblendps        $1, %ymm0, %ymm1, %ymm0 # ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
and
        vextractf128    $1, %ymm1, %xmm2
        vshufpd $1, %xmm2, %xmm1, %xmm2 # xmm2 = xmm1[1],xmm2[0]
        vmovddup        %xmm1, %xmm1    # xmm1 = xmm1[0,0]
        vinsertf128     $1, %xmm2, %ymm1, %ymm1
        vblendps        $3, %ymm0, %ymm1, %ymm0 # ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
With the patch we emit:
        vpermilps       $144, %ymm1, %ymm2
        vpermilps       .LC0(%rip), %ymm1, %ymm1
        vblendps        $238, %ymm2, %ymm0, %ymm0
        vperm2f128      $1, %ymm1, %ymm1, %ymm1
        vblendps        $16, %ymm1, %ymm0, %ymm0
and
        vshufps $68, %ymm1, %ymm0, %ymm0
        vpermilps       .LC1(%rip), %ymm1, %ymm1
        vperm2f128      $1, %ymm1, %ymm1, %ymm1
        vblendps        $48, %ymm1, %ymm0, %ymm0
so each sequence is one insn shorter than what LLVM emits.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2019-08-29  Jakub Jelinek  <jakub@redhat.com>

	PR target/91560
	* config/i386/i386-expand.c (expand_vec_perm_movs,
	expand_vec_perm_blend, expand_vec_perm_vpermil,
	expand_vec_perm_pshufb, expand_vec_perm_1,
	expand_vec_perm_pshuflw_pshufhw, expand_vec_perm_palignr,
	expand_vec_perm_interleave2, expand_vec_perm_vpermq_perm_1,
	expand_vec_perm_vperm2f128, expand_vec_perm_interleave3,
	expand_vec_perm_vperm2f128_vblend, expand_vec_perm_2vperm2f128_vshuf,
	expand_vec_perm_even_odd, expand_vec_perm_broadcast): Adjust function
	comments - replace ix86_expand_vec_perm_builtin_1 with
	ix86_expand_vec_perm_const_1.
	(expand_vec_perm2_vperm2f128_vblend): New function.
	(ix86_expand_vec_perm_const_1): New forward declaration.  Call
	expand_vec_perm2_vperm2f128_vblend as last resort.
	(canonicalize_perm): Formatting fix.

	* gcc.dg/torture/vshuf-8.inc: Add two further permutations.

--- gcc/config/i386/i386-expand.c.jj	2019-08-27 12:26:25.383089132 +0200
+++ gcc/config/i386/i386-expand.c	2019-08-28 15:22:43.911004586 +0200
@@ -16372,7 +16372,7 @@ expand_vselect_vconcat (rtx target, rtx
   return ok;
 }
 
-/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
+/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
    using movss or movsd.  */
 static bool
 expand_vec_perm_movs (struct expand_vec_perm_d *d)
@@ -16408,7 +16408,7 @@ expand_vec_perm_movs (struct expand_vec_
   return true;
 }
 
-/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
+/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
    in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
 
 static bool
@@ -16633,7 +16633,7 @@ expand_vec_perm_blend (struct expand_vec
   return true;
 }
 
-/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
+/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
    in terms of the variable form of vpermilps.
 
    Note that we will have already failed the immediate input vpermilps,
@@ -16709,7 +16709,7 @@ valid_perm_using_mode_p (machine_mode vm
   return true;
 }
 
-/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
+/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
    in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128.  */
 
 static bool
@@ -17026,7 +17026,7 @@ ix86_expand_vec_one_operand_perm_avx512
 
 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
 
-/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to instantiate D
+/* A subroutine of ix86_expand_vec_perm_const_1.  Try to instantiate D
    in a single instruction.  */
 
 static bool
@@ -17216,7 +17216,7 @@ expand_vec_perm_1 (struct expand_vec_per
   return false;
 }
 
-/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
+/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
    in terms of a pair of pshuflw + pshufhw instructions.  */
 
 static bool
@@ -17257,7 +17257,7 @@ expand_vec_perm_pshuflw_pshufhw (struct
   return true;
 }
 
-/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
+/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
    the permutation using the SSSE3 palignr instruction.  This succeeds
    when all of the elements in PERM fit within one vector and we merely
    need to shift them down so that a single vector permutation has a
@@ -17474,7 +17474,7 @@ expand_vec_perm_pblendv (struct expand_v
 
 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
 
-/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
+/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
    a two vector permutation into a single vector permutation by using
    an interleave operation to merge the vectors.  */
 
@@ -17752,7 +17752,7 @@ expand_vec_perm_interleave2 (struct expa
   return true;
 }
 
-/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
+/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
    a single vector cross-lane permutation into vpermq followed
    by any of the single insn permutations.  */
 
@@ -17833,7 +17833,7 @@ expand_vec_perm_vpermq_perm_1 (struct ex
 
 static bool canonicalize_perm (struct expand_vec_perm_d *d);
 
-/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to expand
+/* A subroutine of ix86_expand_vec_perm_const_1.  Try to expand
    a vector permutation using two instructions, vperm2f128 resp.
    vperm2i128 followed by any single in-lane permutation.  */
 
@@ -17950,7 +17950,7 @@ expand_vec_perm_vperm2f128 (struct expan
   return false;
 }
 
-/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
+/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
    a two vector permutation using 2 intra-lane interleave insns
    and cross-lane shuffle for 32-byte vectors.  */
 
@@ -18026,7 +18026,7 @@ expand_vec_perm_interleave3 (struct expa
   return true;
 }
 
-/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement
+/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
    a single vector permutation using a single intra-lane vector
    permutation, vperm2f128 swapping the lanes and vblend* insn blending
    the non-swapped and swapped vectors together.  */
@@ -18094,7 +18094,7 @@ expand_vec_perm_vperm2f128_vblend (struc
   return true;
 }
 
-/* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement a V4DF
+/* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
    permutation using two vperm2f128, followed by a vshufpd insn blending
    the two vectors together.  */
 
@@ -18145,6 +18145,106 @@ expand_vec_perm_2vperm2f128_vshuf (struc
   return true;
 }
 
+static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
+
+/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
+   a two vector permutation using two intra-lane vector
+   permutations, vperm2f128 swapping the lanes and vblend* insn blending
+   the non-swapped and swapped vectors together.  */
+
+static bool
+expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
+{
+  struct expand_vec_perm_d dfirst, dsecond, dthird;
+  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
+  rtx_insn *seq1, *seq2;
+  bool ok;
+  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
+
+  if (!TARGET_AVX
+      || TARGET_AVX2
+      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
+      || d->one_operand_p)
+    return false;
+
+  dfirst = *d;
+  dsecond = *d;
+  for (i = 0; i < nelt; i++)
+    {
+      dfirst.perm[i] = 0xff;
+      dsecond.perm[i] = 0xff;
+    }
+  for (i = 0, msk = 0; i < nelt; i++)
+    {
+      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
+      if (j == i)
+	{
+	  dfirst.perm[j] = d->perm[i];
+	  which1 |= (d->perm[i] < nelt ? 1 : 2);
+	}
+      else
+	{
+	  dsecond.perm[j] = d->perm[i];
+	  which2 |= (d->perm[i] < nelt ? 1 : 2);
+	  msk |= (1U << i);
+	}
+    }
+  if (msk == 0 || msk == (1U << nelt) - 1)
+    return false;
+
+  if (!d->testing_p)
+    {
+      dfirst.target = gen_reg_rtx (dfirst.vmode);
+      dsecond.target = gen_reg_rtx (dsecond.vmode);
+    }
+
+  for (i = 0; i < nelt; i++)
+    {
+      if (dfirst.perm[i] == 0xff)
+	dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
+      if (dsecond.perm[i] == 0xff)
+	dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
+    }
+  canonicalize_perm (&dfirst);
+  start_sequence ();
+  ok = ix86_expand_vec_perm_const_1 (&dfirst);
+  seq1 = get_insns ();
+  end_sequence ();
+
+  if (!ok)
+    return false;
+
+  canonicalize_perm (&dsecond);
+  start_sequence ();
+  ok = ix86_expand_vec_perm_const_1 (&dsecond);
+  seq2 = get_insns ();
+  end_sequence ();
+
+  if (!ok)
+    return false;
+
+  if (d->testing_p)
+    return true;
+
+  emit_insn (seq1);
+  emit_insn (seq2);
+
+  dthird = *d;
+  dthird.op0 = dsecond.target;
+  dthird.op1 = dsecond.target;
+  dthird.one_operand_p = true;
+  dthird.target = gen_reg_rtx (dthird.vmode);
+  for (i = 0; i < nelt; i++)
+    dthird.perm[i] = i ^ nelt2;
+
+  ok = expand_vec_perm_1 (&dthird);
+  gcc_assert (ok);
+
+  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
+  emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
+  return true;
+}
+
 /* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
    permutation with two pshufb insns and an ior.  We should have already
    failed all two instruction sequences.  */
@@ -18534,7 +18634,7 @@ expand_vec_perm_even_odd_trunc (struct e
   return true;
 }
 
-/* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement extract-even
+/* A subroutine of ix86_expand_vec_perm_const_1.  Implement extract-even
    and extract-odd permutations.  */
 
 static bool
@@ -18743,7 +18843,7 @@ expand_vec_perm_even_odd_1 (struct expan
   return true;
 }
 
-/* A subroutine of ix86_expand_vec_perm_builtin_1.  Pattern match
+/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
    extract-even and extract-odd permutations.  */
 
 static bool
@@ -18762,7 +18862,7 @@ expand_vec_perm_even_odd (struct expand_
   return expand_vec_perm_even_odd_1 (d, odd);
 }
 
-/* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement broadcast
+/* A subroutine of ix86_expand_vec_perm_const_1.  Implement broadcast
    permutations.  We assume that expand_vec_perm_1 has already failed.  */
 
 static bool
@@ -18841,7 +18941,7 @@ expand_vec_perm_broadcast_1 (struct expa
     }
 }
 
-/* A subroutine of ix86_expand_vec_perm_builtin_1.  Pattern match
+/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
    broadcast permutations.  */
 
 static bool
@@ -19137,6 +19237,10 @@ ix86_expand_vec_perm_const_1 (struct exp
       return true;
     }
 
+  /* Even longer, including recursion to ix86_expand_vec_perm_const_1.  */
+  if (expand_vec_perm2_vperm2f128_vblend (d))
+    return true;
+
   return false;
 }
 
@@ -19149,7 +19253,7 @@ canonicalize_perm (struct expand_vec_per
   int i, which, nelt = d->nelt;
 
   for (i = which = 0; i < nelt; ++i)
-      which |= (d->perm[i] < nelt ? 1 : 2);
+    which |= (d->perm[i] < nelt ? 1 : 2);
 
   d->one_operand_p = true;
   switch (which)
--- gcc/testsuite/gcc.dg/torture/vshuf-8.inc.jj	2015-12-04 09:24:31.234396066 +0100
+++ gcc/testsuite/gcc.dg/torture/vshuf-8.inc	2019-08-28 15:11:35.778754247 +0200
@@ -25,7 +25,9 @@ T (21,	4, 12, 5, 13, 6, 14, 7, 15) \
 T (22,	1, 2, 3, 4, 5, 6, 7, 0) \
 T (23,	6, 5, 4, 3, 2, 1, 0, 7) \
 T (24,	0, 1, 2, 3, 8, 9, 10, 11) \
-T (25,	0, 1, 2, 3, 12, 13, 14, 15)
+T (25,	0, 1, 2, 3, 12, 13, 14, 15) \
+T (26,	0, 1, 8, 9, 10, 11, 12, 13) \
+T (27,	0, 8, 9, 10, 11, 12, 13, 14)
 #define EXPTESTS \
 T (116,	9, 3, 9, 4, 7, 0, 0, 6) \
 T (117,	4, 14, 12, 8, 9, 6, 0, 10) \

	Jakub

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH] Improve -mavx -mno-avx2 32-byte vector permutations (PR target/91560)
  2019-08-29  9:03 [PATCH] Improve -mavx -mno-avx2 32-byte vector permutations (PR target/91560) Jakub Jelinek
@ 2019-08-29 10:01 ` Uros Bizjak
  0 siblings, 0 replies; 2+ messages in thread
From: Uros Bizjak @ 2019-08-29 10:01 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: gcc-patches

On Thu, Aug 29, 2019 at 10:41 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> Hi!
>
> The following patch improves 32-byte vector permutations, especially
> V8SFmode ones, for the AVX (non-AVX2) ISA, where we punted way too often
> even when we could handle them.
> On the
> typedef float __v8sf __attribute__((vector_size (32)));
> typedef double __v4df __attribute__((vector_size (32)));
> typedef int __v8si __attribute__((vector_size (32)));
> typedef long long __v4di __attribute__((vector_size (32)));
> #ifdef __clang__
> #define S(x, y, t, ...) __builtin_shufflevector (x, y, __VA_ARGS__)
> #else
> #define S(x, y, t, ...) __builtin_shuffle (x, y, (t) { __VA_ARGS__ })
> #endif
>
> __v8sf f1 (__v8sf x, __v8sf y) { return S (x, y, __v8si, 0, 8, 9, 10, 11, 12, 13, 14 ); }
> __v8sf f2 (__v8sf x, __v8sf y) { return S (x, y, __v8si, 0, 1, 8, 9, 10, 11, 12, 13 ); }
> testcase we used to emit terrible code (8 BIT_FIELD_REFs + composition
> back), while LLVM emits:
>         vpermilps       $144, %xmm1, %xmm2 # xmm2 = xmm1[0,0,1,2]
>         vextractf128    $1, %ymm1, %xmm3
>         vblendps        $8, %xmm1, %xmm3, %xmm1 # xmm1 = xmm3[0,1,2],xmm1[3]
>         vpermilps       $147, %xmm1, %xmm1 # xmm1 = xmm1[3,0,1,2]
>         vinsertf128     $1, %xmm1, %ymm2, %ymm1
>         vblendps        $1, %ymm0, %ymm1, %ymm0 # ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
> and
>         vextractf128    $1, %ymm1, %xmm2
>         vshufpd $1, %xmm2, %xmm1, %xmm2 # xmm2 = xmm1[1],xmm2[0]
>         vmovddup        %xmm1, %xmm1    # xmm1 = xmm1[0,0]
>         vinsertf128     $1, %xmm2, %ymm1, %ymm1
>         vblendps        $3, %ymm0, %ymm1, %ymm0 # ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
> With the patch we emit:
>         vpermilps       $144, %ymm1, %ymm2
>         vpermilps       .LC0(%rip), %ymm1, %ymm1
>         vblendps        $238, %ymm2, %ymm0, %ymm0
>         vperm2f128      $1, %ymm1, %ymm1, %ymm1
>         vblendps        $16, %ymm1, %ymm0, %ymm0
> and
>         vshufps $68, %ymm1, %ymm0, %ymm0
>         vpermilps       .LC1(%rip), %ymm1, %ymm1
>         vperm2f128      $1, %ymm1, %ymm1, %ymm1
>         vblendps        $48, %ymm1, %ymm0, %ymm0
> so each sequence is one insn shorter than what LLVM emits.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2019-08-29  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/91560
>         * config/i386/i386-expand.c (expand_vec_perm_movs,
>         expand_vec_perm_blend, expand_vec_perm_vpermil,
>         expand_vec_perm_pshufb, expand_vec_perm_1,
>         expand_vec_perm_pshuflw_pshufhw, expand_vec_perm_palignr,
>         expand_vec_perm_interleave2, expand_vec_perm_vpermq_perm_1,
>         expand_vec_perm_vperm2f128, expand_vec_perm_interleave3,
>         expand_vec_perm_vperm2f128_vblend, expand_vec_perm_2vperm2f128_vshuf,
>         expand_vec_perm_even_odd, expand_vec_perm_broadcast): Adjust function
>         comments - replace ix86_expand_vec_perm_builtin_1 with
>         ix86_expand_vec_perm_const_1.
>         (expand_vec_perm2_vperm2f128_vblend): New function.
>         (ix86_expand_vec_perm_const_1): New forward declaration.  Call
>         expand_vec_perm2_vperm2f128_vblend as last resort.
>         (canonicalize_perm): Formatting fix.
>
>         * gcc.dg/torture/vshuf-8.inc: Add two further permutations.

LGTM, but actually your area ;)

Thanks,
Uros.

> --- gcc/config/i386/i386-expand.c.jj    2019-08-27 12:26:25.383089132 +0200
> +++ gcc/config/i386/i386-expand.c       2019-08-28 15:22:43.911004586 +0200
> @@ -16372,7 +16372,7 @@ expand_vselect_vconcat (rtx target, rtx
>    return ok;
>  }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
>     using movss or movsd.  */
>  static bool
>  expand_vec_perm_movs (struct expand_vec_perm_d *d)
> @@ -16408,7 +16408,7 @@ expand_vec_perm_movs (struct expand_vec_
>    return true;
>  }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
>     in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
>
>  static bool
> @@ -16633,7 +16633,7 @@ expand_vec_perm_blend (struct expand_vec
>    return true;
>  }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
>     in terms of the variable form of vpermilps.
>
>     Note that we will have already failed the immediate input vpermilps,
> @@ -16709,7 +16709,7 @@ valid_perm_using_mode_p (machine_mode vm
>    return true;
>  }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
>     in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128.  */
>
>  static bool
> @@ -17026,7 +17026,7 @@ ix86_expand_vec_one_operand_perm_avx512
>
>  static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to instantiate D
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to instantiate D
>     in a single instruction.  */
>
>  static bool
> @@ -17216,7 +17216,7 @@ expand_vec_perm_1 (struct expand_vec_per
>    return false;
>  }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
>     in terms of a pair of pshuflw + pshufhw instructions.  */
>
>  static bool
> @@ -17257,7 +17257,7 @@ expand_vec_perm_pshuflw_pshufhw (struct
>    return true;
>  }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
>     the permutation using the SSSE3 palignr instruction.  This succeeds
>     when all of the elements in PERM fit within one vector and we merely
>     need to shift them down so that a single vector permutation has a
> @@ -17474,7 +17474,7 @@ expand_vec_perm_pblendv (struct expand_v
>
>  static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
>     a two vector permutation into a single vector permutation by using
>     an interleave operation to merge the vectors.  */
>
> @@ -17752,7 +17752,7 @@ expand_vec_perm_interleave2 (struct expa
>    return true;
>  }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
>     a single vector cross-lane permutation into vpermq followed
>     by any of the single insn permutations.  */
>
> @@ -17833,7 +17833,7 @@ expand_vec_perm_vpermq_perm_1 (struct ex
>
>  static bool canonicalize_perm (struct expand_vec_perm_d *d);
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to expand
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to expand
>     a vector permutation using two instructions, vperm2f128 resp.
>     vperm2i128 followed by any single in-lane permutation.  */
>
> @@ -17950,7 +17950,7 @@ expand_vec_perm_vperm2f128 (struct expan
>    return false;
>  }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
>     a two vector permutation using 2 intra-lane interleave insns
>     and cross-lane shuffle for 32-byte vectors.  */
>
> @@ -18026,7 +18026,7 @@ expand_vec_perm_interleave3 (struct expa
>    return true;
>  }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
>     a single vector permutation using a single intra-lane vector
>     permutation, vperm2f128 swapping the lanes and vblend* insn blending
>     the non-swapped and swapped vectors together.  */
> @@ -18094,7 +18094,7 @@ expand_vec_perm_vperm2f128_vblend (struc
>    return true;
>  }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement a V4DF
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
>     permutation using two vperm2f128, followed by a vshufpd insn blending
>     the two vectors together.  */
>
> @@ -18145,6 +18145,106 @@ expand_vec_perm_2vperm2f128_vshuf (struc
>    return true;
>  }
>
> +static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
> +
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
> +   a two vector permutation using two intra-lane vector
> +   permutations, vperm2f128 swapping the lanes and vblend* insn blending
> +   the non-swapped and swapped vectors together.  */
> +
> +static bool
> +expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
> +{
> +  struct expand_vec_perm_d dfirst, dsecond, dthird;
> +  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
> +  rtx_insn *seq1, *seq2;
> +  bool ok;
> +  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
> +
> +  if (!TARGET_AVX
> +      || TARGET_AVX2
> +      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
> +      || d->one_operand_p)
> +    return false;
> +
> +  dfirst = *d;
> +  dsecond = *d;
> +  for (i = 0; i < nelt; i++)
> +    {
> +      dfirst.perm[i] = 0xff;
> +      dsecond.perm[i] = 0xff;
> +    }
> +  for (i = 0, msk = 0; i < nelt; i++)
> +    {
> +      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
> +      if (j == i)
> +       {
> +         dfirst.perm[j] = d->perm[i];
> +         which1 |= (d->perm[i] < nelt ? 1 : 2);
> +       }
> +      else
> +       {
> +         dsecond.perm[j] = d->perm[i];
> +         which2 |= (d->perm[i] < nelt ? 1 : 2);
> +         msk |= (1U << i);
> +       }
> +    }
> +  if (msk == 0 || msk == (1U << nelt) - 1)
> +    return false;
> +
> +  if (!d->testing_p)
> +    {
> +      dfirst.target = gen_reg_rtx (dfirst.vmode);
> +      dsecond.target = gen_reg_rtx (dsecond.vmode);
> +    }
> +
> +  for (i = 0; i < nelt; i++)
> +    {
> +      if (dfirst.perm[i] == 0xff)
> +       dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
> +      if (dsecond.perm[i] == 0xff)
> +       dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
> +    }
> +  canonicalize_perm (&dfirst);
> +  start_sequence ();
> +  ok = ix86_expand_vec_perm_const_1 (&dfirst);
> +  seq1 = get_insns ();
> +  end_sequence ();
> +
> +  if (!ok)
> +    return false;
> +
> +  canonicalize_perm (&dsecond);
> +  start_sequence ();
> +  ok = ix86_expand_vec_perm_const_1 (&dsecond);
> +  seq2 = get_insns ();
> +  end_sequence ();
> +
> +  if (!ok)
> +    return false;
> +
> +  if (d->testing_p)
> +    return true;
> +
> +  emit_insn (seq1);
> +  emit_insn (seq2);
> +
> +  dthird = *d;
> +  dthird.op0 = dsecond.target;
> +  dthird.op1 = dsecond.target;
> +  dthird.one_operand_p = true;
> +  dthird.target = gen_reg_rtx (dthird.vmode);
> +  for (i = 0; i < nelt; i++)
> +    dthird.perm[i] = i ^ nelt2;
> +
> +  ok = expand_vec_perm_1 (&dthird);
> +  gcc_assert (ok);
> +
> +  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
> +  emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
> +  return true;
> +}
> +
>  /* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
>     permutation with two pshufb insns and an ior.  We should have already
>     failed all two instruction sequences.  */
> @@ -18534,7 +18634,7 @@ expand_vec_perm_even_odd_trunc (struct e
>    return true;
>  }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement extract-even
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Implement extract-even
>     and extract-odd permutations.  */
>
>  static bool
> @@ -18743,7 +18843,7 @@ expand_vec_perm_even_odd_1 (struct expan
>    return true;
>  }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Pattern match
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
>     extract-even and extract-odd permutations.  */
>
>  static bool
> @@ -18762,7 +18862,7 @@ expand_vec_perm_even_odd (struct expand_
>    return expand_vec_perm_even_odd_1 (d, odd);
>  }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement broadcast
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Implement broadcast
>     permutations.  We assume that expand_vec_perm_1 has already failed.  */
>
>  static bool
> @@ -18841,7 +18941,7 @@ expand_vec_perm_broadcast_1 (struct expa
>      }
>  }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Pattern match
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
>     broadcast permutations.  */
>
>  static bool
> @@ -19137,6 +19237,10 @@ ix86_expand_vec_perm_const_1 (struct exp
>        return true;
>      }
>
> +  /* Even longer, including recursion to ix86_expand_vec_perm_const_1.  */
> +  if (expand_vec_perm2_vperm2f128_vblend (d))
> +    return true;
> +
>    return false;
>  }
>
> @@ -19149,7 +19253,7 @@ canonicalize_perm (struct expand_vec_per
>    int i, which, nelt = d->nelt;
>
>    for (i = which = 0; i < nelt; ++i)
> -      which |= (d->perm[i] < nelt ? 1 : 2);
> +    which |= (d->perm[i] < nelt ? 1 : 2);
>
>    d->one_operand_p = true;
>    switch (which)
> --- gcc/testsuite/gcc.dg/torture/vshuf-8.inc.jj 2015-12-04 09:24:31.234396066 +0100
> +++ gcc/testsuite/gcc.dg/torture/vshuf-8.inc    2019-08-28 15:11:35.778754247 +0200
> @@ -25,7 +25,9 @@ T (21,        4, 12, 5, 13, 6, 14, 7, 15) \
>  T (22, 1, 2, 3, 4, 5, 6, 7, 0) \
>  T (23, 6, 5, 4, 3, 2, 1, 0, 7) \
>  T (24, 0, 1, 2, 3, 8, 9, 10, 11) \
> -T (25, 0, 1, 2, 3, 12, 13, 14, 15)
> +T (25, 0, 1, 2, 3, 12, 13, 14, 15) \
> +T (26, 0, 1, 8, 9, 10, 11, 12, 13) \
> +T (27, 0, 8, 9, 10, 11, 12, 13, 14)
>  #define EXPTESTS \
>  T (116,        9, 3, 9, 4, 7, 0, 0, 6) \
>  T (117,        4, 14, 12, 8, 9, 6, 0, 10) \
>
>         Jakub

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2019-08-29  9:13 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-08-29  9:03 [PATCH] Improve -mavx -mno-avx2 32-byte vector permutations (PR target/91560) Jakub Jelinek
2019-08-29 10:01 ` Uros Bizjak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).