public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH][x86] Match movss and movsd "blend" instructions
@ 2018-08-01 16:00 Allan Sandfeld Jensen
  2018-08-01 16:51 ` Marc Glisse
  2018-08-11  9:05 ` Allan Sandfeld Jensen
  0 siblings, 2 replies; 16+ messages in thread
From: Allan Sandfeld Jensen @ 2018-08-01 16:00 UTC (permalink / raw)
  To: gcc-patches; +Cc: Jakub Jelinek

[-- Attachment #1: Type: text/plain, Size: 783 bytes --]

Adds the ability to match movss and movsd as blend patterns,
implemented in a new method to be able to match these before shuffles,
while keeping other blends after.

2018-07-29 Allan Sandfeld Jensen <allan.jensen@qt.io>

gcc/config/i386

    * i386.cc (expand_vec_perm_movs): New method matching movs
    patterns.
    * i386.cc (expand_vec_perm_1): Try the new method.

gcc/testsuite

    * gcc.target/i386/sse2-movs.c: New test.
---
 gcc/config/i386/emmintrin.h               |  2 +-
 gcc/config/i386/i386.c                    | 44 +++++++++++++++++++++++
 gcc/config/i386/xmmintrin.h               |  2 +-
 gcc/testsuite/gcc.target/i386/sse2-movs.c | 21 +++++++++++
 4 files changed, 67 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-movs.c

[-- Attachment #2: 0001-Match-movss-and-movsd-blends.patch --]
[-- Type: text/x-patch, Size: 4634 bytes --]

From e96b3aa9017ad0d19238c923146196405cc4e5af Mon Sep 17 00:00:00 2001
From: Allan Sandfeld Jensen <allan.jensen@qt.io>
Date: Wed, 9 May 2018 12:35:14 +0200
Subject: [PATCH] Match movss and movsd blends

Adds the ability to match movss and movsd as blend patterns,
implemented in a new method to be able to match these before shuffles,
while keeping other blends after.

2018-07-29 Allan Sandfeld Jensen <allan.jensen@qt.io>

gcc/config/i386

    * i386.cc (expand_vec_perm_movs): New method matching movs
    patterns.
    * i386.cc (expand_vec_perm_1): Try the new method.

gcc/testsuite

    * gcc.target/i386/sse2-movs.c: New test.
---
 gcc/config/i386/emmintrin.h               |  2 +-
 gcc/config/i386/i386.c                    | 44 +++++++++++++++++++++++
 gcc/config/i386/xmmintrin.h               |  2 +-
 gcc/testsuite/gcc.target/i386/sse2-movs.c | 21 +++++++++++
 4 files changed, 67 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-movs.c

diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h
index b940a39d27b..1efd943bac4 100644
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -113,7 +113,7 @@ _mm_setzero_pd (void)
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_move_sd (__m128d __A, __m128d __B)
 {
-  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
+  return __extension__ (__m128d)(__v2df){__B[0],__A[1]};
 }
 
 /* Load two DPFP values from P.  The address must be 16-byte aligned.  */
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index ee409cfe7e4..2337ef5ea08 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -46143,6 +46143,46 @@ expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
   return ok;
 }
 
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
+   using movss or movsd.  */
+static bool
+expand_vec_perm_movs (struct expand_vec_perm_d *d)
+{
+  machine_mode vmode = d->vmode;
+  unsigned i, nelt = d->nelt;
+  rtx x;
+
+  if (d->one_operand_p)
+    return false;
+
+  if (TARGET_SSE2 && (vmode == V2DFmode || vmode == V4SFmode))
+    ;
+  else
+    return false;
+
+  /* Only the first element is changed. */
+  if (d->perm[0] != nelt && d->perm[0] != 0)
+    return false;
+  for (i = 1; i < nelt; ++i) {
+    {
+      if (d->perm[i] != i + nelt - d->perm[0])
+        return false;
+    }
+  }
+
+  if (d->testing_p)
+    return true;
+
+  if (d->perm[0] == nelt)
+    x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
+  else
+    x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
+
+  emit_insn (gen_rtx_SET (d->target, x));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
    in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
 
@@ -46885,6 +46925,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
 	}
     }
 
+  /* Try movss/movsd instructions.  */
+  if (expand_vec_perm_movs (d))
+    return true;
+
   /* Finally, try the fully general two operand permute.  */
   if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
 			      d->testing_p))
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index f64f3f74a0b..699f681e054 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -1011,7 +1011,7 @@ _mm_storer_ps (float *__P, __m128 __A)
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_move_ss (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
+  return __extension__ (__m128)(__v4sf){__B[0],__A[1],__A[2],__A[3]};
 }
 
 /* Extracts one of the four words of A.  The selector N must be immediate.  */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-movs.c b/gcc/testsuite/gcc.target/i386/sse2-movs.c
new file mode 100644
index 00000000000..79f486cfa82
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-movs.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-require-effective-target sse2 } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler "movsd" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+/* { dg-final { scan-assembler-not "shufpd" } } */
+
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef double v2df __attribute__ ((vector_size (16)));
+
+v4sf movss(v4sf a, v4sf b)
+{
+     return (v4sf){b[0],a[1],a[2],a[3]};
+}
+
+v2df movsd(v2df a, v2df b)
+{
+     return (v2df){b[0],a[1]};
+}
-- 
2.17.0


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH][x86] Match movss and movsd "blend" instructions
  2018-08-01 16:00 [PATCH][x86] Match movss and movsd "blend" instructions Allan Sandfeld Jensen
@ 2018-08-01 16:51 ` Marc Glisse
  2018-08-02  9:12   ` Allan Sandfeld Jensen
  2018-08-02 20:39   ` Allan Sandfeld Jensen
  2018-08-11  9:05 ` Allan Sandfeld Jensen
  1 sibling, 2 replies; 16+ messages in thread
From: Marc Glisse @ 2018-08-01 16:51 UTC (permalink / raw)
  To: Allan Sandfeld Jensen; +Cc: gcc-patches, Jakub Jelinek

On Wed, 1 Aug 2018, Allan Sandfeld Jensen wrote:

>  extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
> __artificial__))
>  _mm_move_sd (__m128d __A, __m128d __B)
>  {
> -  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
> +  return __extension__ (__m128d)(__v2df){__B[0],__A[1]};
>  }

If the goal is to have it represented as a VEC_PERM_EXPR internally, I 
wonder if we should be explicit and use __builtin_shuffle instead of 
relying on some forwprop pass to transform it. Maybe not, just asking. And 
the answer need not even be the same for _mm_move_sd and _mm_move_ss.

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH][x86] Match movss and movsd "blend" instructions
  2018-08-01 16:51 ` Marc Glisse
@ 2018-08-02  9:12   ` Allan Sandfeld Jensen
  2018-08-02  9:18     ` Richard Biener
  2018-08-02 20:39   ` Allan Sandfeld Jensen
  1 sibling, 1 reply; 16+ messages in thread
From: Allan Sandfeld Jensen @ 2018-08-02  9:12 UTC (permalink / raw)
  To: gcc-patches; +Cc: Marc Glisse, Jakub Jelinek

On Mittwoch, 1. August 2018 18:51:41 CEST Marc Glisse wrote:
> On Wed, 1 Aug 2018, Allan Sandfeld Jensen wrote:
> >  extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
> > 
> > __artificial__))
> > 
> >  _mm_move_sd (__m128d __A, __m128d __B)
> >  {
> > 
> > -  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
> > +  return __extension__ (__m128d)(__v2df){__B[0],__A[1]};
> > 
> >  }
> 
> If the goal is to have it represented as a VEC_PERM_EXPR internally, I
> wonder if we should be explicit and use __builtin_shuffle instead of
> relying on some forwprop pass to transform it. Maybe not, just asking. And
> the answer need not even be the same for _mm_move_sd and _mm_move_ss.

I wrote it this way because this pattern could later also be used for the 
other _ss intrinsics, such as _mm_add_ss, where a _builtin_shuffle could not. 
To match the other intrinsics the logic that tries to match vector 
construction just needs to be extended to try merge patterns even if one of 
the subexpressions is not simple.

'Allan


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH][x86] Match movss and movsd "blend" instructions
  2018-08-02  9:12   ` Allan Sandfeld Jensen
@ 2018-08-02  9:18     ` Richard Biener
  2018-08-02 20:51       ` Allan Sandfeld Jensen
  0 siblings, 1 reply; 16+ messages in thread
From: Richard Biener @ 2018-08-02  9:18 UTC (permalink / raw)
  To: Allan Sandfeld Jensen; +Cc: GCC Patches, Marc Glisse, Jakub Jelinek

On Thu, Aug 2, 2018 at 11:12 AM Allan Sandfeld Jensen
<linux@carewolf.com> wrote:
>
> On Mittwoch, 1. August 2018 18:51:41 CEST Marc Glisse wrote:
> > On Wed, 1 Aug 2018, Allan Sandfeld Jensen wrote:
> > >  extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
> > >
> > > __artificial__))
> > >
> > >  _mm_move_sd (__m128d __A, __m128d __B)
> > >  {
> > >
> > > -  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
> > > +  return __extension__ (__m128d)(__v2df){__B[0],__A[1]};
> > >
> > >  }
> >
> > If the goal is to have it represented as a VEC_PERM_EXPR internally, I
> > wonder if we should be explicit and use __builtin_shuffle instead of
> > relying on some forwprop pass to transform it. Maybe not, just asking. And
> > the answer need not even be the same for _mm_move_sd and _mm_move_ss.
>
> I wrote it this way because this pattern could later also be used for the
> other _ss intrinsics, such as _mm_add_ss, where a _builtin_shuffle could not.
> To match the other intrinsics the logic that tries to match vector
> construction just needs to be extended to try merge patterns even if one of
> the subexpressions is not simple.

The question is what users expect and get when they use -O0 with intrinsics?

Richard.

> 'Allan
>
>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH][x86] Match movss and movsd "blend" instructions
  2018-08-01 16:51 ` Marc Glisse
  2018-08-02  9:12   ` Allan Sandfeld Jensen
@ 2018-08-02 20:39   ` Allan Sandfeld Jensen
  1 sibling, 0 replies; 16+ messages in thread
From: Allan Sandfeld Jensen @ 2018-08-02 20:39 UTC (permalink / raw)
  To: gcc-patches; +Cc: Marc Glisse, Jakub Jelinek

On Mittwoch, 1. August 2018 18:51:41 CEST Marc Glisse wrote:
> On Wed, 1 Aug 2018, Allan Sandfeld Jensen wrote:
> >  extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
> > 
> > __artificial__))
> > 
> >  _mm_move_sd (__m128d __A, __m128d __B)
> >  {
> > 
> > -  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
> > +  return __extension__ (__m128d)(__v2df){__B[0],__A[1]};
> > 
> >  }
> 
> If the goal is to have it represented as a VEC_PERM_EXPR internally, I
> wonder if we should be explicit and use __builtin_shuffle instead of
> relying on some forwprop pass to transform it. Maybe not, just asking. And
> the answer need not even be the same for _mm_move_sd and _mm_move_ss.

I forgot. One of the things that makes using __builtin_shuffle ugly is that
__v4si, which is needed as the shuffle argument in _mm_move_ss, is declared
in emmintrin.h, but _mm_move_ss is in xmmintrin.h.

In general the gcc __builtin_shuffle syntax, with the permutator argument being a
vector, is kind of awkward. At least for declaring intrinsics, the clang style,
where the permutator is passed as extra scalar arguments, is easier to deal with:
__builtin_shuffle(a, b, (__v4si){4, 0, 1, 2})
 vs
 __builtin_shuffle(a, b, 4, 0, 1, 2)





^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH][x86] Match movss and movsd "blend" instructions
  2018-08-02  9:18     ` Richard Biener
@ 2018-08-02 20:51       ` Allan Sandfeld Jensen
  2018-08-02 21:15         ` Marc Glisse
  2018-08-02 21:46         ` Jakub Jelinek
  0 siblings, 2 replies; 16+ messages in thread
From: Allan Sandfeld Jensen @ 2018-08-02 20:51 UTC (permalink / raw)
  To: gcc-patches; +Cc: Richard Biener, Marc Glisse, Jakub Jelinek

[-- Attachment #1: Type: text/plain, Size: 1518 bytes --]

On Donnerstag, 2. August 2018 11:18:41 CEST Richard Biener wrote:
> On Thu, Aug 2, 2018 at 11:12 AM Allan Sandfeld Jensen
> 
> <linux@carewolf.com> wrote:
> > On Mittwoch, 1. August 2018 18:51:41 CEST Marc Glisse wrote:
> > > On Wed, 1 Aug 2018, Allan Sandfeld Jensen wrote:
> > > >  extern __inline __m128d __attribute__((__gnu_inline__,
> > > >  __always_inline__,
> > > > 
> > > > __artificial__))
> > > > 
> > > >  _mm_move_sd (__m128d __A, __m128d __B)
> > > >  {
> > > > 
> > > > -  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
> > > > +  return __extension__ (__m128d)(__v2df){__B[0],__A[1]};
> > > > 
> > > >  }
> > > 
> > > If the goal is to have it represented as a VEC_PERM_EXPR internally, I
> > > wonder if we should be explicit and use __builtin_shuffle instead of
> > > relying on some forwprop pass to transform it. Maybe not, just asking.
> > > And
> > > the answer need not even be the same for _mm_move_sd and _mm_move_ss.
> > 
> > I wrote it this way because this pattern could later also be used for the
> > other _ss intrinsics, such as _mm_add_ss, where a _builtin_shuffle could
> > not. To match the other intrinsics the logic that tries to match vector
> > construction just needs to be extended to try merge patterns even if one
> > of the subexpressions is not simple.
> 
> The question is what users expect and get when they use -O0 with intrinsics?
> 
> Richard.
> 
Here is the version with __builtin_shuffle. It might behave more predictably at
-O0, but it is also uglier.


[-- Attachment #2: movss-movsd-blends.patch --]
[-- Type: text/x-patch, Size: 2939 bytes --]

diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h
index b940a39d27b..6501638f619 100644
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -113,7 +113,7 @@ _mm_setzero_pd (void)
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_move_sd (__m128d __A, __m128d __B)
 {
-  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
+  return __extension__ (__m128d) __builtin_shuffle((__v2df)__A, (__v2df)__B, (__v2di){2, 1});
 }
 
 /* Load two DPFP values from P.  The address must be 16-byte aligned.  */
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index ee409cfe7e4..2337ef5ea08 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -46143,6 +46143,46 @@ expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
   return ok;
 }
 
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
+   using movss or movsd.  */
+static bool
+expand_vec_perm_movs (struct expand_vec_perm_d *d)
+{
+  machine_mode vmode = d->vmode;
+  unsigned i, nelt = d->nelt;
+  rtx x;
+
+  if (d->one_operand_p)
+    return false;
+
+  if (TARGET_SSE2 && (vmode == V2DFmode || vmode == V4SFmode))
+    ;
+  else
+    return false;
+
+  /* Only the first element is changed. */
+  if (d->perm[0] != nelt && d->perm[0] != 0)
+    return false;
+  for (i = 1; i < nelt; ++i) {
+    {
+      if (d->perm[i] != i + nelt - d->perm[0])
+        return false;
+    }
+  }
+
+  if (d->testing_p)
+    return true;
+
+  if (d->perm[0] == nelt)
+    x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
+  else
+    x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
+
+  emit_insn (gen_rtx_SET (d->target, x));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
    in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
 
@@ -46885,6 +46925,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
 	}
     }
 
+  /* Try movss/movsd instructions.  */
+  if (expand_vec_perm_movs (d))
+    return true;
+
   /* Finally, try the fully general two operand permute.  */
   if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
 			      d->testing_p))
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index f64f3f74a0b..45b99ff87d5 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -1011,7 +1011,8 @@ _mm_storer_ps (float *__P, __m128 __A)
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_move_ss (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
+  return __extension__ (__m128) __builtin_shuffle((__v4sf)__A, (__v4sf)__B,
+                                                  (__attribute__((__vector_size__ (16))) int){4, 1, 2, 3});
 }
 
 /* Extracts one of the four words of A.  The selector N must be immediate.  */

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH][x86] Match movss and movsd "blend" instructions
  2018-08-02 20:51       ` Allan Sandfeld Jensen
@ 2018-08-02 21:15         ` Marc Glisse
  2018-08-02 21:52           ` Allan Sandfeld Jensen
  2018-08-02 21:46         ` Jakub Jelinek
  1 sibling, 1 reply; 16+ messages in thread
From: Marc Glisse @ 2018-08-02 21:15 UTC (permalink / raw)
  To: Allan Sandfeld Jensen; +Cc: gcc-patches, Jakub Jelinek, Richard Biener

On Thu, 2 Aug 2018, Allan Sandfeld Jensen wrote:

> I forgot. One of the things that makes using __builtin_shuffle ugly is that
> __v4si  as the suffle argument needs to be in _mm_move_ss, is declared
> in emmintrin.h, but _mm_move_ss is in xmmintrin.h.

__v4si is some internal detail, I don't see much issue with moving it to 
xmmintrin.h if you want to use it there.

> In general the gcc __builtin_shuffle syntax with the argument being a vector
> is kind of ackward. At least for the declaring intrinsics, the clang still
> where the permutator is extra argument is easier to deal with:
> __builtin_shuffle(a, b, (__v4si){4, 0, 1, 2})
> vs
> __builtin_shuffle(a, b, 4, 0, 1, 2)

__builtin_shufflevector IIRC


>> The question is what users expect and get when they use -O0 with intrinsics?
>>
> Here is the version with __builtin_shuffle. It might be more expectable -O0,
> but it is also uglier.

I am not convinced -O0 is very important.

If you start extending your approach to _mm_add_sd and others, while one 
instruction is easy enough to recognize, if we put several in a row, they 
will be partially simplified and may become harder to recognize.
{ x*(y+v[0]-z), v[1] } requires that you notice that the upper part of 
this vector is v[1], i.e. the upper part of a vector whose lower part 
appears somewhere in the arbitrarily complex expression for the lower 
part of the result. And you then have to propagate the fact that you are 
doing vector operations all the way back to v[0].

I don't have a strong opinion on what the best approach is.

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH][x86] Match movss and movsd "blend" instructions
  2018-08-02 20:51       ` Allan Sandfeld Jensen
  2018-08-02 21:15         ` Marc Glisse
@ 2018-08-02 21:46         ` Jakub Jelinek
  2018-08-02 21:51           ` Allan Sandfeld Jensen
  1 sibling, 1 reply; 16+ messages in thread
From: Jakub Jelinek @ 2018-08-02 21:46 UTC (permalink / raw)
  To: Allan Sandfeld Jensen; +Cc: gcc-patches, Richard Biener, Marc Glisse

On Thu, Aug 02, 2018 at 10:50:58PM +0200, Allan Sandfeld Jensen wrote:
> Here is the version with __builtin_shuffle. It might be more expectable -O0, 
> but it is also uglier.

I don't find anything ugly on it, except the formatting glitches (missing
space before (, overlong line, and useless __extension__.
Improving code generated for __builtin_shuffle is desirable too.

> --- a/gcc/config/i386/xmmintrin.h
> +++ b/gcc/config/i386/xmmintrin.h
> @@ -1011,7 +1011,8 @@ _mm_storer_ps (float *__P, __m128 __A)
>  extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_move_ss (__m128 __A, __m128 __B)
>  {
> -  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
> +  return __extension__ (__m128) __builtin_shuffle((__v4sf)__A, (__v4sf)__B,
> +                                                  (__attribute__((__vector_size__ (16))) int){4, 1, 2, 3});

And obviously use __v4si here instead of __attribute__((__vector_size__ (16))) int.

	Jakub

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH][x86] Match movss and movsd "blend" instructions
  2018-08-02 21:46         ` Jakub Jelinek
@ 2018-08-02 21:51           ` Allan Sandfeld Jensen
  0 siblings, 0 replies; 16+ messages in thread
From: Allan Sandfeld Jensen @ 2018-08-02 21:51 UTC (permalink / raw)
  To: gcc-patches, Jakub Jelinek; +Cc: Richard Biener, Marc Glisse

On Donnerstag, 2. August 2018 23:46:37 CEST Jakub Jelinek wrote:
> On Thu, Aug 02, 2018 at 10:50:58PM +0200, Allan Sandfeld Jensen wrote:
> > Here is the version with __builtin_shuffle. It might be more expectable
> > -O0, but it is also uglier.
> 
> I don't find anything ugly on it, except the formatting glitches (missing
> space before (, overlong line, and useless __extension__.
> Improving code generated for __builtin_shuffle is desirable too.
> 

__extension__ is needed when using the {...} initialization, otherwise
-std=c89 will produce warnings about standards.  The line is a bit long, but I
thought it looked better like this rather than adding any emergency line
breaks. Is there a hard limit?

> > --- a/gcc/config/i386/xmmintrin.h
> > +++ b/gcc/config/i386/xmmintrin.h
> > @@ -1011,7 +1011,8 @@ _mm_storer_ps (float *__P, __m128 __A)
> > 
> >  extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
> >  __artificial__)) _mm_move_ss (__m128 __A, __m128 __B)
> >  {
> > 
> > -  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
> > +  return __extension__ (__m128) __builtin_shuffle((__v4sf)__A,
> > (__v4sf)__B, +                                                 
> > (__attribute__((__vector_size__ (16))) int){4, 1, 2, 3});
> And obviously use __v4si here instead of __attribute__((__vector_size__
> (16))) int.
> 
__v4si is declared in emmintrin.h, so I couldn't use it here unless I moved 
the definition. I tried changing as little as possible to not trigger bike 
shedding.

'Allan



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH][x86] Match movss and movsd "blend" instructions
  2018-08-02 21:15         ` Marc Glisse
@ 2018-08-02 21:52           ` Allan Sandfeld Jensen
  0 siblings, 0 replies; 16+ messages in thread
From: Allan Sandfeld Jensen @ 2018-08-02 21:52 UTC (permalink / raw)
  To: gcc-patches; +Cc: Marc Glisse, Jakub Jelinek, Richard Biener

On Donnerstag, 2. August 2018 23:15:28 CEST Marc Glisse wrote:
> On Thu, 2 Aug 2018, Allan Sandfeld Jensen wrote:
> > I forgot. One of the things that makes using __builtin_shuffle ugly is
> > that
> > __v4si  as the suffle argument needs to be in _mm_move_ss, is declared
> > in emmintrin.h, but _mm_move_ss is in xmmintrin.h.
> 
> __v4si is some internal detail, I don't see much issue with moving it to
> xmmintrin.h if you want to use it there.
> 
> > In general the gcc __builtin_shuffle syntax with the argument being a
> > vector is kind of ackward. At least for the declaring intrinsics, the
> > clang still where the permutator is extra argument is easier to deal
> > with:
> > __builtin_shuffle(a, b, (__v4si){4, 0, 1, 2})
> > vs
> > __builtin_shuffle(a, b, 4, 0, 1, 2)
> 
> __builtin_shufflevector IIRC
> 
> >> The question is what users expect and get when they use -O0 with
> >> intrinsics?> 
> > Here is the version with __builtin_shuffle. It might be more expectable
> > -O0, but it is also uglier.
> 
> I am not convinced -O0 is very important.
> 
Me neither, and in any case I would argue the logic that recognizes the vector
construction patterns is not optimization but instruction matching.

> If you start extending your approach to _mm_add_sd and others, while one
> instruction is easy enough to recognize, if we put several in a row, they
> will be partially simplified and may become harder to recognize.
> { x*(y+v[0]-z), v[1] } requires that you notice that the upper part of
> this vector is v[1], i.e. the upper part of a vector whose lower part
> appears somewhere in the arbitrarily complex expression for the lower
> part of the result. And you then have to propagate the fact that you are
> doing vector operations all the way back to v[0].
> 
> I don't have a strong opinion on what the best approach is.

Yes, I am not sure all of those could be done exhaustively with the existing 
logic, and it might also be of dubious value as in almost all cases the ps 
instructions have the same latency and bandwidth as the ss instructions, so 
developers should probably use _ps versions as they are scheduled better by 
the compiler (or at least better by gcc).
It was just an idea, and I haven't tried it at this point.

'Allan



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH][x86] Match movss and movsd "blend" instructions
  2018-08-01 16:00 [PATCH][x86] Match movss and movsd "blend" instructions Allan Sandfeld Jensen
  2018-08-01 16:51 ` Marc Glisse
@ 2018-08-11  9:05 ` Allan Sandfeld Jensen
  2018-08-11  9:18   ` Jakub Jelinek
  1 sibling, 1 reply; 16+ messages in thread
From: Allan Sandfeld Jensen @ 2018-08-11  9:05 UTC (permalink / raw)
  To: gcc-patches, Jakub Jelinek; +Cc: Marc Glisse

[-- Attachment #1: Type: text/plain, Size: 486 bytes --]

Updated:

Match movss and movsd "blend" instructions

Adds the ability to match movss and movsd as blend patterns,
implemented in a new method to be able to match these before shuffles,
while keeping other blends after.

2018-08-11 Allan Sandfeld Jensen <allan.jensen@qt.io>

gcc/config/i386

    * i386.cc (expand_vec_perm_movs): New method matching movs
    patterns.
    * i386.cc (expand_vec_perm_1): Try the new method.

gcc/testsuite

    * gcc.target/i386/sse2-movs.c: New test.

[-- Attachment #2: movss-movsd-blends-2.patch --]
[-- Type: text/x-patch, Size: 3002 bytes --]

diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h
index b940a39d27b..6501638f619 100644
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -113,7 +113,7 @@ _mm_setzero_pd (void)
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_move_sd (__m128d __A, __m128d __B)
 {
-  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
+  return __extension__ (__m128d) __builtin_shuffle((__v2df)__A, (__v2df)__B, (__v2di){2, 1});
 }
 
 /* Load two DPFP values from P.  The address must be 16-byte aligned.  */
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 7554fd1f659..485850096e9 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -46145,6 +46145,46 @@ expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
   return ok;
 }
 
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
+   using movss or movsd.  */
+static bool
+expand_vec_perm_movs (struct expand_vec_perm_d *d)
+{
+  machine_mode vmode = d->vmode;
+  unsigned i, nelt = d->nelt;
+  rtx x;
+
+  if (d->one_operand_p)
+    return false;
+
+  if (TARGET_SSE2 && (vmode == V2DFmode || vmode == V4SFmode))
+    ;
+  else
+    return false;
+
+  /* Only the first element is changed. */
+  if (d->perm[0] != nelt && d->perm[0] != 0)
+    return false;
+  for (i = 1; i < nelt; ++i) {
+    {
+      if (d->perm[i] != i + nelt - d->perm[0])
+        return false;
+    }
+  }
+
+  if (d->testing_p)
+    return true;
+
+  if (d->perm[0] == nelt)
+    x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
+  else
+    x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
+
+  emit_insn (gen_rtx_SET (d->target, x));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
    in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
 
@@ -46887,6 +46927,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
 	}
     }
 
+  /* Try movss/movsd instructions.  */
+  if (expand_vec_perm_movs (d))
+    return true;
+
   /* Finally, try the fully general two operand permute.  */
   if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
 			      d->testing_p))
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index f64f3f74a0b..f770570295c 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -1011,7 +1011,10 @@ _mm_storer_ps (float *__P, __m128 __A)
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_move_ss (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
+  return (__m128) __builtin_shuffle ((__v4sf)__A, (__v4sf)__B,
+                                     __extension__
+                                     (__attribute__((__vector_size__ (16))) int)
+                                     {4,1,2,3});
 }
 
 /* Extracts one of the four words of A.  The selector N must be immediate.  */

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH][x86] Match movss and movsd "blend" instructions
  2018-08-11  9:05 ` Allan Sandfeld Jensen
@ 2018-08-11  9:18   ` Jakub Jelinek
  2018-08-11  9:55     ` Allan Sandfeld Jensen
  0 siblings, 1 reply; 16+ messages in thread
From: Jakub Jelinek @ 2018-08-11  9:18 UTC (permalink / raw)
  To: Allan Sandfeld Jensen, Uros Bizjak; +Cc: gcc-patches, Marc Glisse

On Sat, Aug 11, 2018 at 10:59:26AM +0200, Allan Sandfeld Jensen wrote:
> +/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
> +   using movss or movsd.  */
> +static bool
> +expand_vec_perm_movs (struct expand_vec_perm_d *d)
> +{
> +  machine_mode vmode = d->vmode;
> +  unsigned i, nelt = d->nelt;
> +  rtx x;
> +
> +  if (d->one_operand_p)
> +    return false;
> +
> +  if (TARGET_SSE2 && (vmode == V2DFmode || vmode == V4SFmode))
> +    ;
> +  else
> +    return false;
> +
> +  /* Only the first element is changed. */

Two spaces after .

> +  if (d->perm[0] != nelt && d->perm[0] != 0)
> +    return false;
> +  for (i = 1; i < nelt; ++i) {
> +    {
> +      if (d->perm[i] != i + nelt - d->perm[0])
> +        return false;
> +    }
> +  }

Extraneous {}s (both pairs, the outer ones even badly indented).

Otherwise LGTM.

	Jakub

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH][x86] Match movss and movsd "blend" instructions
  2018-08-11  9:18   ` Jakub Jelinek
@ 2018-08-11  9:55     ` Allan Sandfeld Jensen
  2018-08-12 14:39       ` Uros Bizjak
  2018-08-15  4:34       ` Jeff Law
  0 siblings, 2 replies; 16+ messages in thread
From: Allan Sandfeld Jensen @ 2018-08-11  9:55 UTC (permalink / raw)
  To: gcc-patches, Jakub Jelinek; +Cc: Uros Bizjak, Marc Glisse

[-- Attachment #1: Type: text/plain, Size: 1108 bytes --]

On Samstag, 11. August 2018 11:18:39 CEST Jakub Jelinek wrote:
> On Sat, Aug 11, 2018 at 10:59:26AM +0200, Allan Sandfeld Jensen wrote:
> > +/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
> > +   using movss or movsd.  */
> > +static bool
> > +expand_vec_perm_movs (struct expand_vec_perm_d *d)
> > +{
> > +  machine_mode vmode = d->vmode;
> > +  unsigned i, nelt = d->nelt;
> > +  rtx x;
> > +
> > +  if (d->one_operand_p)
> > +    return false;
> > +
> > +  if (TARGET_SSE2 && (vmode == V2DFmode || vmode == V4SFmode))
> > +    ;
> > +  else
> > +    return false;
> > +
> > +  /* Only the first element is changed. */
> 
> Two spaces after .
> 
> > +  if (d->perm[0] != nelt && d->perm[0] != 0)
> > +    return false;
> > +  for (i = 1; i < nelt; ++i) {
> > +    {
> > +      if (d->perm[i] != i + nelt - d->perm[0])
> > +        return false;
> > +    }
> > +  }
> 
> Extraneous {}s (both pairs, the outer ones even badly indented).
> 
> Otherwise LGTM.
> 
Updated:

Note that as an infrequent contributor I don't have commit access, so I need
someone reviewing to also commit.

'Allan

[-- Attachment #2: 0001-Match-movss-and-movsd-blend-instructions.patch --]
[-- Type: text/x-patch, Size: 3846 bytes --]

From e33241e5ddc7fa57c4ba7893669af7f7e636125e Mon Sep 17 00:00:00 2001
From: Allan Sandfeld Jensen <allan.jensen@qt.io>
Date: Sat, 11 Aug 2018 11:52:21 +0200
Subject: [PATCH] Match movss and movsd "blend" instructions

Adds the ability to match movss and movsd as blend patterns,
implemented in a new method to be able to match these before shuffles,
while keeping other blends after.

2018-08-11 Allan Sandfeld Jensen <allan.jensen@qt.io>

gcc/config/i386

    * i386.c (expand_vec_perm_movs): New method matching movs
    patterns.
    * i386.c (expand_vec_perm_1): Try the new method.

gcc/testsuite

    * gcc.target/i386/sse2-movs.c: New test.
---
 gcc/config/i386/emmintrin.h |  2 +-
 gcc/config/i386/i386.c      | 41 +++++++++++++++++++++++++++++++++++++
 gcc/config/i386/xmmintrin.h |  5 ++++-
 3 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h
index b940a39d27b..6501638f619 100644
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -113,7 +113,7 @@ _mm_setzero_pd (void)
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_move_sd (__m128d __A, __m128d __B)
 {
-  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
+  return __extension__ (__m128d) __builtin_shuffle((__v2df)__A, (__v2df)__B, (__v2di){2, 1});
 }
 
 /* Load two DPFP values from P.  The address must be 16-byte aligned.  */
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 7554fd1f659..15a3caa94c3 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -46145,6 +46145,43 @@ expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
   return ok;
 }
 
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
+   using movss or movsd.  */
+static bool
+expand_vec_perm_movs (struct expand_vec_perm_d *d)
+{
+  machine_mode vmode = d->vmode;
+  unsigned i, nelt = d->nelt;
+  rtx x;
+
+  if (d->one_operand_p)
+    return false;
+
+  if (TARGET_SSE2 && (vmode == V2DFmode || vmode == V4SFmode))
+    ;
+  else
+    return false;
+
+  /* Only the first element is changed.  */
+  if (d->perm[0] != nelt && d->perm[0] != 0)
+    return false;
+  for (i = 1; i < nelt; ++i)
+    if (d->perm[i] != i + nelt - d->perm[0])
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  if (d->perm[0] == nelt)
+    x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
+  else
+    x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
+
+  emit_insn (gen_rtx_SET (d->target, x));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
    in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
 
@@ -46887,6 +46924,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
 	}
     }
 
+  /* Try movss/movsd instructions.  */
+  if (expand_vec_perm_movs (d))
+    return true;
+
   /* Finally, try the fully general two operand permute.  */
   if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
 			      d->testing_p))
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index f64f3f74a0b..f770570295c 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -1011,7 +1011,10 @@ _mm_storer_ps (float *__P, __m128 __A)
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_move_ss (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
+  return (__m128) __builtin_shuffle ((__v4sf)__A, (__v4sf)__B,
+                                     __extension__
+                                     (__attribute__((__vector_size__ (16))) int)
+                                     {4,1,2,3});
 }
 
 /* Extracts one of the four words of A.  The selector N must be immediate.  */
-- 
2.17.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH][x86] Match movss and movsd "blend" instructions
  2018-08-11  9:55     ` Allan Sandfeld Jensen
@ 2018-08-12 14:39       ` Uros Bizjak
  2018-08-15  4:34       ` Jeff Law
  1 sibling, 0 replies; 16+ messages in thread
From: Uros Bizjak @ 2018-08-12 14:39 UTC (permalink / raw)
  To: Allan Sandfeld Jensen; +Cc: gcc-patches, Jakub Jelinek, Marc Glisse

On Sat, Aug 11, 2018 at 11:54 AM, Allan Sandfeld Jensen
<linux@carewolf.com> wrote:
> On Samstag, 11. August 2018 11:18:39 CEST Jakub Jelinek wrote:
>> On Sat, Aug 11, 2018 at 10:59:26AM +0200, Allan Sandfeld Jensen wrote:
>> > +/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
>> > +   using movss or movsd.  */
>> > +static bool
>> > +expand_vec_perm_movs (struct expand_vec_perm_d *d)
>> > +{
>> > +  machine_mode vmode = d->vmode;
>> > +  unsigned i, nelt = d->nelt;
>> > +  rtx x;
>> > +
>> > +  if (d->one_operand_p)
>> > +    return false;
>> > +
>> > +  if (TARGET_SSE2 && (vmode == V2DFmode || vmode == V4SFmode))
>> > +    ;
>> > +  else
>> > +    return false;
>> > +
>> > +  /* Only the first element is changed. */
>>
>> Two spaces after .
>>
>> > +  if (d->perm[0] != nelt && d->perm[0] != 0)
>> > +    return false;
>> > +  for (i = 1; i < nelt; ++i) {
>> > +    {
>> > +      if (d->perm[i] != i + nelt - d->perm[0])
>> > +        return false;
>> > +    }
>> > +  }
>>
>> Extraneous {}s (both pairs, the outer ones even badly indented).
>>
>> Otherwise LGTM.
>>
> Updated:
>
> Note as an infrequent contributor don't have commit access, so I need someone
> reviewing to also commit.

+  if (TARGET_SSE2 && (vmode == V2DFmode || vmode == V4SFmode))
+    ;
+  else
+    return false;

V4SFmode can be used with TARGET_SSE only.

Uros.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH][x86] Match movss and movsd "blend" instructions
  2018-08-11  9:55     ` Allan Sandfeld Jensen
  2018-08-12 14:39       ` Uros Bizjak
@ 2018-08-15  4:34       ` Jeff Law
  2018-08-15 19:37         ` Uros Bizjak
  1 sibling, 1 reply; 16+ messages in thread
From: Jeff Law @ 2018-08-15  4:34 UTC (permalink / raw)
  To: Allan Sandfeld Jensen, gcc-patches, Jakub Jelinek
  Cc: Uros Bizjak, Marc Glisse

On 08/11/2018 03:54 AM, Allan Sandfeld Jensen wrote:
> On Samstag, 11. August 2018 11:18:39 CEST Jakub Jelinek wrote:
>> On Sat, Aug 11, 2018 at 10:59:26AM +0200, Allan Sandfeld Jensen wrote:
>>> +/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
>>> +   using movss or movsd.  */
>>> +static bool
>>> +expand_vec_perm_movs (struct expand_vec_perm_d *d)
>>> +{
>>> +  machine_mode vmode = d->vmode;
>>> +  unsigned i, nelt = d->nelt;
>>> +  rtx x;
>>> +
>>> +  if (d->one_operand_p)
>>> +    return false;
>>> +
>>> +  if (TARGET_SSE2 && (vmode == V2DFmode || vmode == V4SFmode))
>>> +    ;
>>> +  else
>>> +    return false;
>>> +
>>> +  /* Only the first element is changed. */
>>
>> Two spaces after .
>>
>>> +  if (d->perm[0] != nelt && d->perm[0] != 0)
>>> +    return false;
>>> +  for (i = 1; i < nelt; ++i) {
>>> +    {
>>> +      if (d->perm[i] != i + nelt - d->perm[0])
>>> +        return false;
>>> +    }
>>> +  }
>>
>> Extraneous {}s (both pairs, the outer ones even badly indented).
>>
>> Otherwise LGTM.
>>
> Updated:
> 
> Note as an infrequent contributor don't have commit access, so I need someone 
> reviewing to also commit.
I fixed up the ChangeLog, extracted the test from the original patch and
committed all the bits to the trunk.

Thanks,
jeff

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH][x86] Match movss and movsd "blend" instructions
  2018-08-15  4:34       ` Jeff Law
@ 2018-08-15 19:37         ` Uros Bizjak
  0 siblings, 0 replies; 16+ messages in thread
From: Uros Bizjak @ 2018-08-15 19:37 UTC (permalink / raw)
  To: Jeff Law; +Cc: Allan Sandfeld Jensen, gcc-patches, Jakub Jelinek, Marc Glisse

[-- Attachment #1: Type: text/plain, Size: 1653 bytes --]

On Wed, Aug 15, 2018 at 6:33 AM, Jeff Law <law@redhat.com> wrote:
> On 08/11/2018 03:54 AM, Allan Sandfeld Jensen wrote:
>> On Samstag, 11. August 2018 11:18:39 CEST Jakub Jelinek wrote:
>>> On Sat, Aug 11, 2018 at 10:59:26AM +0200, Allan Sandfeld Jensen wrote:
>>>> +/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
>>>> +   using movss or movsd.  */
>>>> +static bool
>>>> +expand_vec_perm_movs (struct expand_vec_perm_d *d)
>>>> +{
>>>> +  machine_mode vmode = d->vmode;
>>>> +  unsigned i, nelt = d->nelt;
>>>> +  rtx x;
>>>> +
>>>> +  if (d->one_operand_p)
>>>> +    return false;
>>>> +
>>>> +  if (TARGET_SSE2 && (vmode == V2DFmode || vmode == V4SFmode))
>>>> +    ;
>>>> +  else
>>>> +    return false;
>>>> +
>>>> +  /* Only the first element is changed. */
>>>
>>> Two spaces after .
>>>
>>>> +  if (d->perm[0] != nelt && d->perm[0] != 0)
>>>> +    return false;
>>>> +  for (i = 1; i < nelt; ++i) {
>>>> +    {
>>>> +      if (d->perm[i] != i + nelt - d->perm[0])
>>>> +        return false;
>>>> +    }
>>>> +  }
>>>
>>> Extraneous {}s (both pairs, the outer ones even badly indented).
>>>
>>> Otherwise LGTM.
>>>
>> Updated:
>>
>> Note as an infrequent contributor don't have commit access, so I need someone
>> reviewing to also commit.
> I fixed up the ChangeLog, extracted the test from the original patch and
> committed all the bits to the trunk.

I have amended the committed code with attached fixup patch.

2018-08-15  Uros Bizjak  <ubizjak@gmail.com>

    * config/i386/i386.c (expand_vec_perm_movs): Enable V4SFmode
    for TARGET_SSE.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Uros.

[-- Attachment #2: p.diff.txt --]
[-- Type: text/plain, Size: 528 bytes --]

Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 263563)
+++ config/i386/i386.c	(working copy)
@@ -46157,9 +46157,8 @@ expand_vec_perm_movs (struct expand_vec_perm_d *d)
   if (d->one_operand_p)
     return false;
 
-  if (TARGET_SSE2 && (vmode == V2DFmode || vmode == V4SFmode))
-    ;
-  else
+  if (!(TARGET_SSE && vmode == V4SFmode)
+      && !(TARGET_SSE2 && vmode == V2DFmode))
     return false;
 
   /* Only the first element is changed.  */

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2018-08-15 19:37 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-08-01 16:00 [PATCH][x86] Match movss and movsd "blend" instructions Allan Sandfeld Jensen
2018-08-01 16:51 ` Marc Glisse
2018-08-02  9:12   ` Allan Sandfeld Jensen
2018-08-02  9:18     ` Richard Biener
2018-08-02 20:51       ` Allan Sandfeld Jensen
2018-08-02 21:15         ` Marc Glisse
2018-08-02 21:52           ` Allan Sandfeld Jensen
2018-08-02 21:46         ` Jakub Jelinek
2018-08-02 21:51           ` Allan Sandfeld Jensen
2018-08-02 20:39   ` Allan Sandfeld Jensen
2018-08-11  9:05 ` Allan Sandfeld Jensen
2018-08-11  9:18   ` Jakub Jelinek
2018-08-11  9:55     ` Allan Sandfeld Jensen
2018-08-12 14:39       ` Uros Bizjak
2018-08-15  4:34       ` Jeff Law
2018-08-15 19:37         ` Uros Bizjak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).