public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] Optimize vector init constructor
@ 2019-03-03 14:32 H.J. Lu
  2019-03-03 14:40 ` Andrew Pinski
  0 siblings, 1 reply; 17+ messages in thread
From: H.J. Lu @ 2019-03-03 14:32 UTC (permalink / raw)
  To: gcc-patches; +Cc: Richard Guenther

For vector init constructor:

---
typedef float __v4sf __attribute__ ((__vector_size__ (16)));

__v4sf
foo (__v4sf x, float f)
{
  __v4sf y = { f, x[1], x[2], x[3] };
  return y;
}
---

we can replace the vector init constructor with a vector copy or
permute followed by a single scalar insert:

  __v4sf D.1912;
  __v4sf D.1913;
  __v4sf D.1914;
  __v4sf y;

  x.0_1 = x;
  D.1912 = x.0_1;
  _2 = D.1912;
  D.1913 = _2;
  BIT_FIELD_REF <D.1913, 32, 0> = f;
  y = D.1913;
  D.1914 = y;
  return D.1914;

instead of

  __v4sf D.1962;
  __v4sf y;

  _1 = BIT_FIELD_REF <x, 32, 32>;
  _2 = BIT_FIELD_REF <x, 32, 64>;
  _3 = BIT_FIELD_REF <x, 32, 96>;
  y = {f, _1, _2, _3};
  D.1962 = y;
  return D.1962;

gcc/

	PR tree-optimization/88828
	* gimplify.c (gimplify_init_constructor): Optimize vector init
	constructor with vector copy or permute followed by a single
	scalar insert.

gcc/testsuite/

	PR tree-optimization/88828
	* gcc.target/i386/pr88828-1.c: New test.
	* gcc.target/i386/pr88828-2.c: Likewise.
	* gcc.target/i386/pr88828-3a.c: Likewise.
	* gcc.target/i386/pr88828-3b.c: Likewise.
	* gcc.target/i386/pr88828-4a.c: Likewise.
	* gcc.target/i386/pr88828-4b.c: Likewise.
	* gcc.target/i386/pr88828-5a.c: Likewise.
	* gcc.target/i386/pr88828-5b.c: Likewise.
	* gcc.target/i386/pr88828-6a.c: Likewise.
	* gcc.target/i386/pr88828-6b.c: Likewise.
---
 gcc/gimplify.c                             | 176 +++++++++++++++++++--
 gcc/testsuite/gcc.target/i386/pr88828-1.c  |  16 ++
 gcc/testsuite/gcc.target/i386/pr88828-2.c  |  17 ++
 gcc/testsuite/gcc.target/i386/pr88828-3a.c |  16 ++
 gcc/testsuite/gcc.target/i386/pr88828-3b.c |  18 +++
 gcc/testsuite/gcc.target/i386/pr88828-4a.c |  17 ++
 gcc/testsuite/gcc.target/i386/pr88828-4b.c |  20 +++
 gcc/testsuite/gcc.target/i386/pr88828-5a.c |  16 ++
 gcc/testsuite/gcc.target/i386/pr88828-5b.c |  18 +++
 gcc/testsuite/gcc.target/i386/pr88828-6a.c |  17 ++
 gcc/testsuite/gcc.target/i386/pr88828-6b.c |  19 +++
 11 files changed, 336 insertions(+), 14 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6b.c

diff --git a/gcc/gimplify.c b/gcc/gimplify.c
index 983635ba21f..893a4311f9e 100644
--- a/gcc/gimplify.c
+++ b/gcc/gimplify.c
@@ -5082,22 +5082,170 @@ gimplify_init_constructor (tree *expr_p, gimple_seq *pre_p, gimple_seq *post_p,
 	    TREE_CONSTANT (ctor) = 0;
 	  }
 
-	/* Vector types use CONSTRUCTOR all the way through gimple
-	   compilation as a general initializer.  */
-	FOR_EACH_VEC_SAFE_ELT (elts, ix, ce)
+	tree rhs_vector = NULL;
+	/* The vector element to replace scalar elements, which
+	   will be overridden by scalar insert.  */
+	tree vector_element = NULL;
+	/* The single scalar element.  */
+	tree scalar_element = NULL;
+	unsigned int scalar_idx = 0;
+	enum { unknown, copy, permute, init } operation = unknown;
+	bool insert = false;
+
+	/* Check if we can generate vector copy or permute followed by
+	   a single scalar insert.  */
+	if (TYPE_VECTOR_SUBPARTS (type).is_constant ())
 	  {
-	    enum gimplify_status tret;
-	    tret = gimplify_expr (&ce->value, pre_p, post_p, is_gimple_val,
-				  fb_rvalue);
-	    if (tret == GS_ERROR)
-	      ret = GS_ERROR;
-	    else if (TREE_STATIC (ctor)
-		     && !initializer_constant_valid_p (ce->value,
-						       TREE_TYPE (ce->value)))
-	      TREE_STATIC (ctor) = 0;
+	    /* If all RHS vector elements come from the same vector,
+	       we can use permute.  If all RHS vector elements come
+	       from the same vector in the same order, we can use
+	       copy.  */
+	    unsigned int nunits
+	      = TYPE_VECTOR_SUBPARTS (type).to_constant ();
+	    unsigned int nscalars = 0;
+	    unsigned int nvectors = 0;
+	    operation = unknown;
+	    FOR_EACH_VEC_SAFE_ELT (elts, ix, ce)
+	      if (TREE_CODE (ce->value) == ARRAY_REF
+		  || TREE_CODE (ce->value) == ARRAY_RANGE_REF)
+		{
+		  if (!vector_element)
+		    vector_element = ce->value;
+		  /* Get the vector index.  */
+		  tree idx = TREE_OPERAND (ce->value, 1);
+		  if (TREE_CODE (idx) == INTEGER_CST)
+		    {
+		      /* Get the RHS vector.  */
+		      tree r = ce->value;
+		      while (handled_component_p (r))
+			r = TREE_OPERAND (r, 0);
+		      if (type == TREE_TYPE (r))
+			{
+			  /* The RHS vector has the same type as
+			     LHS.  */
+			  if (rhs_vector == NULL)
+			    rhs_vector = r;
+
+			  /* Check if all RHS vector elements come
+			     from the same vector.  */
+			  if (rhs_vector == r)
+			    {
+			      nvectors++;
+			      if (TREE_INT_CST_LOW (idx) == ix
+				  && (operation == unknown
+				      || operation == copy))
+				operation = copy;
+			      else
+				operation = permute;
+			      continue;
+			    }
+			}
+		    }
+
+		  /* Otherwise, use vector init.  */
+		  break;
+		}
+	      else if (TREE_CODE (TYPE_SIZE (TREE_TYPE (ce->value)))
+		       == INTEGER_CST)
+		{
+		  /* Only allow one single scalar insert.  */
+		  if (nscalars != 0)
+		    break;
+		  nscalars = 1;
+		  insert = true;
+		  scalar_idx = ix;
+		  scalar_element = ce->value;
+		}
+
+	    /* Allow a single scalar insert with vector copy or
+	       vector permute.  Vector copy without insert is OK.  */
+	    if (nunits != (nscalars + nvectors)
+		|| (nscalars == 0 && operation != copy))
+	      operation = unknown;
+	  }
+
+	if (operation == unknown)
+	  {
+	    /* Default to the regular vector init constructor.  */
+	    operation = init;
+	    insert = false;
+	  }
+
+	if (operation == copy)
+	  {
+	    /* Generate a vector copy.  */
+	    tree var = create_tmp_var (type);
+	    if (gimplify_expr (&rhs_vector, pre_p, post_p,
+			       is_gimple_val, fb_rvalue) == GS_ERROR)
+	      {
+		ret = GS_ERROR;
+		break;
+	      }
+	    gassign *init = gimple_build_assign (var, rhs_vector);
+	    gimple_seq_add_stmt (pre_p, init);
+	    if (gimplify_expr (&var, pre_p, post_p, is_gimple_val,
+			       fb_rvalue) == GS_ERROR)
+	      {
+		ret = GS_ERROR;
+		break;
+	      }
+	    /* Replace RHS with the vector copy.  */
+	    if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0)))
+	      TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (var, pre_p);
+	    else
+	      TREE_OPERAND (*expr_p, 1) = var;
+	  }
+	else
+	  {
+	    /* Prepare for vector permute by replacing the scalar
+	       element with the vector one.  */
+	    if (operation == permute)
+	      (elts->address())[scalar_idx].value = vector_element;
+
+	    /* Vector types use CONSTRUCTOR all the way through gimple
+	       compilation as a general initializer.  */
+	    FOR_EACH_VEC_SAFE_ELT (elts, ix, ce)
+	      {
+		enum gimplify_status tret;
+		tret = gimplify_expr (&ce->value, pre_p, post_p,
+				      is_gimple_val,
+				      fb_rvalue);
+		if (tret == GS_ERROR)
+		  ret = GS_ERROR;
+		else if (TREE_STATIC (ctor)
+			 && !initializer_constant_valid_p (ce->value,
+							   TREE_TYPE (ce->value)))
+		  TREE_STATIC (ctor) = 0;
+	      }
+	    if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0)))
+	      TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (ctor, pre_p);
+	  }
+
+	if (insert)
+	  {
+	    /* Generate a single scalar insert after vector copy or
+	       permute.  */
+	    tree rhs = TREE_OPERAND (*expr_p, 1);
+	    tree var = create_tmp_var (type);
+	    gassign *init = gimple_build_assign (var, rhs);
+	    gimple_seq_add_stmt (pre_p, init);
+	    if (gimplify_expr (&scalar_element, pre_p, post_p,
+			       is_gimple_val, fb_rvalue) == GS_ERROR)
+	      {
+		ret = GS_ERROR;
+		break;
+	      }
+	    tree scalar_type = TREE_TYPE (scalar_element);
+	    tree scalar_size = TYPE_SIZE (scalar_type);
+	    tree bitpos = bitsize_int (scalar_idx
+				       * TREE_INT_CST_LOW (scalar_size));
+	    tree ref = build3_loc (EXPR_LOCATION (rhs), BIT_FIELD_REF,
+				   scalar_type, var, scalar_size,
+				   bitpos);
+	    init = gimple_build_assign (ref, scalar_element);
+	    gimplify_seq_add_stmt (pre_p, init);
+	    TREE_OPERAND (*expr_p, 1) = var;
 	  }
-	if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0)))
-	  TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (ctor, pre_p);
       }
       break;
 
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1.c b/gcc/testsuite/gcc.target/i386/pr88828-1.c
new file mode 100644
index 00000000000..4ef1feab389
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { f, x[1], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2.c b/gcc/testsuite/gcc.target/i386/pr88828-2.c
new file mode 100644
index 00000000000..6dc482b6f4b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = x;
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3a.c b/gcc/testsuite/gcc.target/i386/pr88828-3a.c
new file mode 100644
index 00000000000..97eb8e7162a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3a.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { f, x[0], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3b.c b/gcc/testsuite/gcc.target/i386/pr88828-3b.c
new file mode 100644
index 00000000000..ab2ba730716
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3b.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { f, x[0], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4a.c b/gcc/testsuite/gcc.target/i386/pr88828-4a.c
new file mode 100644
index 00000000000..a54689be701
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-4a.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[1] };
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4b.c b/gcc/testsuite/gcc.target/i386/pr88828-4b.c
new file mode 100644
index 00000000000..0c3a1024d93
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-4b.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[1] };
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5a.c b/gcc/testsuite/gcc.target/i386/pr88828-5a.c
new file mode 100644
index 00000000000..534808d3cd1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-5a.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 2 } } */
+/* { dg-final { scan-assembler-times "movaps" 1 } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], f };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5b.c b/gcc/testsuite/gcc.target/i386/pr88828-5b.c
new file mode 100644
index 00000000000..aebea790979
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-5b.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
+/* { dg-final { scan-assembler-not "vmovss" } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], f };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6a.c b/gcc/testsuite/gcc.target/i386/pr88828-6a.c
new file mode 100644
index 00000000000..d43a36d9137
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-6a.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 2 } } */
+/* { dg-final { scan-assembler-times "movaps" 1 } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[0] };
+  y[3] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6b.c b/gcc/testsuite/gcc.target/i386/pr88828-6b.c
new file mode 100644
index 00000000000..6856fe6500e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-6b.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovss" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[0] };
+  y[3] = f;
+  return y;
+}
-- 
2.20.1

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] Optimize vector init constructor
  2019-03-03 14:32 [PATCH] Optimize vector init constructor H.J. Lu
@ 2019-03-03 14:40 ` Andrew Pinski
  2019-03-03 21:13   ` H.J. Lu
  0 siblings, 1 reply; 17+ messages in thread
From: Andrew Pinski @ 2019-03-03 14:40 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GCC Patches, Richard Guenther

On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> For vector init constructor:
>
> ---
> typedef float __v4sf __attribute__ ((__vector_size__ (16)));
>
> __v4sf
> foo (__v4sf x, float f)
> {
>   __v4sf y = { f, x[1], x[2], x[3] };
>   return y;
> }
> ---
>
> we can optimize vector init constructor with vector copy or permute
> followed by a single scalar insert:
>
>   __v4sf D.1912;
>   __v4sf D.1913;
>   __v4sf D.1914;
>   __v4sf y;
>
>   x.0_1 = x;
>   D.1912 = x.0_1;
>   _2 = D.1912;
>   D.1913 = _2;
>   BIT_FIELD_REF <D.1913, 32, 0> = f;
>   y = D.1913;
>   D.1914 = y;
>   return D.1914;
>
> instead of
>
>   __v4sf D.1962;
>   __v4sf y;
>
>   _1 = BIT_FIELD_REF <x, 32, 32>;
>   _2 = BIT_FIELD_REF <x, 32, 64>;
>   _3 = BIT_FIELD_REF <x, 32, 96>;
>   y = {f, _1, _2, _3};
>   D.1962 = y;
>   return D.1962;
>
> gcc/
>
>         PR tree-optimization/88828
>         * gimplify.c (gimplify_init_constructor): Optimize vector init
>         constructor with vector copy or permute followed by a single
>         scalar insert.


Doing this here does not catch things like:
typedef float __v4sf __attribute__ ((__vector_size__ (16)));


__v4sf
vector_init (float f0,float f1, float f2,float f3)
{
  __v4sf y = { f0, f1, f2, f3 };
   return y;
}

__v4sf
foo (__v4sf x, float f)
{
  return vector_init (f, x[1], x[2], x[3]) ;
}

>
> gcc/testsuite/
>
>         PR tree-optimization/88828
>         * gcc.target/i386/pr88828-1.c: New test.
>         * gcc.target/i386/pr88828-2.c: Likewise.
>         * gcc.target/i386/pr88828-3a.c: Likewise.
>         * gcc.target/i386/pr88828-3b.c: Likewise.
>         * gcc.target/i386/pr88828-4a.c: Likewise.
>         * gcc.target/i386/pr88828-4b.c: Likewise.
>         * gcc.target/i386/pr88828-5a.c: Likewise.
>         * gcc.target/i386/pr88828-5b.c: Likewise.
>         * gcc.target/i386/pr88828-6a.c: Likewise.
>         * gcc.target/i386/pr88828-6b.c: Likewise.
> ---
>  gcc/gimplify.c                             | 176 +++++++++++++++++++--
>  gcc/testsuite/gcc.target/i386/pr88828-1.c  |  16 ++
>  gcc/testsuite/gcc.target/i386/pr88828-2.c  |  17 ++
>  gcc/testsuite/gcc.target/i386/pr88828-3a.c |  16 ++
>  gcc/testsuite/gcc.target/i386/pr88828-3b.c |  18 +++
>  gcc/testsuite/gcc.target/i386/pr88828-4a.c |  17 ++
>  gcc/testsuite/gcc.target/i386/pr88828-4b.c |  20 +++
>  gcc/testsuite/gcc.target/i386/pr88828-5a.c |  16 ++
>  gcc/testsuite/gcc.target/i386/pr88828-5b.c |  18 +++
>  gcc/testsuite/gcc.target/i386/pr88828-6a.c |  17 ++
>  gcc/testsuite/gcc.target/i386/pr88828-6b.c |  19 +++
>  11 files changed, 336 insertions(+), 14 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6b.c
>
> diff --git a/gcc/gimplify.c b/gcc/gimplify.c
> index 983635ba21f..893a4311f9e 100644
> --- a/gcc/gimplify.c
> +++ b/gcc/gimplify.c
> @@ -5082,22 +5082,170 @@ gimplify_init_constructor (tree *expr_p, gimple_seq *pre_p, gimple_seq *post_p,
>             TREE_CONSTANT (ctor) = 0;
>           }
>
> -       /* Vector types use CONSTRUCTOR all the way through gimple
> -          compilation as a general initializer.  */
> -       FOR_EACH_VEC_SAFE_ELT (elts, ix, ce)
> +       tree rhs_vector = NULL;
> +       /* The vector element to replace scalar elements, which
> +          will be overridden by scalar insert.  */
> +       tree vector_element = NULL;
> +       /* The single scalar element.  */
> +       tree scalar_element = NULL;
> +       unsigned int scalar_idx = 0;
> +       enum { unknown, copy, permute, init } operation = unknown;
> +       bool insert = false;
> +
> +       /* Check if we can generate vector copy or permute followed by
> +          a single scalar insert.  */
> +       if (TYPE_VECTOR_SUBPARTS (type).is_constant ())
>           {
> -           enum gimplify_status tret;
> -           tret = gimplify_expr (&ce->value, pre_p, post_p, is_gimple_val,
> -                                 fb_rvalue);
> -           if (tret == GS_ERROR)
> -             ret = GS_ERROR;
> -           else if (TREE_STATIC (ctor)
> -                    && !initializer_constant_valid_p (ce->value,
> -                                                      TREE_TYPE (ce->value)))
> -             TREE_STATIC (ctor) = 0;
> +           /* If all RHS vector elements come from the same vector,
> +              we can use permute.  If all RHS vector elements come
> +              from the same vector in the same order, we can use
> +              copy.  */
> +           unsigned int nunits
> +             = TYPE_VECTOR_SUBPARTS (type).to_constant ();
> +           unsigned int nscalars = 0;
> +           unsigned int nvectors = 0;
> +           operation = unknown;
> +           FOR_EACH_VEC_SAFE_ELT (elts, ix, ce)
> +             if (TREE_CODE (ce->value) == ARRAY_REF
> +                 || TREE_CODE (ce->value) == ARRAY_RANGE_REF)
> +               {
> +                 if (!vector_element)
> +                   vector_element = ce->value;
> +                 /* Get the vector index.  */
> +                 tree idx = TREE_OPERAND (ce->value, 1);
> +                 if (TREE_CODE (idx) == INTEGER_CST)
> +                   {
> +                     /* Get the RHS vector.  */
> +                     tree r = ce->value;
> +                     while (handled_component_p (r))
> +                       r = TREE_OPERAND (r, 0);
> +                     if (type == TREE_TYPE (r))
> +                       {
> +                         /* The RHS vector has the same type as
> +                            LHS.  */
> +                         if (rhs_vector == NULL)
> +                           rhs_vector = r;
> +
> +                         /* Check if all RHS vector elements come
> +                            from the same vector.  */
> +                         if (rhs_vector == r)
> +                           {
> +                             nvectors++;
> +                             if (TREE_INT_CST_LOW (idx) == ix
> +                                 && (operation == unknown
> +                                     || operation == copy))
> +                               operation = copy;
> +                             else
> +                               operation = permute;
> +                             continue;
> +                           }
> +                       }
> +                   }
> +
> +                 /* Otherwise, use vector init.  */
> +                 break;
> +               }
> +             else if (TREE_CODE (TYPE_SIZE (TREE_TYPE (ce->value)))
> +                      == INTEGER_CST)
> +               {
> +                 /* Only allow one single scalar insert.  */
> +                 if (nscalars != 0)
> +                   break;
> +                 nscalars = 1;
> +                 insert = true;
> +                 scalar_idx = ix;
> +                 scalar_element = ce->value;
> +               }
> +
> +           /* Allow a single scalar insert with vector copy or
> +              vector permute.  Vector copy without insert is OK.  */
> +           if (nunits != (nscalars + nvectors)
> +               || (nscalars == 0 && operation != copy))
> +             operation = unknown;
> +         }
> +
> +       if (operation == unknown)
> +         {
> +           /* Default to the regular vector init constructor.  */
> +           operation = init;
> +           insert = false;
> +         }
> +
> +       if (operation == copy)
> +         {
> +           /* Generate a vector copy.  */
> +           tree var = create_tmp_var (type);
> +           if (gimplify_expr (&rhs_vector, pre_p, post_p,
> +                              is_gimple_val, fb_rvalue) == GS_ERROR)
> +             {
> +               ret = GS_ERROR;
> +               break;
> +             }
> +           gassign *init = gimple_build_assign (var, rhs_vector);
> +           gimple_seq_add_stmt (pre_p, init);
> +           if (gimplify_expr (&var, pre_p, post_p, is_gimple_val,
> +                              fb_rvalue) == GS_ERROR)
> +             {
> +               ret = GS_ERROR;
> +               break;
> +             }
> +           /* Replace RHS with the vector copy.  */
> +           if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0)))
> +             TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (var, pre_p);
> +           else
> +             TREE_OPERAND (*expr_p, 1) = var;
> +         }
> +       else
> +         {
> +           /* Prepare for vector permute by replacing the scalar
> +              element with the vector one.  */
> +           if (operation == permute)
> +             (elts->address())[scalar_idx].value = vector_element;
> +
> +           /* Vector types use CONSTRUCTOR all the way through gimple
> +              compilation as a general initializer.  */
> +           FOR_EACH_VEC_SAFE_ELT (elts, ix, ce)
> +             {
> +               enum gimplify_status tret;
> +               tret = gimplify_expr (&ce->value, pre_p, post_p,
> +                                     is_gimple_val,
> +                                     fb_rvalue);
> +               if (tret == GS_ERROR)
> +                 ret = GS_ERROR;
> +               else if (TREE_STATIC (ctor)
> +                        && !initializer_constant_valid_p (ce->value,
> +                                                          TREE_TYPE (ce->value)))
> +                 TREE_STATIC (ctor) = 0;
> +             }
> +           if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0)))
> +             TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (ctor, pre_p);
> +         }
> +
> +       if (insert)
> +         {
> +           /* Generate a single scalar insert after vector copy or
> +              permute.  */
> +           tree rhs = TREE_OPERAND (*expr_p, 1);
> +           tree var = create_tmp_var (type);
> +           gassign *init = gimple_build_assign (var, rhs);
> +           gimple_seq_add_stmt (pre_p, init);
> +           if (gimplify_expr (&scalar_element, pre_p, post_p,
> +                              is_gimple_val, fb_rvalue) == GS_ERROR)
> +             {
> +               ret = GS_ERROR;
> +               break;
> +             }
> +           tree scalar_type = TREE_TYPE (scalar_element);
> +           tree scalar_size = TYPE_SIZE (scalar_type);
> +           tree bitpos = bitsize_int (scalar_idx
> +                                      * TREE_INT_CST_LOW (scalar_size));
> +           tree ref = build3_loc (EXPR_LOCATION (rhs), BIT_FIELD_REF,
> +                                  scalar_type, var, scalar_size,
> +                                  bitpos);
> +           init = gimple_build_assign (ref, scalar_element);
> +           gimplify_seq_add_stmt (pre_p, init);
> +           TREE_OPERAND (*expr_p, 1) = var;
>           }
> -       if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0)))
> -         TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (ctor, pre_p);
>        }
>        break;
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1.c b/gcc/testsuite/gcc.target/i386/pr88828-1.c
> new file mode 100644
> index 00000000000..4ef1feab389
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-1.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse -mno-sse4" } */
> +/* { dg-final { scan-assembler "movss" } } */
> +/* { dg-final { scan-assembler-not "movaps" } } */
> +/* { dg-final { scan-assembler-not "movlhps" } } */
> +/* { dg-final { scan-assembler-not "unpcklps" } } */
> +/* { dg-final { scan-assembler-not "shufps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> +  __v4sf y = { f, x[1], x[2], x[3] };
> +  return y;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2.c b/gcc/testsuite/gcc.target/i386/pr88828-2.c
> new file mode 100644
> index 00000000000..6dc482b6f4b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-2.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse -mno-sse4" } */
> +/* { dg-final { scan-assembler "movss" } } */
> +/* { dg-final { scan-assembler-not "movaps" } } */
> +/* { dg-final { scan-assembler-not "movlhps" } } */
> +/* { dg-final { scan-assembler-not "unpcklps" } } */
> +/* { dg-final { scan-assembler-not "shufps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> +  __v4sf y = x;
> +  y[0] = f;
> +  return y;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3a.c b/gcc/testsuite/gcc.target/i386/pr88828-3a.c
> new file mode 100644
> index 00000000000..97eb8e7162a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-3a.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse -mno-sse4" } */
> +/* { dg-final { scan-assembler "movss" } } */
> +/* { dg-final { scan-assembler-times "shufps" 1 } } */
> +/* { dg-final { scan-assembler-not "movaps" } } */
> +/* { dg-final { scan-assembler-not "movlhps" } } */
> +/* { dg-final { scan-assembler-not "unpcklps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> +  __v4sf y = { f, x[0], x[2], x[3] };
> +  return y;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3b.c b/gcc/testsuite/gcc.target/i386/pr88828-3b.c
> new file mode 100644
> index 00000000000..ab2ba730716
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-3b.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
> +/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
> +/* { dg-final { scan-assembler-not "vmovaps" } } */
> +/* { dg-final { scan-assembler-not "vmovlhps" } } */
> +/* { dg-final { scan-assembler-not "vunpcklps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> +  __v4sf y = { f, x[0], x[2], x[3] };
> +  return y;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4a.c b/gcc/testsuite/gcc.target/i386/pr88828-4a.c
> new file mode 100644
> index 00000000000..a54689be701
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-4a.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse -mno-sse4" } */
> +/* { dg-final { scan-assembler "movss" } } */
> +/* { dg-final { scan-assembler-times "shufps" 1 } } */
> +/* { dg-final { scan-assembler-not "movaps" } } */
> +/* { dg-final { scan-assembler-not "movlhps" } } */
> +/* { dg-final { scan-assembler-not "unpcklps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> +  __v4sf y = { x[0], x[2], x[3], x[1] };
> +  y[0] = f;
> +  return y;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4b.c b/gcc/testsuite/gcc.target/i386/pr88828-4b.c
> new file mode 100644
> index 00000000000..0c3a1024d93
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-4b.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
> +/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
> +/* { dg-final { scan-assembler-not "vshufps" } } */
> +/* { dg-final { scan-assembler-not "vmovaps" } } */
> +/* { dg-final { scan-assembler-not "vmovlhps" } } */
> +/* { dg-final { scan-assembler-not "vunpcklps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> +  __v4sf y = { x[0], x[2], x[3], x[1] };
> +  y[0] = f;
> +  return y;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5a.c b/gcc/testsuite/gcc.target/i386/pr88828-5a.c
> new file mode 100644
> index 00000000000..534808d3cd1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-5a.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse -mno-sse4" } */
> +/* { dg-final { scan-assembler "movss" } } */
> +/* { dg-final { scan-assembler-times "shufps" 2 } } */
> +/* { dg-final { scan-assembler-times "movaps" 1 } } */
> +/* { dg-final { scan-assembler-not "movlhps" } } */
> +/* { dg-final { scan-assembler-not "unpcklps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> +  __v4sf y = { x[0], x[2], x[3], f };
> +  return y;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5b.c b/gcc/testsuite/gcc.target/i386/pr88828-5b.c
> new file mode 100644
> index 00000000000..aebea790979
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-5b.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
> +/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
> +/* { dg-final { scan-assembler-not "vmovss" } } */
> +/* { dg-final { scan-assembler-not "vshufps" } } */
> +/* { dg-final { scan-assembler-not "vmovaps" } } */
> +/* { dg-final { scan-assembler-not "vmovlhps" } } */
> +/* { dg-final { scan-assembler-not "vunpcklps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> +  __v4sf y = { x[0], x[2], x[3], f };
> +  return y;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6a.c b/gcc/testsuite/gcc.target/i386/pr88828-6a.c
> new file mode 100644
> index 00000000000..d43a36d9137
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-6a.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse -mno-sse4" } */
> +/* { dg-final { scan-assembler "movss" } } */
> +/* { dg-final { scan-assembler-times "shufps" 2 } } */
> +/* { dg-final { scan-assembler-times "movaps" 1 } } */
> +/* { dg-final { scan-assembler-not "movlhps" } } */
> +/* { dg-final { scan-assembler-not "unpcklps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> +  __v4sf y = { x[0], x[2], x[3], x[0] };
> +  y[3] = f;
> +  return y;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6b.c b/gcc/testsuite/gcc.target/i386/pr88828-6b.c
> new file mode 100644
> index 00000000000..6856fe6500e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-6b.c
> @@ -0,0 +1,19 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
> +/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
> +/* { dg-final { scan-assembler-not "vshufps" } } */
> +/* { dg-final { scan-assembler-not "vmovss" } } */
> +/* { dg-final { scan-assembler-not "vmovaps" } } */
> +/* { dg-final { scan-assembler-not "vmovlhps" } } */
> +/* { dg-final { scan-assembler-not "vunpcklps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> +  __v4sf y = { x[0], x[2], x[3], x[0] };
> +  y[3] = f;
> +  return y;
> +}
> --
> 2.20.1
>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] Optimize vector init constructor
  2019-03-03 14:40 ` Andrew Pinski
@ 2019-03-03 21:13   ` H.J. Lu
  2019-03-04 11:55     ` Richard Biener
  0 siblings, 1 reply; 17+ messages in thread
From: H.J. Lu @ 2019-03-03 21:13 UTC (permalink / raw)
  To: Andrew Pinski; +Cc: GCC Patches, Richard Guenther

On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:
> )
> ,On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > For vector init constructor:
> >
> > ---
> > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> >
> > __v4sf
> > foo (__v4sf x, float f)
> > {
> >   __v4sf y = { f, x[1], x[2], x[3] };
> >   return y;
> > }
> > ---
> >
> > we can optimize vector init constructor with vector copy or permute
> > followed by a single scalar insert:
> >
> >   __v4sf D.1912;
> >   __v4sf D.1913;
> >   __v4sf D.1914;
> >   __v4sf y;
> >
> >   x.0_1 = x;
> >   D.1912 = x.0_1;
> >   _2 = D.1912;
> >   D.1913 = _2;
> >   BIT_FIELD_REF <D.1913, 32, 0> = f;
> >   y = D.1913;
> >   D.1914 = y;
> >   return D.1914;
> >
> > instead of
> >
> >   __v4sf D.1962;
> >   __v4sf y;
> >
> >   _1 = BIT_FIELD_REF <x, 32, 32>;
> >   _2 = BIT_FIELD_REF <x, 32, 64>;
> >   _3 = BIT_FIELD_REF <x, 32, 96>;
> >   y = {f, _1, _2, _3};
> >   D.1962 = y;
> >   return D.1962;
> >
> > gcc/
> >
> >         PR tree-optimization/88828
> >         * gimplify.c (gimplify_init_constructor): Optimize vector init
> >         constructor with vector copy or permute followed by a single
> >         scalar insert.
> 
> 
> Doing this here does not catch things like:
> typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> 
> 
> __v4sf
> vector_init (float f0,float f1, float f2,float f3)
> {
>   __v4sf y = { f, x[1], x[2], x[3] };
>    return y;
> }
> 
> __v4sf
> foo (__v4sf x, float f)
> {
>   return vector_init (f, x[1], x[2], x[3]) ;
> }
> 

Here is a patch for simplify_vector_constructor to optimize vector init
constructor with vector copy or permute followed by a single scalar
insert.  But this doesn't work correctly:

[hjl@gnu-cfl-2 pr88828]$ cat bar.i
typedef float __v4sf __attribute__ ((__vector_size__ (16)));

static __v4sf
vector_init (float f0,float f1, float f2,float f3)
{
  __v4sf y = { f0, f1, f2, f3 };
   return y;
}

__v4sf
foo (__v4sf x, float f)
{
  return vector_init (f, x[1], x[2], x[3]) ;
}
[hjl@gnu-cfl-2 pr88828]$ make bar.s
/export/build/gnu/tools-build/gcc-wip-debug/build-x86_64-linux/gcc/xgcc -B/export/build/gnu/tools-build/gcc-wip-debug/build-x86_64-linux/gcc/ -O2 -S bar.i
[hjl@gnu-cfl-2 pr88828]$ cat bar.s
	.file	"bar.i"
	.text
	.p2align 4
	.globl	foo
	.type	foo, @function
foo:
.LFB1:
	.cfi_startproc
	ret
	.cfi_endproc
.LFE1:
	.size	foo, .-foo
	.ident	"GCC: (GNU) 9.0.1 20190303 (experimental)"
	.section	.note.GNU-stack,"",@progbits
[hjl@gnu-cfl-2 pr88828]$

Scalar insert is missing.
---
 gcc/tree-ssa-forwprop.c | 77 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 69 insertions(+), 8 deletions(-)

diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c
index eeb6281c652..b10cfccf7b8 100644
--- a/gcc/tree-ssa-forwprop.c
+++ b/gcc/tree-ssa-forwprop.c
@@ -2008,7 +2008,7 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
   unsigned elem_size, i;
   unsigned HOST_WIDE_INT nelts;
   enum tree_code code, conv_code;
-  constructor_elt *elt;
+  constructor_elt *ce;
   bool maybe_ident;
 
   gcc_checking_assert (gimple_assign_rhs_code (stmt) == CONSTRUCTOR);
@@ -2027,18 +2027,41 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
   orig[1] = NULL;
   conv_code = ERROR_MARK;
   maybe_ident = true;
-  FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, elt)
+
+  tree rhs_vector = NULL;
+  /* The single scalar element.  */
+  tree scalar_element = NULL;
+  unsigned int scalar_idx = 0;
+  bool insert = false;
+  unsigned int nscalars = 0;
+  unsigned int nvectors = 0;
+  FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, ce)
     {
       tree ref, op1;
 
       if (i >= nelts)
 	return false;
 
-      if (TREE_CODE (elt->value) != SSA_NAME)
+      if (TREE_CODE (ce->value) != SSA_NAME)
 	return false;
-      def_stmt = get_prop_source_stmt (elt->value, false, NULL);
+      def_stmt = get_prop_source_stmt (ce->value, false, NULL);
       if (!def_stmt)
-	return false;
+	{
+	  if ( gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))
+	    {
+	      /* Only allow one single scalar insert.  */
+	      if (nscalars != 0)
+		return false;
+
+	      nscalars = 1;
+	      insert = true;
+	      scalar_idx = i;
+	      scalar_element = ce->value;
+	      continue;
+	    }
+	  else
+	    return false;
+	}
       code = gimple_assign_rhs_code (def_stmt);
       if (code == FLOAT_EXPR
 	  || code == FIX_TRUNC_EXPR)
@@ -2046,7 +2069,7 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	  op1 = gimple_assign_rhs1 (def_stmt);
 	  if (conv_code == ERROR_MARK)
 	    {
-	      if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (elt->value))),
+	      if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (ce->value))),
 			    GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1)))))
 		return false;
 	      conv_code = code;
@@ -2095,6 +2118,18 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	elt += nelts;
       if (elt != i)
 	maybe_ident = false;
+
+       if (type == TREE_TYPE (ref))
+	 {
+	   /* The RHS vector has the same type as LHS.  */
+	   if (rhs_vector == NULL)
+	     rhs_vector = ref;
+	   /* Check if all RHS vector elements come from the same
+	      vector.  */
+	   if (rhs_vector == ref)
+	     nvectors++;
+	 }
+
       sel.quick_push (elt);
     }
   if (i < nelts)
@@ -2113,6 +2148,12 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	  || conv_code == CALL_EXPR))
     return false;
 
+  /* Replace the scalar element with the vector element.  */
+  if (insert
+      && (TYPE_VECTOR_SUBPARTS (type).to_constant ()
+	  == (nscalars + nvectors)))
+    sel.quick_push (scalar_idx);
+
   if (maybe_ident)
     {
       if (conv_code == ERROR_MARK)
@@ -2127,14 +2168,22 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 
       vec_perm_indices indices (sel, orig[1] ? 2 : 1, nelts);
       if (!can_vec_perm_const_p (TYPE_MODE (type), indices))
-	return false;
+	{
+	  if (insert)
+	    gcc_unreachable ();
+	  return false;
+	}
       mask_type
 	= build_vector_type (build_nonstandard_integer_type (elem_size, 1),
 			     nelts);
       if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_VECTOR_INT
 	  || maybe_ne (GET_MODE_SIZE (TYPE_MODE (mask_type)),
 		       GET_MODE_SIZE (TYPE_MODE (type))))
-	return false;
+	{
+	  if (insert)
+	    gcc_unreachable ();
+	  return false;
+	}
       op2 = vec_perm_indices_to_tree (mask_type, indices);
       if (!orig[1])
 	orig[1] = orig[0];
@@ -2153,6 +2202,18 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	}
     }
   update_stmt (gsi_stmt (*gsi));
+  if (insert)
+    {
+      /* Generate a single scalar insert.  */
+      /* FIXME: This doesn't work correctly.  */
+      tree lhs = gimple_assign_lhs (stmt);
+      tree bitfield = build3 (BIT_FIELD_REF, elem_type, lhs,
+			      bitsize_int (elem_size),
+			      bitsize_int (scalar_idx * elem_size));
+      gimple *new_stmt = gimple_build_assign (bitfield, scalar_element);
+      gsi_insert_after (gsi, new_stmt, GSI_SAME_STMT);
+      update_stmt (gsi_stmt (*gsi));
+    }
   return true;
 }
 
-- 
2.20.1

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] Optimize vector init constructor
  2019-03-03 21:13   ` H.J. Lu
@ 2019-03-04 11:55     ` Richard Biener
  2019-03-04 17:46       ` V2 [PATCH] Optimize vector constructor H.J. Lu
  0 siblings, 1 reply; 17+ messages in thread
From: Richard Biener @ 2019-03-04 11:55 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Andrew Pinski, GCC Patches

On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:
> > )
> > ,On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > For vector init constructor:
> > >
> > > ---
> > > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> > >
> > > __v4sf
> > > foo (__v4sf x, float f)
> > > {
> > >   __v4sf y = { f, x[1], x[2], x[3] };
> > >   return y;
> > > }
> > > ---
> > >
> > > we can optimize vector init constructor with vector copy or permute
> > > followed by a single scalar insert:
> > >
> > >   __v4sf D.1912;
> > >   __v4sf D.1913;
> > >   __v4sf D.1914;
> > >   __v4sf y;
> > >
> > >   x.0_1 = x;
> > >   D.1912 = x.0_1;
> > >   _2 = D.1912;
> > >   D.1913 = _2;
> > >   BIT_FIELD_REF <D.1913, 32, 0> = f;
> > >   y = D.1913;
> > >   D.1914 = y;
> > >   return D.1914;
> > >
> > > instead of
> > >
> > >   __v4sf D.1962;
> > >   __v4sf y;
> > >
> > >   _1 = BIT_FIELD_REF <x, 32, 32>;
> > >   _2 = BIT_FIELD_REF <x, 32, 64>;
> > >   _3 = BIT_FIELD_REF <x, 32, 96>;
> > >   y = {f, _1, _2, _3};
> > >   D.1962 = y;
> > >   return D.1962;
> > >
> > > gcc/
> > >
> > >         PR tree-optimization/88828
> > >         * gimplify.c (gimplify_init_constructor): Optimize vector init
> > >         constructor with vector copy or permute followed by a single
> > >         scalar insert.
> >
> >
> > Doing this here does not catch things like:
> > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> >
> >
> > __v4sf
> > vector_init (float f0,float f1, float f2,float f3)
> > {
> >   __v4sf y = { f, x[1], x[2], x[3] };
> >    return y;
> > }
> >
> > __v4sf
> > foo (__v4sf x, float f)
> > {
> >   return vector_init (f, x[1], x[2], x[3]) ;
> > }
> >
>
> Here is a patch for simplify_vector_constructor to optimize vector init
> constructor with vector copy or permute followed by a single scalar
> insert.

That's the correct place to fix this indeed.

> But this doesn't work correctly:
>
> [hjl@gnu-cfl-2 pr88828]$ cat bar.i
> typedef float __v4sf __attribute__ ((__vector_size__ (16)));
>
> static __v4sf
> vector_init (float f0,float f1, float f2,float f3)
> {
>   __v4sf y = { f0, f1, f2, f3 };
>    return y;
> }
>
> __v4sf
> foo (__v4sf x, float f)
> {
>   return vector_init (f, x[1], x[2], x[3]) ;
> }
> [hjl@gnu-cfl-2 pr88828]$ make bar.s
> /export/build/gnu/tools-build/gcc-wip-debug/build-x86_64-linux/gcc/xgcc -B/export/build/gnu/tools-build/gcc-wip-debug/build-x86_64-linux/gcc/ -O2 -S bar.i
> [hjl@gnu-cfl-2 pr88828]$ cat bar.s
>         .file   "bar.i"
>         .text
>         .p2align 4
>         .globl  foo
>         .type   foo, @function
> foo:
> .LFB1:
>         .cfi_startproc
>         ret
>         .cfi_endproc
> .LFE1:
>         .size   foo, .-foo
>         .ident  "GCC: (GNU) 9.0.1 20190303 (experimental)"
>         .section        .note.GNU-stack,"",@progbits
> [hjl@gnu-cfl-2 pr88828]$
>
> Scalar insert is missing.
> ---
>  gcc/tree-ssa-forwprop.c | 77 ++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 69 insertions(+), 8 deletions(-)
>
> diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c
> index eeb6281c652..b10cfccf7b8 100644
> --- a/gcc/tree-ssa-forwprop.c
> +++ b/gcc/tree-ssa-forwprop.c
> @@ -2008,7 +2008,7 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
>    unsigned elem_size, i;
>    unsigned HOST_WIDE_INT nelts;
>    enum tree_code code, conv_code;
> -  constructor_elt *elt;
> +  constructor_elt *ce;
>    bool maybe_ident;
>
>    gcc_checking_assert (gimple_assign_rhs_code (stmt) == CONSTRUCTOR);
> @@ -2027,18 +2027,41 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
>    orig[1] = NULL;
>    conv_code = ERROR_MARK;
>    maybe_ident = true;
> -  FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, elt)
> +
> +  tree rhs_vector = NULL;
> +  /* The single scalar element.  */
> +  tree scalar_element = NULL;
> +  unsigned int scalar_idx = 0;
> +  bool insert = false;
> +  unsigned int nscalars = 0;
> +  unsigned int nvectors = 0;
> +  FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, ce)
>      {
>        tree ref, op1;
>
>        if (i >= nelts)
>         return false;
>
> -      if (TREE_CODE (elt->value) != SSA_NAME)
> +      if (TREE_CODE (ce->value) != SSA_NAME)
>         return false;
> -      def_stmt = get_prop_source_stmt (elt->value, false, NULL);
> +      def_stmt = get_prop_source_stmt (ce->value, false, NULL);
>        if (!def_stmt)
> -       return false;
> +       {
> +         if ( gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))
> +           {
> +             /* Only allow one single scalar insert.  */
> +             if (nscalars != 0)
> +               return false;
> +
> +             nscalars = 1;
> +             insert = true;
> +             scalar_idx = i;
> +             scalar_element = ce->value;
> +             continue;
> +           }
> +         else
> +           return false;
> +       }
>        code = gimple_assign_rhs_code (def_stmt);
>        if (code == FLOAT_EXPR
>           || code == FIX_TRUNC_EXPR)
> @@ -2046,7 +2069,7 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
>           op1 = gimple_assign_rhs1 (def_stmt);
>           if (conv_code == ERROR_MARK)
>             {
> -             if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (elt->value))),
> +             if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (ce->value))),
>                             GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1)))))
>                 return false;
>               conv_code = code;
> @@ -2095,6 +2118,18 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
>         elt += nelts;
>        if (elt != i)
>         maybe_ident = false;
> +
> +       if (type == TREE_TYPE (ref))
> +        {
> +          /* The RHS vector has the same type as LHS.  */
> +          if (rhs_vector == NULL)
> +            rhs_vector = ref;
> +          /* Check if all RHS vector elements come from the same
> +             vector.  */
> +          if (rhs_vector == ref)
> +            nvectors++;
> +        }
> +
>        sel.quick_push (elt);
>      }
>    if (i < nelts)
> @@ -2113,6 +2148,12 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
>           || conv_code == CALL_EXPR))
>      return false;
>
> +  /* Replace the scalar element with the vector element.  */
> +  if (insert
> +      && (TYPE_VECTOR_SUBPARTS (type).to_constant ()
> +         == (nscalars + nvectors)))
> +    sel.quick_push (scalar_idx);
> +
>    if (maybe_ident)
>      {
>        if (conv_code == ERROR_MARK)
> @@ -2127,14 +2168,22 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
>
>        vec_perm_indices indices (sel, orig[1] ? 2 : 1, nelts);
>        if (!can_vec_perm_const_p (TYPE_MODE (type), indices))
> -       return false;
> +       {
> +         if (insert)
> +           gcc_unreachable ();
> +         return false;
> +       }
>        mask_type
>         = build_vector_type (build_nonstandard_integer_type (elem_size, 1),
>                              nelts);
>        if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_VECTOR_INT
>           || maybe_ne (GET_MODE_SIZE (TYPE_MODE (mask_type)),
>                        GET_MODE_SIZE (TYPE_MODE (type))))
> -       return false;
> +       {
> +         if (insert)
> +           gcc_unreachable ();
> +         return false;
> +       }
>        op2 = vec_perm_indices_to_tree (mask_type, indices);
>        if (!orig[1])
>         orig[1] = orig[0];
> @@ -2153,6 +2202,18 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
>         }
>      }
>    update_stmt (gsi_stmt (*gsi));
> +  if (insert)
> +    {
> +      /* Generate a single scalar insert.  */
> +      /* FIXME: This doesn't work correctly.  */
> +      tree lhs = gimple_assign_lhs (stmt);
> +      tree bitfield = build3 (BIT_FIELD_REF, elem_type, lhs,
> +                             bitsize_int (elem_size),
> +                             bitsize_int (scalar_idx * elem_size));
> +      gimple *new_stmt = gimple_build_assign (bitfield, scalar_element);

I think you want to generate from the original

    _1 = { .... };

the new

    _2 = copy or permute to _new_ LHS SSA name
    _1 = BIT_INSERT_EXPR <_2, scalar_element, scalar_idx * elem_size>;

> +      gsi_insert_after (gsi, new_stmt, GSI_SAME_STMT);

and you want to advance to the _1 = BIT_INSERT_EXPR here.  The easiest way
is to emit a new stmt for _2 = copy ...; and do the set_rhs with the
BIT_INSERT_EXPR.

> +      update_stmt (gsi_stmt (*gsi));
> +    }
>    return true;
>  }
>
> --
> 2.20.1
>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* V2 [PATCH] Optimize vector constructor
  2019-03-04 11:55     ` Richard Biener
@ 2019-03-04 17:46       ` H.J. Lu
  2019-03-06  7:54         ` V3 " H.J. Lu
  0 siblings, 1 reply; 17+ messages in thread
From: H.J. Lu @ 2019-03-04 17:46 UTC (permalink / raw)
  To: Richard Biener; +Cc: Andrew Pinski, GCC Patches

On Mon, Mar 04, 2019 at 12:55:04PM +0100, Richard Biener wrote:
> On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:
> > > )
> > > ,On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > For vector init constructor:
> > > >
> > > > ---
> > > > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> > > >
> > > > __v4sf
> > > > foo (__v4sf x, float f)
> > > > {
> > > >   __v4sf y = { f, x[1], x[2], x[3] };
> > > >   return y;
> > > > }
> > > > ---
> > > >
> > > > we can optimize vector init constructor with vector copy or permute
> > > > followed by a single scalar insert:

> and you want to advance to the _1 = BIT_INSERT_EXPR here.  The easiest way
> is to emit a new stmt for _2 = copy ...; and do the set_rhs with the
> BIT_INSERT_EXPR.

Thanks for the BIT_INSERT_EXPR suggestion.  I am testing this patch.


H.J.
---
We can optimize a vector constructor into a vector copy or permute followed
by a single scalar insert, replacing:

  __v4sf y;
  __v4sf D.1930;
  float _1;
  float _2;
  float _3;

  <bb 2> :
  _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
  _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
  _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
  y_6 = {f_5(D), _3, _2, _1};
  return y_6;

with

  __v4sf y;
  __v4sf D.1930;
  float _1;
  float _2;
  float _3;
  vector(4) float _8;

  <bb 2> :
  _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
  _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
  _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
  _8 = x_9(D);
  y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
  return y_6;

gcc/

	PR tree-optimization/88828
	* tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
	vector init constructor with vector copy or permute followed
	by a single scalar insert.

gcc/testsuite/

	PR tree-optimization/88828
	* gcc.target/i386/pr88828-1a.c: New test.
	* gcc.target/i386/pr88828-1b.c: Likewise.
	* gcc.target/i386/pr88828-2.c: Likewise.
	* gcc.target/i386/pr88828-3a.c: Likewise.
	* gcc.target/i386/pr88828-3b.c: Likewise.
	* gcc.target/i386/pr88828-3c.c: Likewise.
	* gcc.target/i386/pr88828-3d.c: Likewise.
	* gcc.target/i386/pr88828-4a.c: Likewise.
	* gcc.target/i386/pr88828-4b.c: Likewise.
	* gcc.target/i386/pr88828-5a.c: Likewise.
	* gcc.target/i386/pr88828-5b.c: Likewise.
	* gcc.target/i386/pr88828-6a.c: Likewise.
	* gcc.target/i386/pr88828-6b.c: Likewise.
	* gcc.target/i386/pr88828-7.c: Likewise.
---
 gcc/testsuite/gcc.target/i386/pr88828-1a.c | 16 +++++
 gcc/testsuite/gcc.target/i386/pr88828-1b.c | 22 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-2.c  | 17 +++++
 gcc/testsuite/gcc.target/i386/pr88828-3a.c | 16 +++++
 gcc/testsuite/gcc.target/i386/pr88828-3b.c | 18 +++++
 gcc/testsuite/gcc.target/i386/pr88828-3c.c | 22 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-3d.c | 24 +++++++
 gcc/testsuite/gcc.target/i386/pr88828-4a.c | 17 +++++
 gcc/testsuite/gcc.target/i386/pr88828-4b.c | 20 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-5a.c | 16 +++++
 gcc/testsuite/gcc.target/i386/pr88828-5b.c | 18 +++++
 gcc/testsuite/gcc.target/i386/pr88828-6a.c | 17 +++++
 gcc/testsuite/gcc.target/i386/pr88828-6b.c | 19 +++++
 gcc/testsuite/gcc.target/i386/pr88828-7.c  | 22 ++++++
 gcc/tree-ssa-forwprop.c                    | 84 +++++++++++++++++++---
 15 files changed, 338 insertions(+), 10 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3d.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-7.c

diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1a.c b/gcc/testsuite/gcc.target/i386/pr88828-1a.c
new file mode 100644
index 00000000000..4ef1feab389
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1a.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { f, x[1], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1b.c b/gcc/testsuite/gcc.target/i386/pr88828-1b.c
new file mode 100644
index 00000000000..2cddf4263f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1b.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  return vector_init (f, x[1], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2.c b/gcc/testsuite/gcc.target/i386/pr88828-2.c
new file mode 100644
index 00000000000..6dc482b6f4b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = x;
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3a.c b/gcc/testsuite/gcc.target/i386/pr88828-3a.c
new file mode 100644
index 00000000000..97eb8e7162a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3a.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { f, x[0], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3b.c b/gcc/testsuite/gcc.target/i386/pr88828-3b.c
new file mode 100644
index 00000000000..ab2ba730716
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3b.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { f, x[0], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3c.c b/gcc/testsuite/gcc.target/i386/pr88828-3c.c
new file mode 100644
index 00000000000..0db7f9e145b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3c.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  return vector_init (f, x[0], x[1], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3d.c b/gcc/testsuite/gcc.target/i386/pr88828-3d.c
new file mode 100644
index 00000000000..33e2b6e5881
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3d.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  return vector_init (f, x[0], x[1], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4a.c b/gcc/testsuite/gcc.target/i386/pr88828-4a.c
new file mode 100644
index 00000000000..a54689be701
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-4a.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[1] };
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4b.c b/gcc/testsuite/gcc.target/i386/pr88828-4b.c
new file mode 100644
index 00000000000..0c3a1024d93
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-4b.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[1] };
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5a.c b/gcc/testsuite/gcc.target/i386/pr88828-5a.c
new file mode 100644
index 00000000000..534808d3cd1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-5a.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 2 } } */
+/* { dg-final { scan-assembler-times "movaps" 1 } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], f };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5b.c b/gcc/testsuite/gcc.target/i386/pr88828-5b.c
new file mode 100644
index 00000000000..aebea790979
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-5b.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
+/* { dg-final { scan-assembler-not "vmovss" } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], f };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6a.c b/gcc/testsuite/gcc.target/i386/pr88828-6a.c
new file mode 100644
index 00000000000..d43a36d9137
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-6a.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 2 } } */
+/* { dg-final { scan-assembler-times "movaps" 1 } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[0] };
+  y[3] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6b.c b/gcc/testsuite/gcc.target/i386/pr88828-6b.c
new file mode 100644
index 00000000000..6856fe6500e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-6b.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovss" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[0] };
+  y[3] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-7.c b/gcc/testsuite/gcc.target/i386/pr88828-7.c
new file mode 100644
index 00000000000..2cddf4263f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-7.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  return vector_init (f, x[1], x[2], x[3]) ;
+}
diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c
index eeb6281c652..ce00c43d7e7 100644
--- a/gcc/tree-ssa-forwprop.c
+++ b/gcc/tree-ssa-forwprop.c
@@ -2008,7 +2008,7 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
   unsigned elem_size, i;
   unsigned HOST_WIDE_INT nelts;
   enum tree_code code, conv_code;
-  constructor_elt *elt;
+  constructor_elt *ce;
   bool maybe_ident;
 
   gcc_checking_assert (gimple_assign_rhs_code (stmt) == CONSTRUCTOR);
@@ -2027,18 +2027,41 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
   orig[1] = NULL;
   conv_code = ERROR_MARK;
   maybe_ident = true;
-  FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, elt)
+
+  tree rhs_vector = NULL;
+  /* The single scalar element.  */
+  tree scalar_element = NULL;
+  unsigned int scalar_idx = 0;
+  bool insert = false;
+  unsigned int nscalars = 0;
+  unsigned int nvectors = 0;
+  FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, ce)
     {
       tree ref, op1;
 
       if (i >= nelts)
 	return false;
 
-      if (TREE_CODE (elt->value) != SSA_NAME)
+      if (TREE_CODE (ce->value) != SSA_NAME)
 	return false;
-      def_stmt = get_prop_source_stmt (elt->value, false, NULL);
+      def_stmt = get_prop_source_stmt (ce->value, false, NULL);
       if (!def_stmt)
-	return false;
+	{
+	  if (gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))
+	    {
+	      /* Only allow one scalar insert.  */
+	      if (nscalars != 0)
+		return false;
+
+	      nscalars = 1;
+	      insert = true;
+	      scalar_idx = i;
+	      scalar_element = ce->value;
+	      continue;
+	    }
+	  else
+	    return false;
+	}
       code = gimple_assign_rhs_code (def_stmt);
       if (code == FLOAT_EXPR
 	  || code == FIX_TRUNC_EXPR)
@@ -2046,7 +2069,7 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	  op1 = gimple_assign_rhs1 (def_stmt);
 	  if (conv_code == ERROR_MARK)
 	    {
-	      if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (elt->value))),
+	      if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (ce->value))),
 			    GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1)))))
 		return false;
 	      conv_code = code;
@@ -2095,6 +2118,18 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	elt += nelts;
       if (elt != i)
 	maybe_ident = false;
+
+       if (type == TREE_TYPE (ref))
+	 {
+	   /* The RHS vector has the same type as LHS.  */
+	   if (rhs_vector == NULL)
+	     rhs_vector = ref;
+	   /* Check if all RHS vector elements come from the same
+	      vector.  */
+	   if (rhs_vector == ref)
+	     nvectors++;
+	 }
+
       sel.quick_push (elt);
     }
   if (i < nelts)
@@ -2113,6 +2148,12 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	  || conv_code == CALL_EXPR))
     return false;
 
+  /* Replace the scalar element with the vector element.  */
+  if (insert
+      && (TYPE_VECTOR_SUBPARTS (type).to_constant ()
+	  == (nscalars + nvectors)))
+    sel.quick_push (scalar_idx);
+
   if (maybe_ident)
     {
       if (conv_code == ERROR_MARK)
@@ -2127,18 +2168,26 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 
       vec_perm_indices indices (sel, orig[1] ? 2 : 1, nelts);
       if (!can_vec_perm_const_p (TYPE_MODE (type), indices))
-	return false;
+	{
+	  if (insert)
+	    gcc_unreachable ();
+	  return false;
+	}
       mask_type
 	= build_vector_type (build_nonstandard_integer_type (elem_size, 1),
 			     nelts);
       if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_VECTOR_INT
 	  || maybe_ne (GET_MODE_SIZE (TYPE_MODE (mask_type)),
 		       GET_MODE_SIZE (TYPE_MODE (type))))
-	return false;
+	{
+	  if (insert)
+	    gcc_unreachable ();
+	  return false;
+	}
       op2 = vec_perm_indices_to_tree (mask_type, indices);
       if (!orig[1])
 	orig[1] = orig[0];
-      if (conv_code == ERROR_MARK)
+      if (conv_code == ERROR_MARK && !insert)
 	gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
 					orig[1], op2);
       else
@@ -2148,10 +2197,25 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 				   VEC_PERM_EXPR, orig[0], orig[1], op2);
 	  orig[0] = gimple_assign_lhs (perm);
 	  gsi_insert_before (gsi, perm, GSI_SAME_STMT);
-	  gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
+	  gimple_assign_set_rhs_with_ops (gsi,
+					  (conv_code != ERROR_MARK
+					   ? conv_code
+					   : NOP_EXPR),
+					  orig[0],
 					  NULL_TREE, NULL_TREE);
 	}
     }
+  if (insert)
+    {
+      /* Generate a single scalar insert.  */
+      tree var = make_ssa_name (type);
+      tree val = gimple_assign_rhs1 (stmt);
+      gimple *copy = gimple_build_assign (var, val);
+      gsi_insert_before (gsi, copy, GSI_SAME_STMT);
+      tree bitpos = bitsize_int (scalar_idx * elem_size);
+      gimple_assign_set_rhs_with_ops (gsi, BIT_INSERT_EXPR, var,
+				      scalar_element, bitpos);
+    }
   update_stmt (gsi_stmt (*gsi));
   return true;
 }
-- 
2.20.1

^ permalink raw reply	[flat|nested] 17+ messages in thread

* V3 [PATCH] Optimize vector constructor
  2019-03-04 17:46       ` V2 [PATCH] Optimize vector constructor H.J. Lu
@ 2019-03-06  7:54         ` H.J. Lu
  2019-03-06 13:39           ` Richard Biener
  0 siblings, 1 reply; 17+ messages in thread
From: H.J. Lu @ 2019-03-06  7:54 UTC (permalink / raw)
  To: Richard Biener, Hongtao Liu; +Cc: Andrew Pinski, GCC Patches

[-- Attachment #1: Type: text/plain, Size: 2801 bytes --]

On Tue, Mar 5, 2019 at 1:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Mar 04, 2019 at 12:55:04PM +0100, Richard Biener wrote:
> > On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:
> > > > On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > For vector init constructor:
> > > > >
> > > > > ---
> > > > > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> > > > >
> > > > > __v4sf
> > > > > foo (__v4sf x, float f)
> > > > > {
> > > > >   __v4sf y = { f, x[1], x[2], x[3] };
> > > > >   return y;
> > > > > }
> > > > > ---
> > > > >
> > > > > we can optimize vector init constructor with vector copy or permute
> > > > > followed by a single scalar insert:
>
> > and you want to advance to the _1 = BIT_INSERT_EXPR here.  The easiest way
> > is to emit a new stmt for _2 = copy ...; and do the set_rhs with the
> > BIT_INSERT_EXPR.
>
> Thanks for BIT_INSERT_EXPR suggestion.  I am testing this patch.
>
>
> H.J.
> ---
> We can optimize vector constructor with vector copy or permute followed
> by a single scalar insert:
>
>   __v4sf y;
>   __v4sf D.1930;
>   float _1;
>   float _2;
>   float _3;
>
>   <bb 2> :
>   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
>   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
>   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
>   y_6 = {f_5(D), _3, _2, _1};
>   return y_6;
>
> with
>
>  __v4sf y;
>   __v4sf D.1930;
>   float _1;
>   float _2;
>   float _3;
>   vector(4) float _8;
>
>   <bb 2> :
>   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
>   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
>   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
>   _8 = x_9(D);
>   y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
>   return y_6;
>
> gcc/
>
>         PR tree-optimization/88828
>         * tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
>         vector init constructor with vector copy or permute followed
>         by a single scalar insert.
>
> gcc/testsuite/
>
>         PR tree-optimization/88828
>         * gcc.target/i386/pr88828-1a.c: New test.
>         * gcc.target/i386/pr88828-2b.c: Likewise.
>         * gcc.target/i386/pr88828-2.c: Likewise.
>         * gcc.target/i386/pr88828-3a.c: Likewise.
>         * gcc.target/i386/pr88828-3b.c: Likewise.
>         * gcc.target/i386/pr88828-3c.c: Likewise.
>         * gcc.target/i386/pr88828-3d.c: Likewise.
>         * gcc.target/i386/pr88828-4a.c: Likewise.
>         * gcc.target/i386/pr88828-4b.c: Likewise.
>         * gcc.target/i386/pr88828-5a.c: Likewise.
>         * gcc.target/i386/pr88828-5b.c: Likewise.
>         * gcc.target/i386/pr88828-6a.c: Likewise.
>         * gcc.target/i386/pr88828-6b.c: Likewise.

Here is the updated patch with run-time tests.

-- 
H.J.

[-- Attachment #2: 0001-Optimize-vector-constructor.patch --]
[-- Type: text/x-patch, Size: 23948 bytes --]

From b2bc0bf3a8ee17d53bf39f0aeabe7025b33e9c96 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Tue, 5 Feb 2019 15:39:27 -0800
Subject: [PATCH] Optimize vector constructor

We can optimize a vector constructor into a vector copy or permute
followed by a single scalar insert, replacing:

  __v4sf y;
  __v4sf D.1930;
  float _1;
  float _2;
  float _3;

  <bb 2> :
  _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
  _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
  _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
  y_6 = {f_5(D), _3, _2, _1};
  return y_6;

with

  __v4sf y;
  __v4sf D.1930;
  float _1;
  float _2;
  float _3;
  vector(4) float _8;

  <bb 2> :
  _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
  _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
  _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
  _8 = x_9(D);
  y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
  return y_6;

gcc/

	PR tree-optimization/88828
	* tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
	vector init constructor with vector copy or permute followed
	by a single scalar insert.

gcc/testsuite/

	PR tree-optimization/88828
	* gcc.target/i386/pr88828-1.c: New test.
	* gcc.target/i386/pr88828-1a.c: Likewise.
	* gcc.target/i386/pr88828-1b.c: Likewise.
	* gcc.target/i386/pr88828-1c.c: Likewise.
	* gcc.target/i386/pr88828-2.c: Likewise.
	* gcc.target/i386/pr88828-2a.c: Likewise.
	* gcc.target/i386/pr88828-2b.c: Likewise.
	* gcc.target/i386/pr88828-2c.c: Likewise.
	* gcc.target/i386/pr88828-2d.c: Likewise.
	* gcc.target/i386/pr88828-3.c: Likewise.
	* gcc.target/i386/pr88828-3a.c: Likewise.
	* gcc.target/i386/pr88828-3b.c: Likewise.
	* gcc.target/i386/pr88828-3c.c: Likewise.
	* gcc.target/i386/pr88828-3d.c: Likewise.
	* gcc.target/i386/pr88828-4a.c: Likewise.
	* gcc.target/i386/pr88828-4b.c: Likewise.
	* gcc.target/i386/pr88828-5a.c: Likewise.
	* gcc.target/i386/pr88828-5b.c: Likewise.
---
 gcc/testsuite/gcc.target/i386/pr88828-1.c  | 49 +++++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-1a.c | 17 +++++
 gcc/testsuite/gcc.target/i386/pr88828-1b.c | 23 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-1c.c | 18 +++++
 gcc/testsuite/gcc.target/i386/pr88828-2.c  | 51 +++++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-2a.c | 17 +++++
 gcc/testsuite/gcc.target/i386/pr88828-2b.c | 19 +++++
 gcc/testsuite/gcc.target/i386/pr88828-2c.c | 23 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-2d.c | 25 +++++++
 gcc/testsuite/gcc.target/i386/pr88828-3.c  | 54 ++++++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-3a.c | 17 +++++
 gcc/testsuite/gcc.target/i386/pr88828-3b.c | 19 +++++
 gcc/testsuite/gcc.target/i386/pr88828-3c.c | 25 +++++++
 gcc/testsuite/gcc.target/i386/pr88828-4a.c | 18 +++++
 gcc/testsuite/gcc.target/i386/pr88828-4b.c | 21 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-5a.c | 18 +++++
 gcc/testsuite/gcc.target/i386/pr88828-5b.c | 20 +++++
 gcc/tree-ssa-forwprop.c                    | 85 +++++++++++++++++++---
 18 files changed, 509 insertions(+), 10 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2d.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5b.c

diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1.c b/gcc/testsuite/gcc.target/i386/pr88828-1.c
new file mode 100644
index 00000000000..a15d1fea3f5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1.c
@@ -0,0 +1,49 @@
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-1a.c"
+#include "pr88828-1b.c"
+#include "pr88828-1c.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf y, float f[4], float z)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 0)
+      {
+	if (y[i] != z)
+	  abort ();
+      }
+    else
+      {
+	if (y[i] != f[i])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f[4] = { -11, 2, 55553, -4 };
+  float z = 134567;
+  __v4sf x = { f[0], f[1], f[2], f[3] };
+  __v4sf y;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f[i])
+      abort ();
+
+  y = foo1 (x, z);
+  do_check (y, f, z);
+  y = foo2 (x, z);
+  do_check (y, f, z);
+  y = foo3 (x, z);
+  do_check (y, f, z);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1a.c b/gcc/testsuite/gcc.target/i386/pr88828-1a.c
new file mode 100644
index 00000000000..d37b24c6661
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1a.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo1 (__v4sf x, float f)
+{
+  __v4sf y = { f, x[1], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1b.c b/gcc/testsuite/gcc.target/i386/pr88828-1b.c
new file mode 100644
index 00000000000..af4aced65f8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1b.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__attribute__((noinline, noclone))
+__v4sf
+foo2 (__v4sf x, float f)
+{
+  return vector_init (f, x[1], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1c.c b/gcc/testsuite/gcc.target/i386/pr88828-1c.c
new file mode 100644
index 00000000000..a117f3ec7b1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1c.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo3 (__v4sf x, float f)
+{
+  __v4sf y = x;
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2.c b/gcc/testsuite/gcc.target/i386/pr88828-2.c
new file mode 100644
index 00000000000..011fd486bb1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2.c
@@ -0,0 +1,51 @@
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-2a.c"
+#include "pr88828-2c.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf y, float f[4], float z)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 0)
+      {
+	if (y[i] != z)
+	  abort ();
+      }
+    else if (i == 1)
+      {
+	if (y[i] != f[0])
+	  abort ();
+      }
+    else
+      {
+	if (y[i] != f[i])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f[4] = { -11, 2, 55553, -4 };
+  float z = 134567;
+  __v4sf x = { f[0], f[1], f[2], f[3] };
+  __v4sf y;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f[i])
+      abort ();
+
+  y = foo1 (x, z);
+  do_check (y, f, z);
+  y = foo2 (x, z);
+  do_check (y, f, z);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2a.c b/gcc/testsuite/gcc.target/i386/pr88828-2a.c
new file mode 100644
index 00000000000..85e49535ebd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2a.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo1 (__v4sf x, float f)
+{
+  __v4sf y = { f, x[0], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2b.c b/gcc/testsuite/gcc.target/i386/pr88828-2b.c
new file mode 100644
index 00000000000..adfd7002a4d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2b.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo1 (__v4sf x, float f)
+{
+  __v4sf y = { f, x[0], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2c.c b/gcc/testsuite/gcc.target/i386/pr88828-2c.c
new file mode 100644
index 00000000000..149967ea0b9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2c.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__attribute__((noinline, noclone))
+__v4sf
+foo2 (__v4sf x, float f)
+{
+  return vector_init (f, x[0], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2d.c b/gcc/testsuite/gcc.target/i386/pr88828-2d.c
new file mode 100644
index 00000000000..21088496730
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2d.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  return vector_init (f, x[0], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3.c b/gcc/testsuite/gcc.target/i386/pr88828-3.c
new file mode 100644
index 00000000000..adbc46dbf3b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3.c
@@ -0,0 +1,54 @@
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-3a.c"
+#include "pr88828-3b.c"
+#include "pr88828-3c.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf y, float f[4], float z)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 3)
+      {
+	if (y[i] != z)
+	  abort ();
+      }
+    else if (i == 0)
+      {
+	if (y[i] != f[i])
+	  abort ();
+      }
+    else
+      {
+	if (y[i] != f[i + 1])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f[4] = { -11, 2, 55553, -4 };
+  float z = 134567;
+  __v4sf x = { f[0], f[1], f[2], f[3] };
+  __v4sf y;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f[i])
+      abort ();
+
+  y = foo1 (x, z);
+  do_check (y, f, z);
+  y = foo2 (x, z);
+  do_check (y, f, z);
+  y = foo3 (x, z);
+  do_check (y, f, z);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3a.c b/gcc/testsuite/gcc.target/i386/pr88828-3a.c
new file mode 100644
index 00000000000..e5cb95c1275
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3a.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 2 } } */
+/* { dg-final { scan-assembler-times "movaps" 1 } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo1 (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], f };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3b.c b/gcc/testsuite/gcc.target/i386/pr88828-3b.c
new file mode 100644
index 00000000000..0349f35b08a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3b.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
+/* { dg-final { scan-assembler-not "vmovss" } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo2 (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], f };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3c.c b/gcc/testsuite/gcc.target/i386/pr88828-3c.c
new file mode 100644
index 00000000000..fb668a55f1d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3c.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
+/* { dg-final { scan-assembler-not "vmovss" } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__attribute__((noinline, noclone))
+__v4sf
+foo3 (__v4sf x, float f)
+{
+  return vector_init (x[0], x[2], x[3], f);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4a.c b/gcc/testsuite/gcc.target/i386/pr88828-4a.c
new file mode 100644
index 00000000000..64043b9855f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-4a.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[1] };
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4b.c b/gcc/testsuite/gcc.target/i386/pr88828-4b.c
new file mode 100644
index 00000000000..ad8d2b985d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-4b.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[1] };
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5a.c b/gcc/testsuite/gcc.target/i386/pr88828-5a.c
new file mode 100644
index 00000000000..5e908faef5c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-5a.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 2 } } */
+/* { dg-final { scan-assembler-times "movaps" 1 } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[0] };
+  y[3] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5b.c b/gcc/testsuite/gcc.target/i386/pr88828-5b.c
new file mode 100644
index 00000000000..988a48823e6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-5b.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovss" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[0] };
+  y[3] = f;
+  return y;
+}
diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c
index eeb6281c652..85d9f86288b 100644
--- a/gcc/tree-ssa-forwprop.c
+++ b/gcc/tree-ssa-forwprop.c
@@ -2008,7 +2008,7 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
   unsigned elem_size, i;
   unsigned HOST_WIDE_INT nelts;
   enum tree_code code, conv_code;
-  constructor_elt *elt;
+  constructor_elt *ce;
   bool maybe_ident;
 
   gcc_checking_assert (gimple_assign_rhs_code (stmt) == CONSTRUCTOR);
@@ -2027,18 +2027,42 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
   orig[1] = NULL;
   conv_code = ERROR_MARK;
   maybe_ident = true;
-  FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, elt)
+
+  tree rhs_vector = NULL;
+  /* The single scalar element.  */
+  tree scalar_element = NULL;
+  unsigned int scalar_idx = 0;
+  bool insert = false;
+  unsigned int nscalars = 0;
+  unsigned int nvectors = 0;
+  FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, ce)
     {
       tree ref, op1;
 
       if (i >= nelts)
 	return false;
 
-      if (TREE_CODE (elt->value) != SSA_NAME)
+      if (TREE_CODE (ce->value) != SSA_NAME)
 	return false;
-      def_stmt = get_prop_source_stmt (elt->value, false, NULL);
+      def_stmt = get_prop_source_stmt (ce->value, false, NULL);
       if (!def_stmt)
-	return false;
+	{
+	  if (gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))
+	    {
+	      /* Only allow one scalar insert.  */
+	      if (nscalars != 0)
+		return false;
+
+	      nscalars = 1;
+	      insert = true;
+	      scalar_idx = i;
+	      sel.quick_push (i);
+	      scalar_element = ce->value;
+	      continue;
+	    }
+	  else
+	    return false;
+	}
       code = gimple_assign_rhs_code (def_stmt);
       if (code == FLOAT_EXPR
 	  || code == FIX_TRUNC_EXPR)
@@ -2046,7 +2070,7 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	  op1 = gimple_assign_rhs1 (def_stmt);
 	  if (conv_code == ERROR_MARK)
 	    {
-	      if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (elt->value))),
+	      if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (ce->value))),
 			    GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1)))))
 		return false;
 	      conv_code = code;
@@ -2095,11 +2119,29 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	elt += nelts;
       if (elt != i)
 	maybe_ident = false;
+
+       if (type == TREE_TYPE (ref))
+	 {
+	   /* The RHS vector has the same type as LHS.  */
+	   if (rhs_vector == NULL)
+	     rhs_vector = ref;
+	   /* Check if all RHS vector elements come from the same
+	      vector.  */
+	   if (rhs_vector == ref)
+	     nvectors++;
+	 }
+
       sel.quick_push (elt);
     }
   if (i < nelts)
     return false;
 
+  if (insert
+      && (nvectors == 0
+	  || (TYPE_VECTOR_SUBPARTS (type).to_constant ()
+	      != (nscalars + nvectors))))
+    return false;
+
   if (! VECTOR_TYPE_P (TREE_TYPE (orig[0]))
       || maybe_ne (TYPE_VECTOR_SUBPARTS (type),
 		   TYPE_VECTOR_SUBPARTS (TREE_TYPE (orig[0]))))
@@ -2127,18 +2169,26 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 
       vec_perm_indices indices (sel, orig[1] ? 2 : 1, nelts);
       if (!can_vec_perm_const_p (TYPE_MODE (type), indices))
-	return false;
+	{
+	  if (insert)
+	    gcc_unreachable ();
+	  return false;
+	}
       mask_type
 	= build_vector_type (build_nonstandard_integer_type (elem_size, 1),
 			     nelts);
       if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_VECTOR_INT
 	  || maybe_ne (GET_MODE_SIZE (TYPE_MODE (mask_type)),
 		       GET_MODE_SIZE (TYPE_MODE (type))))
-	return false;
+	{
+	  if (insert)
+	    gcc_unreachable ();
+	  return false;
+	}
       op2 = vec_perm_indices_to_tree (mask_type, indices);
       if (!orig[1])
 	orig[1] = orig[0];
-      if (conv_code == ERROR_MARK)
+      if (conv_code == ERROR_MARK && !insert)
 	gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
 					orig[1], op2);
       else
@@ -2148,10 +2198,25 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 				   VEC_PERM_EXPR, orig[0], orig[1], op2);
 	  orig[0] = gimple_assign_lhs (perm);
 	  gsi_insert_before (gsi, perm, GSI_SAME_STMT);
-	  gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
+	  gimple_assign_set_rhs_with_ops (gsi,
+					  (conv_code != ERROR_MARK
+					   ? conv_code
+					   : NOP_EXPR),
+					  orig[0],
 					  NULL_TREE, NULL_TREE);
 	}
     }
+  if (insert)
+    {
+      /* Generate a single scalar insert.  */
+      tree var = make_ssa_name (type);
+      tree val = gimple_assign_rhs1 (stmt);
+      gimple *copy = gimple_build_assign (var, val);
+      gsi_insert_before (gsi, copy, GSI_SAME_STMT);
+      tree bitpos = bitsize_int (scalar_idx * elem_size);
+      gimple_assign_set_rhs_with_ops (gsi, BIT_INSERT_EXPR, var,
+				      scalar_element, bitpos);
+    }
   update_stmt (gsi_stmt (*gsi));
   return true;
 }
-- 
2.20.1


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: V3 [PATCH] Optimize vector constructor
  2019-03-06  7:54         ` V3 " H.J. Lu
@ 2019-03-06 13:39           ` Richard Biener
  2019-03-07  7:12             ` V4 " H.J. Lu
  0 siblings, 1 reply; 17+ messages in thread
From: Richard Biener @ 2019-03-06 13:39 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Hongtao Liu, Andrew Pinski, GCC Patches

On Wed, Mar 6, 2019 at 8:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Mar 5, 2019 at 1:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Mon, Mar 04, 2019 at 12:55:04PM +0100, Richard Biener wrote:
> > > On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:
> > > > > )
> > > > > ,On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > >
> > > > > > For vector init constructor:
> > > > > >
> > > > > > ---
> > > > > > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> > > > > >
> > > > > > __v4sf
> > > > > > foo (__v4sf x, float f)
> > > > > > {
> > > > > >   __v4sf y = { f, x[1], x[2], x[3] };
> > > > > >   return y;
> > > > > > }
> > > > > > ---
> > > > > >
> > > > > > we can optimize vector init constructor with vector copy or permute
> > > > > > followed by a single scalar insert:
> >
> > > and you want to advance to the _1 = BIT_INSERT_EXPR here.  The easiest way
> > > is to emit a new stmt for _2 = copy ...; and do the set_rhs with the
> > > BIT_INSERT_EXPR.
> >
> > Thanks for BIT_INSERT_EXPR suggestion.  I am testing this patch.
> >
> >
> > H.J.
> > ---
> > We can optimize vector constructor with vector copy or permute followed
> > by a single scalar insert:
> >
> >   __v4sf y;
> >   __v4sf D.1930;
> >   float _1;
> >   float _2;
> >   float _3;
> >
> >   <bb 2> :
> >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> >   y_6 = {f_5(D), _3, _2, _1};
> >   return y_6;
> >
> > with
> >
> >  __v4sf y;
> >   __v4sf D.1930;
> >   float _1;
> >   float _2;
> >   float _3;
> >   vector(4) float _8;
> >
> >   <bb 2> :
> >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> >   _8 = x_9(D);
> >   y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
> >   return y_6;
> >
> > gcc/
> >
> >         PR tree-optimization/88828
> >         * tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
> >         vector init constructor with vector copy or permute followed
> >         by a single scalar insert.
> >
> > gcc/testsuite/
> >
> >         PR tree-optimization/88828
> >         * gcc.target/i386/pr88828-1a.c: New test.
> >         * gcc.target/i386/pr88828-2b.c: Likewise.
> >         * gcc.target/i386/pr88828-2.c: Likewise.
> >         * gcc.target/i386/pr88828-3a.c: Likewise.
> >         * gcc.target/i386/pr88828-3b.c: Likewise.
> >         * gcc.target/i386/pr88828-3c.c: Likewise.
> >         * gcc.target/i386/pr88828-3d.c: Likewise.
> >         * gcc.target/i386/pr88828-4a.c: Likewise.
> >         * gcc.target/i386/pr88828-4b.c: Likewise.
> >         * gcc.target/i386/pr88828-5a.c: Likewise.
> >         * gcc.target/i386/pr88828-5b.c: Likewise.
> >         * gcc.target/i386/pr88828-6a.c: Likewise.
> >         * gcc.target/i386/pr88828-6b.c: Likewise.
>
> Here is the updated patch with run-time tests.

-      if (TREE_CODE (elt->value) != SSA_NAME)
+      if (TREE_CODE (ce->value) != SSA_NAME)
        return false;

hmm, so it doesn't allow { 0, v[1], v[2], v[3] }?  I think the single
scalar value can be a constant as well.

       if (!def_stmt)
-       return false;
+       {
+         if (gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))

if (SSA_NAME_IS_DEFAULT_DEF (ce->value))

+           {

also you seem to disallow

  { i + 1, v[1], v[2], v[3] }

because get_prop_source_stmt will return the definition computing
i + 1 in this case and your code will be skipped?

I think you can simplify the code by treating scalar_element != NULL
as nscalars == 1 and eliding nscalars.

-      if (conv_code == ERROR_MARK)
+      if (conv_code == ERROR_MARK && !insert)
        gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
                                        orig[1], op2);
       else
@@ -2148,10 +2198,25 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
                                   VEC_PERM_EXPR, orig[0], orig[1], op2);
          orig[0] = gimple_assign_lhs (perm);
          gsi_insert_before (gsi, perm, GSI_SAME_STMT);
-         gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
+         gimple_assign_set_rhs_with_ops (gsi,
+                                         (conv_code != ERROR_MARK
+                                          ? conv_code
+                                          : NOP_EXPR),
+                                         orig[0],
                                          NULL_TREE, NULL_TREE);

I believe you should elide the last stmt for conv_code == ERROR_MARK,
that is, why did you need to add the && !insert check in the guarding condition
(this path should already do the correct thing?).  Note that in all cases
it looks like, with conv_code != ERROR_MARK, you may end up doing a
float->int or int->float conversion on a value it wasn't done on before,
which might raise exceptions?  That is, do we need to make sure we permute
a value we already convert into the place we're going to insert into?

+  if (insert)
+    {
+      /* Generate a single scalar insert.  */
+      tree var = make_ssa_name (type);
+      tree val = gimple_assign_rhs1 (stmt);
+      gimple *copy = gimple_build_assign (var, val);

I believe this doesn't properly copy the stmt in case it is a permute.
You can use (note the use of gsi_stmt - gimple_assign_set_rhs_with_ops
can re-allocate the stmt)

        gimple *copy = gimple_copy (gsi_stmt (*gsi));
        gimple_assign_set_lhs (copy, var);

+      gsi_insert_before (gsi, copy, GSI_SAME_STMT);
+      tree bitpos = bitsize_int (scalar_idx * elem_size);
+      gimple_assign_set_rhs_with_ops (gsi, BIT_INSERT_EXPR, var,
+                                     scalar_element, bitpos);
+    }

Otherwise looks OK to me.

As separate followup patch it might be interesting to support

 { 0, a[1], a[2], 3 }

kinds as well, thus combining a VECTOR_CST (which is
reasonably cheap to create) with another vector.  That should
be maybe done as a first patch given this is just a two-vector
permute which the code already handles apart from not
recognizing the implicit constant vector participating.

Similar

 { 0, a[1], b[2], 3 }

where the combination of a and b is blended with another
constant vector.  I'm not sure if handling an arbitrary number
of scalar elements should be done in a similar way, that is,
implementing

 { s1, a[1], a[2], s2, s3, b[0], b[1], b[2] }

as

  tem = VEC_PERM <a, b, { ... }>
  tem2 = { s1, 0, 0, s2, s3, 0, 0, 0 }
  res = VEC_PERM <tem, tem2, { blend-mask }>

where constructing tem2 should take at most
N-1 inserts (the first element to insert into tem2
can use a splat or if element zero a zero-extending move).

Doing this effectively lifts the restriction of only
handling two vectors - we'd incrementally do
two-vector permute plus blend of the rest which has
its constructor re-processed.

But as said - the code is already a bit awkward so changing
this in multiple revisions is preferred and the single-element
case is certainly something to do via a BIT_INSERT_EXPR.

Thanks,
Richard.

> --
> H.J.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* V4 [PATCH] Optimize vector constructor
  2019-03-06 13:39           ` Richard Biener
@ 2019-03-07  7:12             ` H.J. Lu
  2019-03-08  9:56               ` V5 " H.J. Lu
  0 siblings, 1 reply; 17+ messages in thread
From: H.J. Lu @ 2019-03-07  7:12 UTC (permalink / raw)
  To: Richard Biener; +Cc: Hongtao Liu, Andrew Pinski, GCC Patches

[-- Attachment #1: Type: text/plain, Size: 9241 bytes --]

On Wed, Mar 6, 2019 at 8:33 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Wed, Mar 6, 2019 at 8:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Tue, Mar 5, 2019 at 1:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Mon, Mar 04, 2019 at 12:55:04PM +0100, Richard Biener wrote:
> > > > On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:
> > > > > > )
> > > > > > ,On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > >
> > > > > > > For vector init constructor:
> > > > > > >
> > > > > > > ---
> > > > > > > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> > > > > > >
> > > > > > > __v4sf
> > > > > > > foo (__v4sf x, float f)
> > > > > > > {
> > > > > > >   __v4sf y = { f, x[1], x[2], x[3] };
> > > > > > >   return y;
> > > > > > > }
> > > > > > > ---
> > > > > > >
> > > > > > > we can optimize vector init constructor with vector copy or permute
> > > > > > > followed by a single scalar insert:
> > >
> > > > and you want to advance to the _1 = BIT_INSERT_EXPR here.  The easiest way
> > > > is to emit a new stmt for _2 = copy ...; and do the set_rhs with the
> > > > BIT_INSERT_EXPR.
> > >
> > > Thanks for BIT_INSERT_EXPR suggestion.  I am testing this patch.
> > >
> > >
> > > H.J.
> > > ---
> > > We can optimize vector constructor with vector copy or permute followed
> > > by a single scalar insert:
> > >
> > >   __v4sf y;
> > >   __v4sf D.1930;
> > >   float _1;
> > >   float _2;
> > >   float _3;
> > >
> > >   <bb 2> :
> > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > >   y_6 = {f_5(D), _3, _2, _1};
> > >   return y_6;
> > >
> > > with
> > >
> > >  __v4sf y;
> > >   __v4sf D.1930;
> > >   float _1;
> > >   float _2;
> > >   float _3;
> > >   vector(4) float _8;
> > >
> > >   <bb 2> :
> > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > >   _8 = x_9(D);
> > >   y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
> > >   return y_6;
> > >
> > > gcc/
> > >
> > >         PR tree-optimization/88828
> > >         * tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
> > >         vector init constructor with vector copy or permute followed
> > >         by a single scalar insert.
> > >
> > > gcc/testsuite/
> > >
> > >         PR tree-optimization/88828
> > >         * gcc.target/i386/pr88828-1a.c: New test.
> > >         * gcc.target/i386/pr88828-2b.c: Likewise.
> > >         * gcc.target/i386/pr88828-2.c: Likewise.
> > >         * gcc.target/i386/pr88828-3a.c: Likewise.
> > >         * gcc.target/i386/pr88828-3b.c: Likewise.
> > >         * gcc.target/i386/pr88828-3c.c: Likewise.
> > >         * gcc.target/i386/pr88828-3d.c: Likewise.
> > >         * gcc.target/i386/pr88828-4a.c: Likewise.
> > >         * gcc.target/i386/pr88828-4b.c: Likewise.
> > >         * gcc.target/i386/pr88828-5a.c: Likewise.
> > >         * gcc.target/i386/pr88828-5b.c: Likewise.
> > >         * gcc.target/i386/pr88828-6a.c: Likewise.
> > >         * gcc.target/i386/pr88828-6b.c: Likewise.
> >
> > Here is the updated patch with run-time tests.
>
> -      if (TREE_CODE (elt->value) != SSA_NAME)
> +      if (TREE_CODE (ce->value) != SSA_NAME)
>         return false;
>
> hmm, so it doesn't allow { 0, v[1], v[2], v[3] }?  I think the single
> scalar value can be a constant as well.

Fixed.

>        if (!def_stmt)
> -       return false;
> +       {
> +         if (gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))
>
> if (SSA_NAME_IS_DEFAULT_DEF (ce->value))
>
> +           {
>
> also you seem to disallow
>
>   { i + 1, v[1], v[2], v[3] }

Fixed by

     if (code != BIT_FIELD_REF)
        {
          /* Only allow one scalar insert.  */
          if (nscalars != 0)
            return false;

          nscalars = 1;
          insert = true;
          scalar_idx = i;
          sel.quick_push (i);
          scalar_element = ce->value;
          continue;
        }

> because get_prop_source_stmt will return the definition computing
> i + 1 in this case and your code will be skipped?
>
> I think you can simplify the code by treating scalar_element != NULL
> as nscalars == 1 and eliding nscalars.

It works only if

TYPE_VECTOR_SUBPARTS (type).to_constant ()  == (nscalars + nvectors)

We need to check both nscalars and nvectors.  Eliding the nscalars
check doesn't help much here.

> -      if (conv_code == ERROR_MARK)
> +      if (conv_code == ERROR_MARK && !insert)
>         gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
>                                         orig[1], op2);
>        else
> @@ -2148,10 +2198,25 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
>                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
>           orig[0] = gimple_assign_lhs (perm);
>           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> -         gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
> +         gimple_assign_set_rhs_with_ops (gsi,
> +                                         (conv_code != ERROR_MARK
> +                                          ? conv_code
> +                                          : NOP_EXPR),
> +                                         orig[0],
>                                           NULL_TREE, NULL_TREE);
>
> I believe you should elide the last stmt for conv_code == ERROR_MARK,
> that is, why did you need to add the && !insert check in the guarding condition

When conv_code == ERROR_MARK, we still need

       gimple *perm
            = gimple_build_assign (make_ssa_name (TREE_TYPE (orig[0])),
                                   VEC_PERM_EXPR, orig[0], orig[1], op2);
          orig[0] = gimple_assign_lhs (perm);
          gsi_insert_before (gsi, perm, GSI_SAME_STMT);
          gimple_assign_set_rhs_with_ops (gsi,  NOP_EXPR,
                                          orig[0],
                                          NULL_TREE, NULL_TREE);

Otherwise, scalar insert won't work.

> (this path should already do the correct thing?).  Note that in all
> cases it looks
> that with conv_code != ERROR_MARK you may end up doing a float->int
> or int->float conversion on a value it wasn't done on before which might
> raise exceptions?  That is, do we need to make sure we permute a
> value we already do convert into the place we're going to insert to?

This couldn't happen:

      if (type == TREE_TYPE (ref))
         {
           /* The RHS vector has the same type as LHS.  */
           if (rhs_vector == NULL)
             rhs_vector = ref;
           /* Check if all RHS vector elements come from the same
              vector.  */
           if (rhs_vector == ref)
             nvectors++;
         }
...
  if (insert
      && (nvectors == 0
          || (TYPE_VECTOR_SUBPARTS (type).to_constant ()
              != (nscalars + nvectors))))
    return false;

> +  if (insert)
> +    {
> +      /* Generate a single scalar insert.  */
> +      tree var = make_ssa_name (type);
> +      tree val = gimple_assign_rhs1 (stmt);
> +      gimple *copy = gimple_build_assign (var, val);
>
> I believe this doesn't properly copy the stmt in case it is a permute.
> You can use (note the use of gsi_stmt - gimple_assign_set_rhs_with_ops
> can re-allocate the stmt)
>
>         gimple *copy = gimple_copy (gsi_stmt (*gsi));
>         gimple_assign_set_lhs (copy, var);

Fixed.

> +      gsi_insert_before (gsi, copy, GSI_SAME_STMT);
> +      tree bitpos = bitsize_int (scalar_idx * elem_size);
> +      gimple_assign_set_rhs_with_ops (gsi, BIT_INSERT_EXPR, var,
> +                                     scalar_element, bitpos);
> +    }
>
> Otherwise looks OK to me.
>
> As separate followup patch it might be interesting to support
>
>  { 0, a[1], a[2], 3 }
>
> kinds as well, thus combining a VECTOR_CST (which is
> reasonably cheap to create) with another vector.  That should
> be maybe done as a first patch given this is just a two-vector
> permute which the code already handles apart from not
> recognizing the implicit constant vector participating.
>
> Similar
>
>  { 0, a[1], b[2], 3 }
>
> where the combination of a and b is blended with another
> constant vector.  I'm not sure if handling an arbitrary number
> of scalar elements should be done in a similar way, that is,
> implementing
>
>  { s1, a[1], a[2], s2, s3, b[0], b[1], b[2] }
>
> as
>
>   tem = VEC_PERM <a, b, { ... }>
>   tem2 = { s1, 0, 0, s2, s3, 0, 0, 0 }
>   res = VEC_PERM <tem, tem2, { blend-mask }>
>
> where constructing tem2 should take at most
> N-1 inserts (the first element to insert into tem2
> can use a splat or if element zero a zero-extending move).
>
> Doing this effectively lifts the restriction of only
> handling two vectors - we'd incrementally do
> two-vector permute plus blend of the rest which has
> its constructor re-processed.
>
> But as said - the code is already a bit awkward so changing
> this in multiple revisions is preferred and the single-element
> case is certainly something to do via a BIT_INSERT_EXPR.

Agreed.

I am testing this updated patch.

-- 
H.J.

[-- Attachment #2: 0001-Optimize-vector-constructor.patch --]
[-- Type: text/x-patch, Size: 33477 bytes --]

From 91e060c13b3d729772c530640ee1e692d80667e9 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Tue, 5 Feb 2019 15:39:27 -0800
Subject: [PATCH] Optimize vector constructor

We can optimize a vector constructor into a vector copy or permute
followed by a single scalar insert, replacing:

  __v4sf y;
  __v4sf D.1930;
  float _1;
  float _2;
  float _3;

  <bb 2> :
  _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
  _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
  _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
  y_6 = {f_5(D), _3, _2, _1};
  return y_6;

with

 __v4sf y;
  __v4sf D.1930;
  float _1;
  float _2;
  float _3;
  vector(4) float _8;

  <bb 2> :
  _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
  _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
  _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
  _8 = x_9(D);
  y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
  return y_6;

gcc/

	PR tree-optimization/88828
	* tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
	vector init constructor with vector copy or permute followed
	by a single scalar insert.

gcc/testsuite/

	PR tree-optimization/88828
	* gcc.target/i386/pr88828-1.c: New test.
	* gcc.target/i386/pr88828-1a.c: Likewise.
	* gcc.target/i386/pr88828-1b.c: Likewise.
	* gcc.target/i386/pr88828-1c.c: Likewise.
	* gcc.target/i386/pr88828-2.c: Likewise.
	* gcc.target/i386/pr88828-2a.c: Likewise.
	* gcc.target/i386/pr88828-2b.c: Likewise.
	* gcc.target/i386/pr88828-2c.c: Likewise.
	* gcc.target/i386/pr88828-2d.c: Likewise.
	* gcc.target/i386/pr88828-3.c: Likewise.
	* gcc.target/i386/pr88828-3a.c: Likewise.
	* gcc.target/i386/pr88828-3b.c: Likewise.
	* gcc.target/i386/pr88828-3c.c: Likewise.
	* gcc.target/i386/pr88828-3d.c: Likewise.
	* gcc.target/i386/pr88828-4a.c: Likewise.
	* gcc.target/i386/pr88828-4b.c: Likewise.
	* gcc.target/i386/pr88828-5a.c: Likewise.
	* gcc.target/i386/pr88828-5b.c: Likewise.
	* gcc.target/i386/pr88828-6.c: Likewise.
	* gcc.target/i386/pr88828-6a.c: Likewise.
	* gcc.target/i386/pr88828-6b.c: Likewise.
	* gcc.target/i386/pr88828-7.c: Likewise.
	* gcc.target/i386/pr88828-7a.c: Likewise.
	* gcc.target/i386/pr88828-7b.c: Likewise.
	* gcc.target/i386/pr88828-8.c: Likewise.
	* gcc.target/i386/pr88828-8a.c: Likewise.
	* gcc.target/i386/pr88828-8b.c: Likewise.
---
 gcc/testsuite/gcc.target/i386/pr88828-1.c  | 49 +++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-1a.c | 17 ++++
 gcc/testsuite/gcc.target/i386/pr88828-1b.c | 23 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-1c.c | 18 ++++
 gcc/testsuite/gcc.target/i386/pr88828-2.c  | 51 ++++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-2a.c | 17 ++++
 gcc/testsuite/gcc.target/i386/pr88828-2b.c | 19 +++++
 gcc/testsuite/gcc.target/i386/pr88828-2c.c | 23 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-2d.c | 25 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-3.c  | 54 ++++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-3a.c | 17 ++++
 gcc/testsuite/gcc.target/i386/pr88828-3b.c | 19 +++++
 gcc/testsuite/gcc.target/i386/pr88828-3c.c | 25 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-4a.c | 18 ++++
 gcc/testsuite/gcc.target/i386/pr88828-4b.c | 21 +++++
 gcc/testsuite/gcc.target/i386/pr88828-5a.c | 18 ++++
 gcc/testsuite/gcc.target/i386/pr88828-5b.c | 20 +++++
 gcc/testsuite/gcc.target/i386/pr88828-6.c  | 47 +++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-6a.c | 16 ++++
 gcc/testsuite/gcc.target/i386/pr88828-6b.c | 22 +++++
 gcc/testsuite/gcc.target/i386/pr88828-7.c  | 53 ++++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-7a.c | 16 ++++
 gcc/testsuite/gcc.target/i386/pr88828-7b.c | 22 +++++
 gcc/testsuite/gcc.target/i386/pr88828-8.c  | 46 +++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-8a.c | 15 ++++
 gcc/testsuite/gcc.target/i386/pr88828-8b.c | 21 +++++
 gcc/tree-ssa-forwprop.c                    | 96 +++++++++++++++++++---
 27 files changed, 776 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2d.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-7.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-7a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-7b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-8.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-8a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-8b.c

diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1.c b/gcc/testsuite/gcc.target/i386/pr88828-1.c
new file mode 100644
index 00000000000..a15d1fea3f5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1.c
@@ -0,0 +1,49 @@
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-1a.c"
+#include "pr88828-1b.c"
+#include "pr88828-1c.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf y, float f[4], float z)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 0)
+      {
+	if (y[i] != z)
+	  abort ();
+      }
+    else
+      {
+	if (y[i] != f[i])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f[4] = { -11, 2, 55553, -4 };
+  float z = 134567;
+  __v4sf x = { f[0], f[1], f[2], f[3] };
+  __v4sf y;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f[i])
+      abort ();
+
+  y = foo1 (x, z);
+  do_check (y, f, z);
+  y = foo2 (x, z);
+  do_check (y, f, z);
+  y = foo3 (x, z);
+  do_check (y, f, z);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1a.c b/gcc/testsuite/gcc.target/i386/pr88828-1a.c
new file mode 100644
index 00000000000..d37b24c6661
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1a.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo1 (__v4sf x, float f)
+{
+  __v4sf y = { f, x[1], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1b.c b/gcc/testsuite/gcc.target/i386/pr88828-1b.c
new file mode 100644
index 00000000000..af4aced65f8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1b.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__attribute__((noinline, noclone))
+__v4sf
+foo2 (__v4sf x, float f)
+{
+  return vector_init (f, x[1], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1c.c b/gcc/testsuite/gcc.target/i386/pr88828-1c.c
new file mode 100644
index 00000000000..a117f3ec7b1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1c.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo3 (__v4sf x, float f)
+{
+  __v4sf y = x;
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2.c b/gcc/testsuite/gcc.target/i386/pr88828-2.c
new file mode 100644
index 00000000000..011fd486bb1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2.c
@@ -0,0 +1,51 @@
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-2a.c"
+#include "pr88828-2c.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf y, float f[4], float z)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 0)
+      {
+	if (y[i] != z)
+	  abort ();
+      }
+    else if (i == 1)
+      {
+	if (y[i] != f[0])
+	  abort ();
+      }
+    else
+      {
+	if (y[i] != f[i])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f[4] = { -11, 2, 55553, -4 };
+  float z = 134567;
+  __v4sf x = { f[0], f[1], f[2], f[3] };
+  __v4sf y;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f[i])
+      abort ();
+
+  y = foo1 (x, z);
+  do_check (y, f, z);
+  y = foo2 (x, z);
+  do_check (y, f, z);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2a.c b/gcc/testsuite/gcc.target/i386/pr88828-2a.c
new file mode 100644
index 00000000000..85e49535ebd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2a.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo1 (__v4sf x, float f)
+{
+  __v4sf y = { f, x[0], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2b.c b/gcc/testsuite/gcc.target/i386/pr88828-2b.c
new file mode 100644
index 00000000000..adfd7002a4d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2b.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo1 (__v4sf x, float f)
+{
+  __v4sf y = { f, x[0], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2c.c b/gcc/testsuite/gcc.target/i386/pr88828-2c.c
new file mode 100644
index 00000000000..149967ea0b9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2c.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__attribute__((noinline, noclone))
+__v4sf
+foo2 (__v4sf x, float f)
+{
+  return vector_init (f, x[0], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2d.c b/gcc/testsuite/gcc.target/i386/pr88828-2d.c
new file mode 100644
index 00000000000..21088496730
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2d.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  return vector_init (f, x[0], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3.c b/gcc/testsuite/gcc.target/i386/pr88828-3.c
new file mode 100644
index 00000000000..adbc46dbf3b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3.c
@@ -0,0 +1,54 @@
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-3a.c"
+#include "pr88828-3b.c"
+#include "pr88828-3c.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf y, float f[4], float z)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 3)
+      {
+	if (y[i] != z)
+	  abort ();
+      }
+    else if (i == 0)
+      {
+	if (y[i] != f[i])
+	  abort ();
+      }
+    else
+      {
+	if (y[i] != f[i + 1])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f[4] = { -11, 2, 55553, -4 };
+  float z = 134567;
+  __v4sf x = { f[0], f[1], f[2], f[3] };
+  __v4sf y;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f[i])
+      abort ();
+
+  y = foo1 (x, z);
+  do_check (y, f, z);
+  y = foo2 (x, z);
+  do_check (y, f, z);
+  y = foo3 (x, z);
+  do_check (y, f, z);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3a.c b/gcc/testsuite/gcc.target/i386/pr88828-3a.c
new file mode 100644
index 00000000000..e5cb95c1275
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3a.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 2 } } */
+/* { dg-final { scan-assembler-times "movaps" 1 } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo1 (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], f };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3b.c b/gcc/testsuite/gcc.target/i386/pr88828-3b.c
new file mode 100644
index 00000000000..0349f35b08a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3b.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
+/* { dg-final { scan-assembler-not "vmovss" } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo2 (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], f };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3c.c b/gcc/testsuite/gcc.target/i386/pr88828-3c.c
new file mode 100644
index 00000000000..fb668a55f1d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3c.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
+/* { dg-final { scan-assembler-not "vmovss" } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__attribute__((noinline, noclone))
+__v4sf
+foo3 (__v4sf x, float f)
+{
+  return vector_init (x[0], x[2], x[3], f);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4a.c b/gcc/testsuite/gcc.target/i386/pr88828-4a.c
new file mode 100644
index 00000000000..64043b9855f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-4a.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[1] };
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4b.c b/gcc/testsuite/gcc.target/i386/pr88828-4b.c
new file mode 100644
index 00000000000..ad8d2b985d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-4b.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[1] };
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5a.c b/gcc/testsuite/gcc.target/i386/pr88828-5a.c
new file mode 100644
index 00000000000..5e908faef5c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-5a.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 2 } } */
+/* { dg-final { scan-assembler-times "movaps" 1 } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[0] };
+  y[3] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5b.c b/gcc/testsuite/gcc.target/i386/pr88828-5b.c
new file mode 100644
index 00000000000..988a48823e6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-5b.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovss" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[0] };
+  y[3] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6.c b/gcc/testsuite/gcc.target/i386/pr88828-6.c
new file mode 100644
index 00000000000..8d920396896
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-6.c
@@ -0,0 +1,47 @@
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-6a.c"
+#include "pr88828-6b.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf x, float f1[4], float f2[4])
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 0)
+      {
+	if (x[i] != (f1[i] + f2[i]))
+	  abort ();
+      }
+    else
+      {
+	if (x[i] != f1[i])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f1[4] = { -11, 2, 55553, -4 };
+  float f2[4] = { 111, 3.3, -55.553, 4.8 };
+  __v4sf x = { f1[0], f1[1], f1[2], f1[3] };
+  __v4sf y = { f2[0], f2[1], f2[2], f2[3] };
+  __v4sf z;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f1[i] || y[i] != f2[i] )
+      abort ();
+
+  z = foo1 (x, y);
+  do_check (z, f1, f2);
+  z = foo2 (x, y);
+  do_check (z, f1, f2);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6a.c b/gcc/testsuite/gcc.target/i386/pr88828-6a.c
new file mode 100644
index 00000000000..4094f25a1fb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-6a.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "addss" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo1 (__v4sf x, __v4sf y)
+{
+  __v4sf z = { x[0] + y[0], x[1], x[2], x[3] };
+  return z;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6b.c b/gcc/testsuite/gcc.target/i386/pr88828-6b.c
new file mode 100644
index 00000000000..a423a089963
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-6b.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "addss" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__v4sf
+foo2 (__v4sf x, __v4sf y)
+{
+  return vector_init (x[0] + y[0], x[1], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-7.c b/gcc/testsuite/gcc.target/i386/pr88828-7.c
new file mode 100644
index 00000000000..471028d417d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-7.c
@@ -0,0 +1,53 @@
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-7a.c"
+#include "pr88828-7b.c"
+
+extern void abort ();
+
+float
+bar (float x, float y)
+{
+  return x / y - y * x;
+}
+
+void
+do_check (__v4sf x, float f1[4], float f2[4])
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 0)
+      {
+	if (x[i] != bar (f1[i], f2[i]))
+	  abort ();
+      }
+    else
+      {
+	if (x[i] != f1[i])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f1[4] = { -11, 2, 55553, -4 };
+  float f2[4] = { 111, 3.3, -55.553, 4.8 };
+  __v4sf x = { f1[0], f1[1], f1[2], f1[3] };
+  __v4sf y = { f2[0], f2[1], f2[2], f2[3] };
+  __v4sf z;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f1[i] || y[i] != f2[i] )
+      abort ();
+
+  z = foo1 (x, y);
+  do_check (z, f1, f2);
+  z = foo2 (x, y);
+  do_check (z, f1, f2);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-7a.c b/gcc/testsuite/gcc.target/i386/pr88828-7a.c
new file mode 100644
index 00000000000..f1ae57422d9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-7a.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+extern float bar (float, float);
+
+__v4sf
+foo1 (__v4sf x, __v4sf y)
+{
+  __v4sf z = { bar (x[0], y[0]), x[1], x[2], x[3] };
+  return z;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-7b.c b/gcc/testsuite/gcc.target/i386/pr88828-7b.c
new file mode 100644
index 00000000000..c027c56948d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-7b.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+extern float bar (float, float);
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__v4sf
+foo2 (__v4sf x, __v4sf y)
+{
+  return vector_init (bar (x[0], y[0]), x[1], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-8.c b/gcc/testsuite/gcc.target/i386/pr88828-8.c
new file mode 100644
index 00000000000..3b8eabd225f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-8.c
@@ -0,0 +1,46 @@
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-8a.c"
+#include "pr88828-8b.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf y, float f[4], float z)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 0)
+      {
+	if (y[i] != z)
+	  abort ();
+      }
+    else
+      {
+	if (y[i] != f[i])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f[4] = { -11, 2, 55553, -4 };
+  float z = 11.4;
+  __v4sf x = { f[0], f[1], f[2], f[3] };
+  __v4sf y;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f[i])
+      abort ();
+
+  y = foo1 (x);
+  do_check (y, f, z);
+  y = foo2 (x);
+  do_check (y, f, z);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-8a.c b/gcc/testsuite/gcc.target/i386/pr88828-8a.c
new file mode 100644
index 00000000000..5d383dfd081
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-8a.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo1 (__v4sf x)
+{
+  __v4sf z = { 11.4, x[1], x[2], x[3] };
+  return z;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-8b.c b/gcc/testsuite/gcc.target/i386/pr88828-8b.c
new file mode 100644
index 00000000000..5ffbc9c3103
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-8b.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__v4sf
+foo2 (__v4sf x)
+{
+  return vector_init (11.4, x[1], x[2], x[3]) ;
+}
diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c
index eeb6281c652..f40eaffdcc1 100644
--- a/gcc/tree-ssa-forwprop.c
+++ b/gcc/tree-ssa-forwprop.c
@@ -2008,7 +2008,7 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
   unsigned elem_size, i;
   unsigned HOST_WIDE_INT nelts;
   enum tree_code code, conv_code;
-  constructor_elt *elt;
+  constructor_elt *ce;
   bool maybe_ident;
 
   gcc_checking_assert (gimple_assign_rhs_code (stmt) == CONSTRUCTOR);
@@ -2027,18 +2027,38 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
   orig[1] = NULL;
   conv_code = ERROR_MARK;
   maybe_ident = true;
-  FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, elt)
+
+  tree rhs_vector = NULL;
+  /* The single scalar element.  */
+  tree scalar_element = NULL;
+  unsigned int scalar_idx = 0;
+  bool insert = false;
+  unsigned int nscalars = 0;
+  unsigned int nvectors = 0;
+  FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, ce)
     {
       tree ref, op1;
 
       if (i >= nelts)
 	return false;
 
-      if (TREE_CODE (elt->value) != SSA_NAME)
-	return false;
-      def_stmt = get_prop_source_stmt (elt->value, false, NULL);
+      if (TREE_CODE (ce->value) == SSA_NAME)
+	def_stmt = get_prop_source_stmt (ce->value, false, NULL);
+      else
+	def_stmt = NULL;
       if (!def_stmt)
-	return false;
+	{
+	  /* Only allow one scalar insert.  */
+	  if (nscalars != 0)
+	    return false;
+
+	  nscalars = 1;
+	  insert = true;
+	  scalar_idx = i;
+	  sel.quick_push (i);
+	  scalar_element = ce->value;
+	  continue;
+	}
       code = gimple_assign_rhs_code (def_stmt);
       if (code == FLOAT_EXPR
 	  || code == FIX_TRUNC_EXPR)
@@ -2046,7 +2066,7 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	  op1 = gimple_assign_rhs1 (def_stmt);
 	  if (conv_code == ERROR_MARK)
 	    {
-	      if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (elt->value))),
+	      if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (ce->value))),
 			    GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1)))))
 		return false;
 	      conv_code = code;
@@ -2061,7 +2081,18 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	  code = gimple_assign_rhs_code (def_stmt);
 	}
       if (code != BIT_FIELD_REF)
-	return false;
+	{
+	  /* Only allow one scalar insert.  */
+	  if (nscalars != 0)
+	    return false;
+
+	  nscalars = 1;
+	  insert = true;
+	  scalar_idx = i;
+	  sel.quick_push (i);
+	  scalar_element = ce->value;
+	  continue;
+	}
       op1 = gimple_assign_rhs1 (def_stmt);
       ref = TREE_OPERAND (op1, 0);
       unsigned int j;
@@ -2095,11 +2126,29 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	elt += nelts;
       if (elt != i)
 	maybe_ident = false;
+
+       if (type == TREE_TYPE (ref))
+	 {
+	   /* The RHS vector has the same type as LHS.  */
+	   if (rhs_vector == NULL)
+	     rhs_vector = ref;
+	   /* Check if all RHS vector elements come from the same
+	      vector.  */
+	   if (rhs_vector == ref)
+	     nvectors++;
+	 }
+
       sel.quick_push (elt);
     }
   if (i < nelts)
     return false;
 
+  if (insert
+      && (nvectors == 0
+	  || (TYPE_VECTOR_SUBPARTS (type).to_constant ()
+	      != (nscalars + nvectors))))
+    return false;
+
   if (! VECTOR_TYPE_P (TREE_TYPE (orig[0]))
       || maybe_ne (TYPE_VECTOR_SUBPARTS (type),
 		   TYPE_VECTOR_SUBPARTS (TREE_TYPE (orig[0]))))
@@ -2127,18 +2176,26 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 
       vec_perm_indices indices (sel, orig[1] ? 2 : 1, nelts);
       if (!can_vec_perm_const_p (TYPE_MODE (type), indices))
-	return false;
+	{
+	  if (insert)
+	    gcc_unreachable ();
+	  return false;
+	}
       mask_type
 	= build_vector_type (build_nonstandard_integer_type (elem_size, 1),
 			     nelts);
       if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_VECTOR_INT
 	  || maybe_ne (GET_MODE_SIZE (TYPE_MODE (mask_type)),
 		       GET_MODE_SIZE (TYPE_MODE (type))))
-	return false;
+	{
+	  if (insert)
+	    gcc_unreachable ();
+	  return false;
+	}
       op2 = vec_perm_indices_to_tree (mask_type, indices);
       if (!orig[1])
 	orig[1] = orig[0];
-      if (conv_code == ERROR_MARK)
+      if (conv_code == ERROR_MARK && !insert)
 	gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
 					orig[1], op2);
       else
@@ -2148,10 +2205,25 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 				   VEC_PERM_EXPR, orig[0], orig[1], op2);
 	  orig[0] = gimple_assign_lhs (perm);
 	  gsi_insert_before (gsi, perm, GSI_SAME_STMT);
-	  gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
+	  gimple_assign_set_rhs_with_ops (gsi,
+					  (conv_code != ERROR_MARK
+					   ? conv_code
+					   : NOP_EXPR),
+					  orig[0],
 					  NULL_TREE, NULL_TREE);
 	}
     }
+  if (insert)
+    {
+      /* Generate a single scalar insert.  */
+      tree var = make_ssa_name (type);
+      gimple *copy = gimple_copy (gsi_stmt (*gsi));
+      gimple_assign_set_lhs (copy, var);
+      gsi_insert_before (gsi, copy, GSI_SAME_STMT);
+      tree bitpos = bitsize_int (scalar_idx * elem_size);
+      gimple_assign_set_rhs_with_ops (gsi, BIT_INSERT_EXPR, var,
+				      scalar_element, bitpos);
+    }
   update_stmt (gsi_stmt (*gsi));
   return true;
 }
-- 
2.20.1


^ permalink raw reply	[flat|nested] 17+ messages in thread

* V5 [PATCH] Optimize vector constructor
  2019-03-07  7:12             ` V4 " H.J. Lu
@ 2019-03-08  9:56               ` H.J. Lu
  2019-03-08 11:23                 ` Richard Biener
  0 siblings, 1 reply; 17+ messages in thread
From: H.J. Lu @ 2019-03-08  9:56 UTC (permalink / raw)
  To: Richard Biener; +Cc: Hongtao Liu, Andrew Pinski, GCC Patches

[-- Attachment #1: Type: text/plain, Size: 10236 bytes --]

On Thu, Mar 7, 2019 at 9:51 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Mar 6, 2019 at 8:33 PM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Wed, Mar 6, 2019 at 8:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Tue, Mar 5, 2019 at 1:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Mon, Mar 04, 2019 at 12:55:04PM +0100, Richard Biener wrote:
> > > > > On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > >
> > > > > > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:
> > > > > > > )
> > > > > > > ,On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > >
> > > > > > > > For vector init constructor:
> > > > > > > >
> > > > > > > > ---
> > > > > > > > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> > > > > > > >
> > > > > > > > __v4sf
> > > > > > > > foo (__v4sf x, float f)
> > > > > > > > {
> > > > > > > >   __v4sf y = { f, x[1], x[2], x[3] };
> > > > > > > >   return y;
> > > > > > > > }
> > > > > > > > ---
> > > > > > > >
> > > > > > > > we can optimize vector init constructor with vector copy or permute
> > > > > > > > followed by a single scalar insert:
> > > >
> > > > > and you want to advance to the _1 = BIT_INSERT_EXPR here.  The easiest way
> > > > > is to emit a new stmt for _2 = copy ...; and do the set_rhs with the
> > > > > BIT_INSERT_EXPR.
> > > >
> > > > Thanks for BIT_INSERT_EXPR suggestion.  I am testing this patch.
> > > >
> > > >
> > > > H.J.
> > > > ---
> > > > We can optimize vector constructor with vector copy or permute followed
> > > > by a single scalar insert:
> > > >
> > > >   __v4sf y;
> > > >   __v4sf D.1930;
> > > >   float _1;
> > > >   float _2;
> > > >   float _3;
> > > >
> > > >   <bb 2> :
> > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > >   y_6 = {f_5(D), _3, _2, _1};
> > > >   return y_6;
> > > >
> > > > with
> > > >
> > > >  __v4sf y;
> > > >   __v4sf D.1930;
> > > >   float _1;
> > > >   float _2;
> > > >   float _3;
> > > >   vector(4) float _8;
> > > >
> > > >   <bb 2> :
> > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > >   _8 = x_9(D);
> > > >   y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
> > > >   return y_6;
> > > >
> > > > gcc/
> > > >
> > > >         PR tree-optimization/88828
> > > >         * tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
> > > >         vector init constructor with vector copy or permute followed
> > > >         by a single scalar insert.
> > > >
> > > > gcc/testsuite/
> > > >
> > > >         PR tree-optimization/88828
> > > >         * gcc.target/i386/pr88828-1a.c: New test.
> > > >         * gcc.target/i386/pr88828-2b.c: Likewise.
> > > >         * gcc.target/i386/pr88828-2.c: Likewise.
> > > >         * gcc.target/i386/pr88828-3a.c: Likewise.
> > > >         * gcc.target/i386/pr88828-3b.c: Likewise.
> > > >         * gcc.target/i386/pr88828-3c.c: Likewise.
> > > >         * gcc.target/i386/pr88828-3d.c: Likewise.
> > > >         * gcc.target/i386/pr88828-4a.c: Likewise.
> > > >         * gcc.target/i386/pr88828-4b.c: Likewise.
> > > >         * gcc.target/i386/pr88828-5a.c: Likewise.
> > > >         * gcc.target/i386/pr88828-5b.c: Likewise.
> > > >         * gcc.target/i386/pr88828-6a.c: Likewise.
> > > >         * gcc.target/i386/pr88828-6b.c: Likewise.
> > >
> > > Here is the updated patch with run-time tests.
> >
> > -      if (TREE_CODE (elt->value) != SSA_NAME)
> > +      if (TREE_CODE (ce->value) != SSA_NAME)
> >         return false;
> >
> > hmm, so it doesn't allow { 0, v[1], v[2], v[3] }?  I think the single
> > scalar value can be a constant as well.
>
> Fixed.
>
> >        if (!def_stmt)
> > -       return false;
> > +       {
> > +         if (gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))
> >
> > if (SSA_NAME_IS_DEFAULT_DEF (ce->value))
> >
> > +           {
> >
> > also you seem to disallow
> >
> >   { i + 1, v[1], v[2], v[3] }
>
> Fixed by
>
>      if (code != BIT_FIELD_REF)
>         {
>           /* Only allow one scalar insert.  */
>           if (nscalars != 0)
>             return false;
>
>           nscalars = 1;
>           insert = true;
>           scalar_idx = i;
>           sel.quick_push (i);
>           scalar_element = ce->value;
>           continue;
>         }
>
> > because get_prop_source_stmt will return the definition computing
> > i + 1 in this case and your code will be skipped?
> >
> > I think you can simplify the code by treating scalar_element != NULL
> > as nscalars == 1 and eliding nscalars.
>
> It works only if
>
> TYPE_VECTOR_SUBPARTS (type).to_constant ()  == (nscalars + nvectors)
>
> We need to check both nscalars and nvectors.  Eliding the nscalars
> check doesn't help much here.
>
> > -      if (conv_code == ERROR_MARK)
> > +      if (conv_code == ERROR_MARK && !insert)
> >         gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
> >                                         orig[1], op2);
> >        else
> > @@ -2148,10 +2198,25 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
> >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> >           orig[0] = gimple_assign_lhs (perm);
> >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > -         gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
> > +         gimple_assign_set_rhs_with_ops (gsi,
> > +                                         (conv_code != ERROR_MARK
> > +                                          ? conv_code
> > +                                          : NOP_EXPR),
> > +                                         orig[0],
> >                                           NULL_TREE, NULL_TREE);
> >
> > I believe you should elide the last stmt for conv_code == ERROR_MARK,
> > that is, why did you need to add the && !insert check in the guarding condition
>
> When conv_code == ERROR_MARK, we still need
>
>        gimple *perm
>             = gimple_build_assign (make_ssa_name (TREE_TYPE (orig[0])),
>                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
>           orig[0] = gimple_assign_lhs (perm);
>           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
>           gimple_assign_set_rhs_with_ops (gsi,  NOP_EXPR,
>                                           orig[0],
>                                           NULL_TREE, NULL_TREE);
>
> Otherwise, scalar insert won't work.
>
> > (this path should already do the correct thing?).  Note that in all
> > cases it looks
> > that with conv_code != ERROR_MARK you may end up doing a float->int
> > or int->float conversion on a value it wasn't done on before which might
> > raise exceptions?  That is, do we need to make sure we permute a
> > value we already do convert into the place we're going to insert to?
>
> This couldn't happen:
>
>       if (type == TREE_TYPE (ref))
>          {
>            /* The RHS vector has the same type as LHS.  */
>            if (rhs_vector == NULL)
>              rhs_vector = ref;
> >            /* Check if all RHS vector elements come from the same
>               vector.  */
>            if (rhs_vector == ref)
>              nvectors++;
>          }
> ...
>   if (insert
>       && (nvectors == 0
>           || (TYPE_VECTOR_SUBPARTS (type).to_constant ()
>               != (nscalars + nvectors))))
>     return false;
>
> > +  if (insert)
> > +    {
> > +      /* Generate a single scalar insert.  */
> > +      tree var = make_ssa_name (type);
> > +      tree val = gimple_assign_rhs1 (stmt);
> > +      gimple *copy = gimple_build_assign (var, val);
> >
> > I believe this doesn't properly copy the stmt in case it is a permute.
> > You can use (note the use of gsi_stmt - gimple_assign_set_rhs_with_ops
> > can re-allocate the stmt)
> >
> >         gimple *copy = gimple_copy (gsi_stmt (*gsi));
> >         gimple_assign_set_lhs (copy, var);
>
> Fixed.
>
> > +      gsi_insert_before (gsi, copy, GSI_SAME_STMT);
> > +      tree bitpos = bitsize_int (scalar_idx * elem_size);
> > +      gimple_assign_set_rhs_with_ops (gsi, BIT_INSERT_EXPR, var,
> > +                                     scalar_element, bitpos);
> > +    }
> >
> > Otherwise looks OK to me.
> >
> > As separate followup patch it might be interesting to support
> >
> >  { 0, a[1], a[2], 3 }
> >
> > kinds as well, thus combining a VECTOR_CST (which is
> > reasonably cheap to create) with another vector.  That should
> > be maybe done as a first patch given this is just a two-vector
> > permute which the code already handles apart from not
> > recognizing the implicit constant vector participating.
> >
> > Similar
> >
> >  { 0, a[1], b[2], 3 }
> >
> > where the combination of a and b is blended with another
> > constant vector.  I'm not sure if handling an arbitrary number
> > of scalar elements should be done in a similar way, that is,
> > implementing
> >
> >  { s1, a[1], a[2], s2, s3, b[0], b[1], b[2] }
> >
> > as
> >
> >   tem = VEC_PERM <a, b, { ... }>
> >   tem2 = { s1, 0, 0, s2, s3, 0, 0, 0 }
> >   res = VEC_PERM <tem, tem2, { blend-mask }>
> >
> > where constructing tem2 should take at most
> > N-1 inserts (the first element to insert into tem2
> > can use a splat or if element zero a zero-extending move).
> >
> > Doing this effectively lifts the restriction of only
> > handling two vectors - we'd incrementally do
> > two-vector permute plus blend of the rest which has
> > its constructor re-processed.
> >
> > But as said - the code is already a bit awkward so changing
> > this in multiple revisions is preferred and the single-element
> > case is certainly sth to do via a BIT_INSERT_EXPR.
>
> Agreed.
>
> I am testing this updated patch.
>

This is the fully tested patch.  I used

+
+      if (useless_type_conversion_p (type, TREE_TYPE (ref)))
+ {
+    /* The RHS vector has the same type as LHS.  */
+    if (rhs_vector == NULL)
+      rhs_vector = ref;
+    /* Check if all RHS vector elements come from the same
+       vector.  */
+    if (rhs_vector == ref)
+      nvectors++;
+ }
+

to support cast between __v4sf and __m128.

-- 
H.J.

[-- Attachment #2: 0001-Optimize-vector-constructor.patch --]
[-- Type: text/x-patch, Size: 36631 bytes --]

From 65c61509182afceb4f1b35839e16adcf5c3503d8 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Tue, 5 Feb 2019 15:39:27 -0800
Subject: [PATCH] Optimize vector constructor

We can optimize a vector constructor into a vector copy or permute
followed by a single scalar insert, replacing:

  __v4sf y;
  __v4sf D.1930;
  float _1;
  float _2;
  float _3;

  <bb 2> :
  _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
  _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
  _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
  y_6 = {f_5(D), _3, _2, _1};
  return y_6;

with

 __v4sf y;
  __v4sf D.1930;
  float _1;
  float _2;
  float _3;
  vector(4) float _8;

  <bb 2> :
  _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
  _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
  _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
  _8 = x_9(D);
  y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
  return y_6;

gcc/

	PR tree-optimization/88828
	* tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
	vector init constructor with vector copy or permute followed
	by a single scalar insert.

gcc/testsuite/

	PR tree-optimization/88828
	* gcc.target/i386/pr88828-1.c: New test.
	* gcc.target/i386/pr88828-1a.c: Likewise.
	* gcc.target/i386/pr88828-1b.c: Likewise.
	* gcc.target/i386/pr88828-1c.c: Likewise.
	* gcc.target/i386/pr88828-2.c: Likewise.
	* gcc.target/i386/pr88828-2a.c: Likewise.
	* gcc.target/i386/pr88828-2b.c: Likewise.
	* gcc.target/i386/pr88828-2c.c: Likewise.
	* gcc.target/i386/pr88828-2d.c: Likewise.
	* gcc.target/i386/pr88828-3.c: Likewise.
	* gcc.target/i386/pr88828-3a.c: Likewise.
	* gcc.target/i386/pr88828-3b.c: Likewise.
	* gcc.target/i386/pr88828-3c.c: Likewise.
	* gcc.target/i386/pr88828-3d.c: Likewise.
	* gcc.target/i386/pr88828-4a.c: Likewise.
	* gcc.target/i386/pr88828-4b.c: Likewise.
	* gcc.target/i386/pr88828-5a.c: Likewise.
	* gcc.target/i386/pr88828-5b.c: Likewise.
	* gcc.target/i386/pr88828-6.c: Likewise.
	* gcc.target/i386/pr88828-6a.c: Likewise.
	* gcc.target/i386/pr88828-6b.c: Likewise.
	* gcc.target/i386/pr88828-7.c: Likewise.
	* gcc.target/i386/pr88828-7a.c: Likewise.
	* gcc.target/i386/pr88828-7b.c: Likewise.
	* gcc.target/i386/pr88828-8.c: Likewise.
	* gcc.target/i386/pr88828-8a.c: Likewise.
	* gcc.target/i386/pr88828-8b.c: Likewise.
	* gcc.target/i386/pr88828-9.c: Likewise.
	* gcc.target/i386/pr88828-9a.c: Likewise.
	* gcc.target/i386/pr88828-9b.c: Likewise.
---
 gcc/testsuite/gcc.target/i386/pr88828-1.c  | 49 +++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-1a.c | 17 ++++
 gcc/testsuite/gcc.target/i386/pr88828-1b.c | 23 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-1c.c | 18 ++++
 gcc/testsuite/gcc.target/i386/pr88828-2.c  | 51 ++++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-2a.c | 17 ++++
 gcc/testsuite/gcc.target/i386/pr88828-2b.c | 19 +++++
 gcc/testsuite/gcc.target/i386/pr88828-2c.c | 23 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-2d.c | 25 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-3.c  | 54 ++++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-3a.c | 17 ++++
 gcc/testsuite/gcc.target/i386/pr88828-3b.c | 19 +++++
 gcc/testsuite/gcc.target/i386/pr88828-3c.c | 25 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-4a.c | 18 ++++
 gcc/testsuite/gcc.target/i386/pr88828-4b.c | 21 +++++
 gcc/testsuite/gcc.target/i386/pr88828-5a.c | 18 ++++
 gcc/testsuite/gcc.target/i386/pr88828-5b.c | 20 +++++
 gcc/testsuite/gcc.target/i386/pr88828-6.c  | 47 +++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-6a.c | 16 ++++
 gcc/testsuite/gcc.target/i386/pr88828-6b.c | 22 +++++
 gcc/testsuite/gcc.target/i386/pr88828-7.c  | 53 ++++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-7a.c | 16 ++++
 gcc/testsuite/gcc.target/i386/pr88828-7b.c | 22 +++++
 gcc/testsuite/gcc.target/i386/pr88828-8.c  | 46 +++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-8a.c | 15 ++++
 gcc/testsuite/gcc.target/i386/pr88828-8b.c | 21 +++++
 gcc/testsuite/gcc.target/i386/pr88828-9.c  | 46 +++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-9a.c | 16 ++++
 gcc/testsuite/gcc.target/i386/pr88828-9b.c | 23 ++++++
 gcc/tree-ssa-forwprop.c                    | 96 +++++++++++++++++++---
 30 files changed, 861 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2d.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-7.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-7a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-7b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-8.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-8a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-8b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-9.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-9a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-9b.c

diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1.c b/gcc/testsuite/gcc.target/i386/pr88828-1.c
new file mode 100644
index 00000000000..a15d1fea3f5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1.c
@@ -0,0 +1,49 @@
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-1a.c"
+#include "pr88828-1b.c"
+#include "pr88828-1c.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf y, float f[4], float z)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 0)
+      {
+	if (y[i] != z)
+	  abort ();
+      }
+    else
+      {
+	if (y[i] != f[i])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f[4] = { -11, 2, 55553, -4 };
+  float z = 134567;
+  __v4sf x = { f[0], f[1], f[2], f[3] };
+  __v4sf y;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f[i])
+      abort ();
+
+  y = foo1 (x, z);
+  do_check (y, f, z);
+  y = foo2 (x, z);
+  do_check (y, f, z);
+  y = foo3 (x, z);
+  do_check (y, f, z);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1a.c b/gcc/testsuite/gcc.target/i386/pr88828-1a.c
new file mode 100644
index 00000000000..d37b24c6661
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1a.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo1 (__v4sf x, float f)
+{
+  __v4sf y = { f, x[1], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1b.c b/gcc/testsuite/gcc.target/i386/pr88828-1b.c
new file mode 100644
index 00000000000..af4aced65f8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1b.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__attribute__((noinline, noclone))
+__v4sf
+foo2 (__v4sf x, float f)
+{
+  return vector_init (f, x[1], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1c.c b/gcc/testsuite/gcc.target/i386/pr88828-1c.c
new file mode 100644
index 00000000000..a117f3ec7b1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1c.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo3 (__v4sf x, float f)
+{
+  __v4sf y = x;
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2.c b/gcc/testsuite/gcc.target/i386/pr88828-2.c
new file mode 100644
index 00000000000..011fd486bb1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2.c
@@ -0,0 +1,51 @@
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-2a.c"
+#include "pr88828-2c.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf y, float f[4], float z)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 0)
+      {
+	if (y[i] != z)
+	  abort ();
+      }
+    else if (i == 1)
+      {
+	if (y[i] != f[0])
+	  abort ();
+      }
+    else
+      {
+	if (y[i] != f[i])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f[4] = { -11, 2, 55553, -4 };
+  float z = 134567;
+  __v4sf x = { f[0], f[1], f[2], f[3] };
+  __v4sf y;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f[i])
+      abort ();
+
+  y = foo1 (x, z);
+  do_check (y, f, z);
+  y = foo2 (x, z);
+  do_check (y, f, z);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2a.c b/gcc/testsuite/gcc.target/i386/pr88828-2a.c
new file mode 100644
index 00000000000..85e49535ebd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2a.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo1 (__v4sf x, float f)
+{
+  __v4sf y = { f, x[0], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2b.c b/gcc/testsuite/gcc.target/i386/pr88828-2b.c
new file mode 100644
index 00000000000..adfd7002a4d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2b.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo1 (__v4sf x, float f)
+{
+  __v4sf y = { f, x[0], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2c.c b/gcc/testsuite/gcc.target/i386/pr88828-2c.c
new file mode 100644
index 00000000000..149967ea0b9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2c.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__attribute__((noinline, noclone))
+__v4sf
+foo2 (__v4sf x, float f)
+{
+  return vector_init (f, x[0], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2d.c b/gcc/testsuite/gcc.target/i386/pr88828-2d.c
new file mode 100644
index 00000000000..21088496730
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2d.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  return vector_init (f, x[0], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3.c b/gcc/testsuite/gcc.target/i386/pr88828-3.c
new file mode 100644
index 00000000000..adbc46dbf3b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3.c
@@ -0,0 +1,54 @@
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-3a.c"
+#include "pr88828-3b.c"
+#include "pr88828-3c.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf y, float f[4], float z)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 3)
+      {
+	if (y[i] != z)
+	  abort ();
+      }
+    else if (i == 0)
+      {
+	if (y[i] != f[i])
+	  abort ();
+      }
+    else
+      {
+	if (y[i] != f[i + 1])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f[4] = { -11, 2, 55553, -4 };
+  float z = 134567;
+  __v4sf x = { f[0], f[1], f[2], f[3] };
+  __v4sf y;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f[i])
+      abort ();
+
+  y = foo1 (x, z);
+  do_check (y, f, z);
+  y = foo2 (x, z);
+  do_check (y, f, z);
+  y = foo3 (x, z);
+  do_check (y, f, z);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3a.c b/gcc/testsuite/gcc.target/i386/pr88828-3a.c
new file mode 100644
index 00000000000..e5cb95c1275
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3a.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 2 } } */
+/* { dg-final { scan-assembler-times "movaps" 1 } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo1 (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], f };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3b.c b/gcc/testsuite/gcc.target/i386/pr88828-3b.c
new file mode 100644
index 00000000000..0349f35b08a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3b.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
+/* { dg-final { scan-assembler-not "vmovss" } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo2 (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], f };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3c.c b/gcc/testsuite/gcc.target/i386/pr88828-3c.c
new file mode 100644
index 00000000000..fb668a55f1d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3c.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
+/* { dg-final { scan-assembler-not "vmovss" } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__attribute__((noinline, noclone))
+__v4sf
+foo3 (__v4sf x, float f)
+{
+  return vector_init (x[0], x[2], x[3], f);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4a.c b/gcc/testsuite/gcc.target/i386/pr88828-4a.c
new file mode 100644
index 00000000000..64043b9855f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-4a.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[1] };
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4b.c b/gcc/testsuite/gcc.target/i386/pr88828-4b.c
new file mode 100644
index 00000000000..ad8d2b985d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-4b.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[1] };
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5a.c b/gcc/testsuite/gcc.target/i386/pr88828-5a.c
new file mode 100644
index 00000000000..5e908faef5c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-5a.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 2 } } */
+/* { dg-final { scan-assembler-times "movaps" 1 } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[0] };
+  y[3] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5b.c b/gcc/testsuite/gcc.target/i386/pr88828-5b.c
new file mode 100644
index 00000000000..988a48823e6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-5b.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovss" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[0] };
+  y[3] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6.c b/gcc/testsuite/gcc.target/i386/pr88828-6.c
new file mode 100644
index 00000000000..8d920396896
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-6.c
@@ -0,0 +1,47 @@
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-6a.c"
+#include "pr88828-6b.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf x, float f1[4], float f2[4])
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 0)
+      {
+	if (x[i] != (f1[i] + f2[i]))
+	  abort ();
+      }
+    else
+      {
+	if (x[i] != f1[i])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f1[4] = { -11, 2, 55553, -4 };
+  float f2[4] = { 111, 3.3, -55.553, 4.8 };
+  __v4sf x = { f1[0], f1[1], f1[2], f1[3] };
+  __v4sf y = { f2[0], f2[1], f2[2], f2[3] };
+  __v4sf z;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f1[i] || y[i] != f2[i] )
+      abort ();
+
+  z = foo1 (x, y);
+  do_check (z, f1, f2);
+  z = foo2 (x, y);
+  do_check (z, f1, f2);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6a.c b/gcc/testsuite/gcc.target/i386/pr88828-6a.c
new file mode 100644
index 00000000000..4094f25a1fb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-6a.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "addss" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo1 (__v4sf x, __v4sf y)
+{
+  __v4sf z = { x[0] + y[0], x[1], x[2], x[3] };
+  return z;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6b.c b/gcc/testsuite/gcc.target/i386/pr88828-6b.c
new file mode 100644
index 00000000000..a423a089963
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-6b.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "addss" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__v4sf
+foo2 (__v4sf x, __v4sf y)
+{
+  return vector_init (x[0] + y[0], x[1], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-7.c b/gcc/testsuite/gcc.target/i386/pr88828-7.c
new file mode 100644
index 00000000000..471028d417d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-7.c
@@ -0,0 +1,53 @@
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-7a.c"
+#include "pr88828-7b.c"
+
+extern void abort ();
+
+float
+bar (float x, float y)
+{
+  return x / y - y * x;
+}
+
+void
+do_check (__v4sf x, float f1[4], float f2[4])
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 0)
+      {
+	if (x[i] != bar (f1[i], f2[i]))
+	  abort ();
+      }
+    else
+      {
+	if (x[i] != f1[i])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f1[4] = { -11, 2, 55553, -4 };
+  float f2[4] = { 111, 3.3, -55.553, 4.8 };
+  __v4sf x = { f1[0], f1[1], f1[2], f1[3] };
+  __v4sf y = { f2[0], f2[1], f2[2], f2[3] };
+  __v4sf z;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f1[i] || y[i] != f2[i] )
+      abort ();
+
+  z = foo1 (x, y);
+  do_check (z, f1, f2);
+  z = foo2 (x, y);
+  do_check (z, f1, f2);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-7a.c b/gcc/testsuite/gcc.target/i386/pr88828-7a.c
new file mode 100644
index 00000000000..f1ae57422d9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-7a.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+extern float bar (float, float);
+
+__v4sf
+foo1 (__v4sf x, __v4sf y)
+{
+  __v4sf z = { bar (x[0], y[0]), x[1], x[2], x[3] };
+  return z;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-7b.c b/gcc/testsuite/gcc.target/i386/pr88828-7b.c
new file mode 100644
index 00000000000..c027c56948d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-7b.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+extern float bar (float, float);
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__v4sf
+foo2 (__v4sf x, __v4sf y)
+{
+  return vector_init (bar (x[0], y[0]), x[1], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-8.c b/gcc/testsuite/gcc.target/i386/pr88828-8.c
new file mode 100644
index 00000000000..3b8eabd225f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-8.c
@@ -0,0 +1,46 @@
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-8a.c"
+#include "pr88828-8b.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf y, float f[4], float z)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 0)
+      {
+	if (y[i] != z)
+	  abort ();
+      }
+    else
+      {
+	if (y[i] != f[i])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f[4] = { -11, 2, 55553, -4 };
+  float z = 11.4;
+  __v4sf x = { f[0], f[1], f[2], f[3] };
+  __v4sf y;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f[i])
+      abort ();
+
+  y = foo1 (x);
+  do_check (y, f, z);
+  y = foo2 (x);
+  do_check (y, f, z);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-8a.c b/gcc/testsuite/gcc.target/i386/pr88828-8a.c
new file mode 100644
index 00000000000..5d383dfd081
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-8a.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo1 (__v4sf x)
+{
+  __v4sf z = { 11.4, x[1], x[2], x[3] };
+  return z;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-8b.c b/gcc/testsuite/gcc.target/i386/pr88828-8b.c
new file mode 100644
index 00000000000..5ffbc9c3103
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-8b.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__v4sf
+foo2 (__v4sf x)
+{
+  return vector_init (11.4, x[1], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-9.c b/gcc/testsuite/gcc.target/i386/pr88828-9.c
new file mode 100644
index 00000000000..c33907b4a6f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-9.c
@@ -0,0 +1,46 @@
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-9a.c"
+#include "pr88828-9b.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf y, float f[4], float z)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 0)
+      {
+	if (y[i] != z)
+	  abort ();
+      }
+    else
+      {
+	if (y[i] != f[i])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f[4] = { -11, 2, 55553, -4 };
+  float z = 11.4;
+  __m128 x = (__m128) (__v4sf) { f[0], f[1], f[2], f[3] };
+  __m128 y;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f[i])
+      abort ();
+
+  y = foo1 (x);
+  do_check (y, f, z);
+  y = foo2 (x);
+  do_check (y, f, z);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-9a.c b/gcc/testsuite/gcc.target/i386/pr88828-9a.c
new file mode 100644
index 00000000000..7f830657732
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-9a.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
+
+__m128
+foo1 (__m128 x)
+{
+  __v4sf z = { 11.4, ((__v4sf) x)[1], ((__v4sf) x)[2], ((__v4sf) x) [3] };
+  return (__m128) z;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-9b.c b/gcc/testsuite/gcc.target/i386/pr88828-9b.c
new file mode 100644
index 00000000000..6588ad15a9b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-9b.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
+
+static __m128
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return (__m128) y;
+}
+
+__m128
+foo2 (__m128 x)
+{
+  return vector_init (11.4, ((__v4sf) x)[1], ((__v4sf) x)[2],
+		      ((__v4sf) x) [3]);
+}
diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c
index eeb6281c652..32a3af5687e 100644
--- a/gcc/tree-ssa-forwprop.c
+++ b/gcc/tree-ssa-forwprop.c
@@ -2008,7 +2008,7 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
   unsigned elem_size, i;
   unsigned HOST_WIDE_INT nelts;
   enum tree_code code, conv_code;
-  constructor_elt *elt;
+  constructor_elt *ce;
   bool maybe_ident;
 
   gcc_checking_assert (gimple_assign_rhs_code (stmt) == CONSTRUCTOR);
@@ -2027,18 +2027,38 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
   orig[1] = NULL;
   conv_code = ERROR_MARK;
   maybe_ident = true;
-  FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, elt)
+
+  tree rhs_vector = NULL;
+  /* The single scalar element.  */
+  tree scalar_element = NULL;
+  unsigned int scalar_idx = 0;
+  bool insert = false;
+  unsigned int nscalars = 0;
+  unsigned int nvectors = 0;
+  FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, ce)
     {
       tree ref, op1;
 
       if (i >= nelts)
 	return false;
 
-      if (TREE_CODE (elt->value) != SSA_NAME)
-	return false;
-      def_stmt = get_prop_source_stmt (elt->value, false, NULL);
+      if (TREE_CODE (ce->value) == SSA_NAME)
+	def_stmt = get_prop_source_stmt (ce->value, false, NULL);
+      else
+	def_stmt = NULL;
       if (!def_stmt)
-	return false;
+	{
+	  /* Only allow one scalar insert.  */
+	  if (nscalars != 0)
+	    return false;
+
+	  nscalars = 1;
+	  insert = true;
+	  scalar_idx = i;
+	  sel.quick_push (i);
+	  scalar_element = ce->value;
+	  continue;
+	}
       code = gimple_assign_rhs_code (def_stmt);
       if (code == FLOAT_EXPR
 	  || code == FIX_TRUNC_EXPR)
@@ -2046,7 +2066,7 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	  op1 = gimple_assign_rhs1 (def_stmt);
 	  if (conv_code == ERROR_MARK)
 	    {
-	      if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (elt->value))),
+	      if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (ce->value))),
 			    GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1)))))
 		return false;
 	      conv_code = code;
@@ -2061,7 +2081,18 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	  code = gimple_assign_rhs_code (def_stmt);
 	}
       if (code != BIT_FIELD_REF)
-	return false;
+	{
+	  /* Only allow one scalar insert.  */
+	  if (nscalars != 0)
+	    return false;
+
+	  nscalars = 1;
+	  insert = true;
+	  scalar_idx = i;
+	  sel.quick_push (i);
+	  scalar_element = ce->value;
+	  continue;
+	}
       op1 = gimple_assign_rhs1 (def_stmt);
       ref = TREE_OPERAND (op1, 0);
       unsigned int j;
@@ -2095,11 +2126,29 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	elt += nelts;
       if (elt != i)
 	maybe_ident = false;
+
+      if (useless_type_conversion_p (type, TREE_TYPE (ref)))
+	 {
+	   /* The RHS vector has the same type as LHS.  */
+	   if (rhs_vector == NULL)
+	     rhs_vector = ref;
+	   /* Check if all RHS vector elements come from the same
+	      vector.  */
+	   if (rhs_vector == ref)
+	     nvectors++;
+	 }
+
       sel.quick_push (elt);
     }
   if (i < nelts)
     return false;
 
+  if (insert
+      && (nvectors == 0
+	  || (TYPE_VECTOR_SUBPARTS (type).to_constant ()
+	      != (nscalars + nvectors))))
+    return false;
+
   if (! VECTOR_TYPE_P (TREE_TYPE (orig[0]))
       || maybe_ne (TYPE_VECTOR_SUBPARTS (type),
 		   TYPE_VECTOR_SUBPARTS (TREE_TYPE (orig[0]))))
@@ -2127,18 +2176,26 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 
       vec_perm_indices indices (sel, orig[1] ? 2 : 1, nelts);
       if (!can_vec_perm_const_p (TYPE_MODE (type), indices))
-	return false;
+	{
+	  if (insert)
+	    gcc_unreachable ();
+	  return false;
+	}
       mask_type
 	= build_vector_type (build_nonstandard_integer_type (elem_size, 1),
 			     nelts);
       if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_VECTOR_INT
 	  || maybe_ne (GET_MODE_SIZE (TYPE_MODE (mask_type)),
 		       GET_MODE_SIZE (TYPE_MODE (type))))
-	return false;
+	{
+	  if (insert)
+	    gcc_unreachable ();
+	  return false;
+	}
       op2 = vec_perm_indices_to_tree (mask_type, indices);
       if (!orig[1])
 	orig[1] = orig[0];
-      if (conv_code == ERROR_MARK)
+      if (conv_code == ERROR_MARK && !insert)
 	gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
 					orig[1], op2);
       else
@@ -2148,10 +2205,25 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 				   VEC_PERM_EXPR, orig[0], orig[1], op2);
 	  orig[0] = gimple_assign_lhs (perm);
 	  gsi_insert_before (gsi, perm, GSI_SAME_STMT);
-	  gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
+	  gimple_assign_set_rhs_with_ops (gsi,
+					  (conv_code != ERROR_MARK
+					   ? conv_code
+					   : NOP_EXPR),
+					  orig[0],
 					  NULL_TREE, NULL_TREE);
 	}
     }
+  if (insert)
+    {
+      /* Generate a single scalar insert.  */
+      tree var = make_ssa_name (type);
+      gimple *copy = gimple_copy (gsi_stmt (*gsi));
+      gimple_assign_set_lhs (copy, var);
+      gsi_insert_before (gsi, copy, GSI_SAME_STMT);
+      tree bitpos = bitsize_int (scalar_idx * elem_size);
+      gimple_assign_set_rhs_with_ops (gsi, BIT_INSERT_EXPR, var,
+				      scalar_element, bitpos);
+    }
   update_stmt (gsi_stmt (*gsi));
   return true;
 }
-- 
2.20.1


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: V5 [PATCH] Optimize vector constructor
  2019-03-08  9:56               ` V5 " H.J. Lu
@ 2019-03-08 11:23                 ` Richard Biener
  2019-03-11  7:58                   ` H.J. Lu
  0 siblings, 1 reply; 17+ messages in thread
From: Richard Biener @ 2019-03-08 11:23 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Hongtao Liu, Andrew Pinski, GCC Patches

On Fri, Mar 8, 2019 at 9:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, Mar 7, 2019 at 9:51 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Wed, Mar 6, 2019 at 8:33 PM Richard Biener
> > <richard.guenther@gmail.com> wrote:
> > >
> > > On Wed, Mar 6, 2019 at 8:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Tue, Mar 5, 2019 at 1:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > On Mon, Mar 04, 2019 at 12:55:04PM +0100, Richard Biener wrote:
> > > > > > On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > >
> > > > > > > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:
> > > > > > > > On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > >
> > > > > > > > > For vector init constructor:
> > > > > > > > >
> > > > > > > > > ---
> > > > > > > > > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> > > > > > > > >
> > > > > > > > > __v4sf
> > > > > > > > > foo (__v4sf x, float f)
> > > > > > > > > {
> > > > > > > > >   __v4sf y = { f, x[1], x[2], x[3] };
> > > > > > > > >   return y;
> > > > > > > > > }
> > > > > > > > > ---
> > > > > > > > >
> > > > > > > > > we can optimize vector init constructor with vector copy or permute
> > > > > > > > > followed by a single scalar insert:
> > > > >
> > > > > > and you want to advance to the _1 = BIT_INSERT_EXPR here.  The easiest way
> > > > > > is to emit a new stmt for _2 = copy ...; and do the set_rhs with the
> > > > > > BIT_INSERT_EXPR.
> > > > >
> > > > > Thanks for BIT_INSERT_EXPR suggestion.  I am testing this patch.
> > > > >
> > > > >
> > > > > H.J.
> > > > > ---
> > > > > We can optimize vector constructor with vector copy or permute followed
> > > > > by a single scalar insert:
> > > > >
> > > > >   __v4sf y;
> > > > >   __v4sf D.1930;
> > > > >   float _1;
> > > > >   float _2;
> > > > >   float _3;
> > > > >
> > > > >   <bb 2> :
> > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > >   y_6 = {f_5(D), _3, _2, _1};
> > > > >   return y_6;
> > > > >
> > > > > with
> > > > >
> > > > >  __v4sf y;
> > > > >   __v4sf D.1930;
> > > > >   float _1;
> > > > >   float _2;
> > > > >   float _3;
> > > > >   vector(4) float _8;
> > > > >
> > > > >   <bb 2> :
> > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > >   _8 = x_9(D);
> > > > >   y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
> > > > >   return y_6;
> > > > >
> > > > > gcc/
> > > > >
> > > > >         PR tree-optimization/88828
> > > > >         * tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
> > > > >         vector init constructor with vector copy or permute followed
> > > > >         by a single scalar insert.
> > > > >
> > > > > gcc/testsuite/
> > > > >
> > > > >         PR tree-optimization/88828
> > > > >         * gcc.target/i386/pr88828-1a.c: New test.
> > > > >         * gcc.target/i386/pr88828-2b.c: Likewise.
> > > > >         * gcc.target/i386/pr88828-2.c: Likewise.
> > > > >         * gcc.target/i386/pr88828-3a.c: Likewise.
> > > > >         * gcc.target/i386/pr88828-3b.c: Likewise.
> > > > >         * gcc.target/i386/pr88828-3c.c: Likewise.
> > > > >         * gcc.target/i386/pr88828-3d.c: Likewise.
> > > > >         * gcc.target/i386/pr88828-4a.c: Likewise.
> > > > >         * gcc.target/i386/pr88828-4b.c: Likewise.
> > > > >         * gcc.target/i386/pr88828-5a.c: Likewise.
> > > > >         * gcc.target/i386/pr88828-5b.c: Likewise.
> > > > >         * gcc.target/i386/pr88828-6a.c: Likewise.
> > > > >         * gcc.target/i386/pr88828-6b.c: Likewise.
> > > >
> > > > Here is the updated patch with run-time tests.
> > >
> > > -      if (TREE_CODE (elt->value) != SSA_NAME)
> > > +      if (TREE_CODE (ce->value) != SSA_NAME)
> > >         return false;
> > >
> > > hmm, so it doesn't allow { 0, v[1], v[2], v[3] }?  I think the single
> > > scalar value can be a constant as well.
> >
> > Fixed.
> >
> > >        if (!def_stmt)
> > > -       return false;
> > > +       {
> > > +         if (gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))
> > >
> > > if (SSA_NAME_IS_DEFAULT_DEF (ce->value))
> > >
> > > +           {
> > >
> > > also you seem to disallow
> > >
> > >   { i + 1, v[1], v[2], v[3] }
> >
> > Fixed by
> >
> >      if (code != BIT_FIELD_REF)
> >         {
> >           /* Only allow one scalar insert.  */
> >           if (nscalars != 0)
> >             return false;
> >
> >           nscalars = 1;
> >           insert = true;
> >           scalar_idx = i;
> >           sel.quick_push (i);
> >           scalar_element = ce->value;
> >           continue;
> >         }
> >
> > > because get_prop_source_stmt will return the definition computing
> > > i + 1 in this case and your code will be skipped?
> > >
> > > I think you can simplify the code by treating scalar_element != NULL
> > > as nscalars == 1 and eliding nscalars.
> >
> > It works only if
> >
> > TYPE_VECTOR_SUBPARTS (type).to_constant ()  == (nscalars + nvectors)
> >
> > We need to check both nscalars and nvectors.  Eliding the nscalars
> > check doesn't help much here.
> >
> > > -      if (conv_code == ERROR_MARK)
> > > +      if (conv_code == ERROR_MARK && !insert)
> > >         gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
> > >                                         orig[1], op2);
> > >        else
> > > @@ -2148,10 +2198,25 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
> > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > >           orig[0] = gimple_assign_lhs (perm);
> > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > -         gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
> > > +         gimple_assign_set_rhs_with_ops (gsi,
> > > +                                         (conv_code != ERROR_MARK
> > > +                                          ? conv_code
> > > +                                          : NOP_EXPR),
> > > +                                         orig[0],
> > >                                           NULL_TREE, NULL_TREE);
> > >
> > > I believe you should elide the last stmt for conv_code == ERROR_MARK,
> > > that is, why did you need to add the && !insert check in the guarding condition
> >
> > When conv_code == ERROR_MARK, we still need
> >
> >        gimple *perm
> >             = gimple_build_assign (make_ssa_name (TREE_TYPE (orig[0])),
> >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> >           orig[0] = gimple_assign_lhs (perm);
> >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> >           gimple_assign_set_rhs_with_ops (gsi,  NOP_EXPR,
> >                                           orig[0],
> >                                           NULL_TREE, NULL_TREE);
> >
> > Otherwise, scalar insert won't work.
> >
> > > (this path should already do the correct thing?).  Note that in all
> > > cases it looks
> > > that with conv_code != ERROR_MARK you may end up doing a float->int
> > > or int->float conversion on a value it wasn't done on before which might
> > > raise exceptions?  That is, do we need to make sure we permute a
> > > value we already do convert into the place we're going to insert to?
> >
> > This couldn't happen:
> >
> >       if (type == TREE_TYPE (ref))
> >          {
> >            /* The RHS vector has the same type as LHS.  */
> >            if (rhs_vector == NULL)
> >              rhs_vector = ref;
> >            /* Check if all RHS vector elements come from the same
> >               vector.  */
> >            if (rhs_vector == ref)
> >              nvectors++;
> >          }
> > ...
> >   if (insert
> >       && (nvectors == 0
> >           || (TYPE_VECTOR_SUBPARTS (type).to_constant ()
> >               != (nscalars + nvectors))))
> >     return false;

I see - that looks like a missed case then?

 { 1., (float)v[1], (float)v[2], (float)v[3] }

with integer vector v?

I'll have a look at the full patch next week (it's GCC 10 material in any case).

Richard.

> > > +  if (insert)
> > > +    {
> > > +      /* Generate a single scalar insert.  */
> > > +      tree var = make_ssa_name (type);
> > > +      tree val = gimple_assign_rhs1 (stmt);
> > > +      gimple *copy = gimple_build_assign (var, val);
> > >
> > > I believe this doesn't properly copy the stmt in case it is a permute.
> > > You can use (note the use of gsi_stmt - gimple_assign_set_rhs_with_ops
> > > can re-allocate the stmt)
> > >
> > >         gimple *copy = gimple_copy (gsi_stmt (*gsi));
> > >         gimple_assign_set_lhs (copy, var);
> >
> > Fixed.
> >
> > > +      gsi_insert_before (gsi, copy, GSI_SAME_STMT);
> > > +      tree bitpos = bitsize_int (scalar_idx * elem_size);
> > > +      gimple_assign_set_rhs_with_ops (gsi, BIT_INSERT_EXPR, var,
> > > +                                     scalar_element, bitpos);
> > > +    }
> > >
> > > Otherwise looks OK to me.
> > >
> > > As separate followup patch it might be interesting to support
> > >
> > >  { 0, a[1], a[2], 3 }
> > >
> > > kinds as well, thus combining a VECTOR_CST (which is
> > > reasonably cheap to create) with another vector.  That should
> > > be maybe done as a first patch given this is just a two-vector
> > > permute which the code already handles apart from not
> > > recognizing the implicit constant vector participating.
> > >
> > > Similar
> > >
> > >  { 0, a[1], b[2], 3 }
> > >
> > > where the combination of a and b is blended with another
> > > constant vector.  I'm not sure if handling an arbitrary number
> > > of scalar elements should be done in a similar way, that is,
> > > implementing
> > >
> > >  { s1, a[1], a[2], s2, s3, b[0], b[1], b[2] }
> > >
> > > as
> > >
> > >   tem = VEC_PERM <a, b, { ... }>
> > >   tem2 = { s1, 0, 0, s2, s3, 0, 0, 0 }
> > >   res = VEC_PERM <tem, tem2, { blend-mask }>
> > >
> > > where constructing tem2 should take at most
> > > N-1 inserts (the first element to insert into tem2
> > > can use a splat or if element zero a zero-extending move).
> > >
> > > Doing this effectively lifts the restriction of only
> > > handling two vectors - we'd incrementally do
> > > two-vector permute plus blend of the rest which has
> > > its constructor re-processed.
> > >
> > > But as said - the code is already a bit awkward so changing
> > > this in multiple revisions is preferred and the single-element
> > > case is certainly sth to do via a BIT_INSERT_EXPR.
> >
> > Agreed.
> >
> > I am testing this updated patch.
> >
>
> This is the fully tested patch.  I used
>
> +
> +      if (useless_type_conversion_p (type, TREE_TYPE (ref)))
> + {
> +    /* The RHS vector has the same type as LHS.  */
> +    if (rhs_vector == NULL)
> +      rhs_vector = ref;
> +    /* Check if all RHS vector elements come from the same
> +       vector.  */
> +    if (rhs_vector == ref)
> +      nvectors++;
> + }
> +
>
> to support cast between __v4sf and __m128.
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: V5 [PATCH] Optimize vector constructor
  2019-03-08 11:23                 ` Richard Biener
@ 2019-03-11  7:58                   ` H.J. Lu
  2019-05-02 14:54                     ` Richard Biener
  0 siblings, 1 reply; 17+ messages in thread
From: H.J. Lu @ 2019-03-11  7:58 UTC (permalink / raw)
  To: Richard Biener; +Cc: Hongtao Liu, Andrew Pinski, GCC Patches

On Fri, Mar 8, 2019 at 7:03 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Fri, Mar 8, 2019 at 9:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Thu, Mar 7, 2019 at 9:51 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Wed, Mar 6, 2019 at 8:33 PM Richard Biener
> > > <richard.guenther@gmail.com> wrote:
> > > >
> > > > On Wed, Mar 6, 2019 at 8:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > On Tue, Mar 5, 2019 at 1:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > >
> > > > > > On Mon, Mar 04, 2019 at 12:55:04PM +0100, Richard Biener wrote:
> > > > > > > On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > >
> > > > > > > > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:
> > > > > > > > > On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > >
> > > > > > > > > > For vector init constructor:
> > > > > > > > > >
> > > > > > > > > > ---
> > > > > > > > > > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> > > > > > > > > >
> > > > > > > > > > __v4sf
> > > > > > > > > > foo (__v4sf x, float f)
> > > > > > > > > > {
> > > > > > > > > >   __v4sf y = { f, x[1], x[2], x[3] };
> > > > > > > > > >   return y;
> > > > > > > > > > }
> > > > > > > > > > ---
> > > > > > > > > >
> > > > > > > > > > we can optimize vector init constructor with vector copy or permute
> > > > > > > > > > followed by a single scalar insert:
> > > > > >
> > > > > > > and you want to advance to the _1 = BIT_INSERT_EXPR here.  The easiest way
> > > > > > > is to emit a new stmt for _2 = copy ...; and do the set_rhs with the
> > > > > > > BIT_INSERT_EXPR.
> > > > > >
> > > > > > Thanks for BIT_INSERT_EXPR suggestion.  I am testing this patch.
> > > > > >
> > > > > >
> > > > > > H.J.
> > > > > > ---
> > > > > > We can optimize vector constructor with vector copy or permute followed
> > > > > > by a single scalar insert:
> > > > > >
> > > > > >   __v4sf y;
> > > > > >   __v4sf D.1930;
> > > > > >   float _1;
> > > > > >   float _2;
> > > > > >   float _3;
> > > > > >
> > > > > >   <bb 2> :
> > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > >   y_6 = {f_5(D), _3, _2, _1};
> > > > > >   return y_6;
> > > > > >
> > > > > > with
> > > > > >
> > > > > >  __v4sf y;
> > > > > >   __v4sf D.1930;
> > > > > >   float _1;
> > > > > >   float _2;
> > > > > >   float _3;
> > > > > >   vector(4) float _8;
> > > > > >
> > > > > >   <bb 2> :
> > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > >   _8 = x_9(D);
> > > > > >   y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
> > > > > >   return y_6;
> > > > > >
> > > > > > gcc/
> > > > > >
> > > > > >         PR tree-optimization/88828
> > > > > >         * tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
> > > > > >         vector init constructor with vector copy or permute followed
> > > > > >         by a single scalar insert.
> > > > > >
> > > > > > gcc/testsuite/
> > > > > >
> > > > > >         PR tree-optimization/88828
> > > > > >         * gcc.target/i386/pr88828-1a.c: New test.
> > > > > >         * gcc.target/i386/pr88828-2b.c: Likewise.
> > > > > >         * gcc.target/i386/pr88828-2.c: Likewise.
> > > > > >         * gcc.target/i386/pr88828-3a.c: Likewise.
> > > > > >         * gcc.target/i386/pr88828-3b.c: Likewise.
> > > > > >         * gcc.target/i386/pr88828-3c.c: Likewise.
> > > > > >         * gcc.target/i386/pr88828-3d.c: Likewise.
> > > > > >         * gcc.target/i386/pr88828-4a.c: Likewise.
> > > > > >         * gcc.target/i386/pr88828-4b.c: Likewise.
> > > > > >         * gcc.target/i386/pr88828-5a.c: Likewise.
> > > > > >         * gcc.target/i386/pr88828-5b.c: Likewise.
> > > > > >         * gcc.target/i386/pr88828-6a.c: Likewise.
> > > > > >         * gcc.target/i386/pr88828-6b.c: Likewise.
> > > > >
> > > > > Here is the updated patch with run-time tests.
> > > >
> > > > -      if (TREE_CODE (elt->value) != SSA_NAME)
> > > > +      if (TREE_CODE (ce->value) != SSA_NAME)
> > > >         return false;
> > > >
> > > > hmm, so it doesn't allow { 0, v[1], v[2], v[3] }?  I think the single
> > > > scalar value can be a constant as well.
> > >
> > > Fixed.
> > >
> > > >        if (!def_stmt)
> > > > -       return false;
> > > > +       {
> > > > +         if (gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))
> > > >
> > > > if (SSA_NAME_IS_DEFAULT_DEF (ce->value))
> > > >
> > > > +           {
> > > >
> > > > also you seem to disallow
> > > >
> > > >   { i + 1, v[1], v[2], v[3] }
> > >
> > > Fixed by
> > >
> > >      if (code != BIT_FIELD_REF)
> > >         {
> > >           /* Only allow one scalar insert.  */
> > >           if (nscalars != 0)
> > >             return false;
> > >
> > >           nscalars = 1;
> > >           insert = true;
> > >           scalar_idx = i;
> > >           sel.quick_push (i);
> > >           scalar_element = ce->value;
> > >           continue;
> > >         }
> > >
> > > > because get_prop_source_stmt will return the definition computing
> > > > i + 1 in this case and your code will be skipped?
> > > >
> > > > I think you can simplify the code by treating scalar_element != NULL
> > > > as nscalars == 1 and eliding nscalars.
> > >
> > > It works only if
> > >
> > > TYPE_VECTOR_SUBPARTS (type).to_constant ()  == (nscalars + nvectors)
> > >
> > > We need to check both nscalars and nvectors.  Eliding the nscalars
> > > check doesn't help much here.
> > >
> > > > -      if (conv_code == ERROR_MARK)
> > > > +      if (conv_code == ERROR_MARK && !insert)
> > > >         gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
> > > >                                         orig[1], op2);
> > > >        else
> > > > @@ -2148,10 +2198,25 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
> > > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > > >           orig[0] = gimple_assign_lhs (perm);
> > > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > > -         gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
> > > > +         gimple_assign_set_rhs_with_ops (gsi,
> > > > +                                         (conv_code != ERROR_MARK
> > > > +                                          ? conv_code
> > > > +                                          : NOP_EXPR),
> > > > +                                         orig[0],
> > > >                                           NULL_TREE, NULL_TREE);
> > > >
> > > > I believe you should elide the last stmt for conv_code == ERROR_MARK,
> > > > that is, why did you need to add the && !insert check in the guarding condition
> > >
> > > When conv_code == ERROR_MARK, we still need
> > >
> > >        gimple *perm
> > >             = gimple_build_assign (make_ssa_name (TREE_TYPE (orig[0])),
> > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > >           orig[0] = gimple_assign_lhs (perm);
> > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > >           gimple_assign_set_rhs_with_ops (gsi,  NOP_EXPR,
> > >                                           orig[0],
> > >                                           NULL_TREE, NULL_TREE);
> > >
> > > Otherwise, scalar insert won't work.
> > >
> > > > (this path should already do the correct thing?).  Note that in all
> > > > cases it looks
> > > > that with conv_code != ERROR_MARK you may end up doing a float->int
> > > > or int->float conversion on a value it wasn't done on before which might
> > > > raise exceptions?  That is, do we need to make sure we permute a
> > > > value we already do convert into the place we're going to insert to?
> > >
> > > This couldn't happen:
> > >
> > >       if (type == TREE_TYPE (ref))
> > >          {
> > >            /* The RHS vector has the same type as LHS.  */
> > >            if (rhs_vector == NULL)
> > >              rhs_vector = ref;
> > >            /* Check if all RHS vector elements come from the same
> > >               vector.  */
> > >            if (rhs_vector == ref)
> > >              nvectors++;
> > >          }
> > > ...
> > >   if (insert
> > >       && (nvectors == 0
> > >           || (TYPE_VECTOR_SUBPARTS (type).to_constant ()
> > >               != (nscalars + nvectors))))
> > >     return false;
>
> I see - that looks like a missed case then?
>
>  { 1., (float)v[1], (float)v[2], (float)v[3] }
>
> with integer vector v?

True.

> I'll have a look at the full patch next week (it's GCC 10 material in any case).
>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: V5 [PATCH] Optimize vector constructor
  2019-03-11  7:58                   ` H.J. Lu
@ 2019-05-02 14:54                     ` Richard Biener
  2019-05-02 14:55                       ` Richard Biener
  0 siblings, 1 reply; 17+ messages in thread
From: Richard Biener @ 2019-05-02 14:54 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Hongtao Liu, Andrew Pinski, GCC Patches

On Mon, Mar 11, 2019 at 8:03 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Mar 8, 2019 at 7:03 PM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Fri, Mar 8, 2019 at 9:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Thu, Mar 7, 2019 at 9:51 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Wed, Mar 6, 2019 at 8:33 PM Richard Biener
> > > > <richard.guenther@gmail.com> wrote:
> > > > >
> > > > > On Wed, Mar 6, 2019 at 8:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > >
> > > > > > On Tue, Mar 5, 2019 at 1:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > >
> > > > > > > On Mon, Mar 04, 2019 at 12:55:04PM +0100, Richard Biener wrote:
> > > > > > > > On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > >
> > > > > > > > > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:
> > > > > > > > > > )
> > > > > > > > > > ,On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > > >
> > > > > > > > > > > For vector init constructor:
> > > > > > > > > > >
> > > > > > > > > > > ---
> > > > > > > > > > > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> > > > > > > > > > >
> > > > > > > > > > > __v4sf
> > > > > > > > > > > foo (__v4sf x, float f)
> > > > > > > > > > > {
> > > > > > > > > > >   __v4sf y = { f, x[1], x[2], x[3] };
> > > > > > > > > > >   return y;
> > > > > > > > > > > }
> > > > > > > > > > > ---
> > > > > > > > > > >
> > > > > > > > > > > we can optimize vector init constructor with vector copy or permute
> > > > > > > > > > > followed by a single scalar insert:
> > > > > > >
> > > > > > > > and you want to advance to the _1 = BIT_INSERT_EXPR here.  The easiest way
> > > > > > > > is to emit a new stmt for _2 = copy ...; and do the set_rhs with the
> > > > > > > > BIT_INSERT_EXPR.
> > > > > > >
> > > > > > > Thanks for BIT_INSERT_EXPR suggestion.  I am testing this patch.
> > > > > > >
> > > > > > >
> > > > > > > H.J.
> > > > > > > ---
> > > > > > > We can optimize vector constructor with vector copy or permute followed
> > > > > > > by a single scalar insert:
> > > > > > >
> > > > > > >   __v4sf y;
> > > > > > >   __v4sf D.1930;
> > > > > > >   float _1;
> > > > > > >   float _2;
> > > > > > >   float _3;
> > > > > > >
> > > > > > >   <bb 2> :
> > > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > > >   y_6 = {f_5(D), _3, _2, _1};
> > > > > > >   return y_6;
> > > > > > >
> > > > > > > with
> > > > > > >
> > > > > > >  __v4sf y;
> > > > > > >   __v4sf D.1930;
> > > > > > >   float _1;
> > > > > > >   float _2;
> > > > > > >   float _3;
> > > > > > >   vector(4) float _8;
> > > > > > >
> > > > > > >   <bb 2> :
> > > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > > >   _8 = x_9(D);
> > > > > > >   y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
> > > > > > >   return y_6;
> > > > > > >
> > > > > > > gcc/
> > > > > > >
> > > > > > >         PR tree-optimization/88828
> > > > > > >         * tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
> > > > > > >         vector init constructor with vector copy or permute followed
> > > > > > >         by a single scalar insert.
> > > > > > >
> > > > > > > gcc/testsuite/
> > > > > > >
> > > > > > >         PR tree-optimization/88828
> > > > > > >         * gcc.target/i386/pr88828-1a.c: New test.
> > > > > > >         * gcc.target/i386/pr88828-2b.c: Likewise.
> > > > > > >         * gcc.target/i386/pr88828-2.c: Likewise.
> > > > > > >         * gcc.target/i386/pr88828-3a.c: Likewise.
> > > > > > >         * gcc.target/i386/pr88828-3b.c: Likewise.
> > > > > > >         * gcc.target/i386/pr88828-3c.c: Likewise.
> > > > > > >         * gcc.target/i386/pr88828-3d.c: Likewise.
> > > > > > >         * gcc.target/i386/pr88828-4a.c: Likewise.
> > > > > > >         * gcc.target/i386/pr88828-4b.c: Likewise.
> > > > > > >         * gcc.target/i386/pr88828-5a.c: Likewise.
> > > > > > >         * gcc.target/i386/pr88828-5b.c: Likewise.
> > > > > > >         * gcc.target/i386/pr88828-6a.c: Likewise.
> > > > > > >         * gcc.target/i386/pr88828-6b.c: Likewise.
> > > > > >
> > > > > > Here is the updated patch with run-time tests.
> > > > >
> > > > > -      if (TREE_CODE (elt->value) != SSA_NAME)
> > > > > +      if (TREE_CODE (ce->value) != SSA_NAME)
> > > > >         return false;
> > > > >
> > > > > hmm, so it doesn't allow { 0, v[1], v[2], v[3] }?  I think the single
> > > > > scalar value can be a constant as well.
> > > >
> > > > Fixed.
> > > >
> > > > >        if (!def_stmt)
> > > > > -       return false;
> > > > > +       {
> > > > > +         if (gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))
> > > > >
> > > > > if (SSA_NAME_IS_DEFAULT_DEF (ce->value))
> > > > >
> > > > > +           {
> > > > >
> > > > > also you seem to disallow
> > > > >
> > > > >   { i + 1, v[1], v[2], v[3] }
> > > >
> > > > Fixed by
> > > >
> > > >      if (code != BIT_FIELD_REF)
> > > >         {
> > > >           /* Only allow one scalar insert.  */
> > > >           if (nscalars != 0)
> > > >             return false;
> > > >
> > > >           nscalars = 1;
> > > >           insert = true;
> > > >           scalar_idx = i;
> > > >           sel.quick_push (i);
> > > >           scalar_element = ce->value;
> > > >           continue;
> > > >         }
> > > >
> > > > > because get_prop_source_stmt will return the definition computing
> > > > > i + 1 in this case and your code will be skipped?
> > > > >
> > > > > I think you can simplify the code by treating scalar_element != NULL
> > > > > as nscalars == 1 and eliding nscalars.
> > > >
> > > > It works only if
> > > >
> > > > TYPE_VECTOR_SUBPARTS (type).to_constant ()  == (nscalars + nvectors)
> > > >
> > > > We need to check both nscalars and nvectors.  Eliding the nscalars
> > > > check doesn't help much here.
> > > >
> > > > > -      if (conv_code == ERROR_MARK)
> > > > > +      if (conv_code == ERROR_MARK && !insert)
> > > > >         gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
> > > > >                                         orig[1], op2);
> > > > >        else
> > > > > @@ -2148,10 +2198,25 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
> > > > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > > > >           orig[0] = gimple_assign_lhs (perm);
> > > > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > > > -         gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
> > > > > +         gimple_assign_set_rhs_with_ops (gsi,
> > > > > +                                         (conv_code != ERROR_MARK
> > > > > +                                          ? conv_code
> > > > > +                                          : NOP_EXPR),
> > > > > +                                         orig[0],
> > > > >                                           NULL_TREE, NULL_TREE);
> > > > >
> > > > > I believe you should elide the last stmt for conv_code == ERROR_MARK,
> > > > > that is, why did you need to add the && !insert check in the guarding condition
> > > >
> > > > When conv_code == ERROR_MARK, we still need
> > > >
> > > >        gimple *perm
> > > >             = gimple_build_assign (make_ssa_name (TREE_TYPE (orig[0])),
> > > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > > >           orig[0] = gimple_assign_lhs (perm);
> > > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > >           gimple_assign_set_rhs_with_ops (gsi,  NOP_EXPR,
> > > >                                           orig[0],
> > > >                                           NULL_TREE, NULL_TREE);
> > > >
> > > > Otherwise, scalar insert won't work.
> > > >
> > > > > (this path should already do the correct thing?).  Note that in all
> > > > > cases it looks
> > > > > that with conv_code != ERROR_MARK you may end up doing a float->int
> > > > > or int->float conversion on a value it wasn't done on before which might
> > > > > raise exceptions?  That is, do we need to make sure we permute a
> > > > > value we already do convert into the place we're going to insert to?
> > > >
> > > > This couldn't happen:
> > > >
> > > >       if (type == TREE_TYPE (ref))
> > > >          {
> > > >            /* The RHS vector has the same type as LHS.  */
> > > >            if (rhs_vector == NULL)
> > > >              rhs_vector = ref;
> > > >            /* Check if all RHS vector elements come from the same
> > > >               vector.  */
> > > >            if (rhs_vector == ref)
> > > >              nvectors++;
> > > >          }
> > > > ...
> > > >   if (insert
> > > >       && (nvectors == 0
> > > >           || (TYPE_VECTOR_SUBPARTS (type).to_constant ()
> > > >               != (nscalars + nvectors))))
> > > >     return false;
> >
> > I see - that looks like a missed case then?
> >
> >  { 1., (float)v[1], (float)v[2], (float)v[3] }
> >
> > with integer vector v?
>
> True.
>
> > I'll have a look at the full patch next week (it's GCC 10 material in any case).
> >

Now looking again.  I still don't like the new "structure" of the loop
very much.
A refactoring like the attached should make it easier to clearly separate the
cases where we reach a vector def and where not.

Do you want me to take over the patch?

Thanks,
Richard.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: V5 [PATCH] Optimize vector constructor
  2019-05-02 14:54                     ` Richard Biener
@ 2019-05-02 14:55                       ` Richard Biener
  2019-05-02 17:53                         ` H.J. Lu
  0 siblings, 1 reply; 17+ messages in thread
From: Richard Biener @ 2019-05-02 14:55 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Hongtao Liu, Andrew Pinski, GCC Patches

[-- Attachment #1: Type: text/plain, Size: 10077 bytes --]

On Thu, May 2, 2019 at 4:54 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Mon, Mar 11, 2019 at 8:03 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Fri, Mar 8, 2019 at 7:03 PM Richard Biener
> > <richard.guenther@gmail.com> wrote:
> > >
> > > On Fri, Mar 8, 2019 at 9:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Thu, Mar 7, 2019 at 9:51 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > On Wed, Mar 6, 2019 at 8:33 PM Richard Biener
> > > > > <richard.guenther@gmail.com> wrote:
> > > > > >
> > > > > > On Wed, Mar 6, 2019 at 8:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > >
> > > > > > > On Tue, Mar 5, 2019 at 1:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > >
> > > > > > > > On Mon, Mar 04, 2019 at 12:55:04PM +0100, Richard Biener wrote:
> > > > > > > > > On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > >
> > > > > > > > > > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:
> > > > > > > > > > > )
> > > > > > > > > > > ,On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > > > >
> > > > > > > > > > > > For vector init constructor:
> > > > > > > > > > > >
> > > > > > > > > > > > ---
> > > > > > > > > > > > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> > > > > > > > > > > >
> > > > > > > > > > > > __v4sf
> > > > > > > > > > > > foo (__v4sf x, float f)
> > > > > > > > > > > > {
> > > > > > > > > > > >   __v4sf y = { f, x[1], x[2], x[3] };
> > > > > > > > > > > >   return y;
> > > > > > > > > > > > }
> > > > > > > > > > > > ---
> > > > > > > > > > > >
> > > > > > > > > > > > we can optimize vector init constructor with vector copy or permute
> > > > > > > > > > > > followed by a single scalar insert:
> > > > > > > >
> > > > > > > > > and you want to advance to the _1 = BIT_INSERT_EXPR here.  The easiest way
> > > > > > > > > is to emit a new stmt for _2 = copy ...; and do the set_rhs with the
> > > > > > > > > BIT_INSERT_EXPR.
> > > > > > > >
> > > > > > > > Thanks for BIT_INSERT_EXPR suggestion.  I am testing this patch.
> > > > > > > >
> > > > > > > >
> > > > > > > > H.J.
> > > > > > > > ---
> > > > > > > > We can optimize vector constructor with vector copy or permute followed
> > > > > > > > by a single scalar insert:
> > > > > > > >
> > > > > > > >   __v4sf y;
> > > > > > > >   __v4sf D.1930;
> > > > > > > >   float _1;
> > > > > > > >   float _2;
> > > > > > > >   float _3;
> > > > > > > >
> > > > > > > >   <bb 2> :
> > > > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > > > >   y_6 = {f_5(D), _3, _2, _1};
> > > > > > > >   return y_6;
> > > > > > > >
> > > > > > > > with
> > > > > > > >
> > > > > > > >  __v4sf y;
> > > > > > > >   __v4sf D.1930;
> > > > > > > >   float _1;
> > > > > > > >   float _2;
> > > > > > > >   float _3;
> > > > > > > >   vector(4) float _8;
> > > > > > > >
> > > > > > > >   <bb 2> :
> > > > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > > > >   _8 = x_9(D);
> > > > > > > >   y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
> > > > > > > >   return y_6;
> > > > > > > >
> > > > > > > > gcc/
> > > > > > > >
> > > > > > > >         PR tree-optimization/88828
> > > > > > > >         * tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
> > > > > > > >         vector init constructor with vector copy or permute followed
> > > > > > > >         by a single scalar insert.
> > > > > > > >
> > > > > > > > gcc/testsuite/
> > > > > > > >
> > > > > > > >         PR tree-optimization/88828
> > > > > > > >         * gcc.target/i386/pr88828-1a.c: New test.
> > > > > > > >         * gcc.target/i386/pr88828-2b.c: Likewise.
> > > > > > > >         * gcc.target/i386/pr88828-2.c: Likewise.
> > > > > > > >         * gcc.target/i386/pr88828-3a.c: Likewise.
> > > > > > > >         * gcc.target/i386/pr88828-3b.c: Likewise.
> > > > > > > >         * gcc.target/i386/pr88828-3c.c: Likewise.
> > > > > > > >         * gcc.target/i386/pr88828-3d.c: Likewise.
> > > > > > > >         * gcc.target/i386/pr88828-4a.c: Likewise.
> > > > > > > >         * gcc.target/i386/pr88828-4b.c: Likewise.
> > > > > > > >         * gcc.target/i386/pr88828-5a.c: Likewise.
> > > > > > > >         * gcc.target/i386/pr88828-5b.c: Likewise.
> > > > > > > >         * gcc.target/i386/pr88828-6a.c: Likewise.
> > > > > > > >         * gcc.target/i386/pr88828-6b.c: Likewise.
> > > > > > >
> > > > > > > Here is the updated patch with run-time tests.
> > > > > >
> > > > > > -      if (TREE_CODE (elt->value) != SSA_NAME)
> > > > > > +      if (TREE_CODE (ce->value) != SSA_NAME)
> > > > > >         return false;
> > > > > >
> > > > > > hmm, so it doesn't allow { 0, v[1], v[2], v[3] }?  I think the single
> > > > > > scalar value can be a constant as well.
> > > > >
> > > > > Fixed.
> > > > >
> > > > > >        if (!def_stmt)
> > > > > > -       return false;
> > > > > > +       {
> > > > > > +         if (gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))
> > > > > >
> > > > > > if (SSA_NAME_IS_DEFAULT_DEF (ce->value))
> > > > > >
> > > > > > +           {
> > > > > >
> > > > > > also you seem to disallow
> > > > > >
> > > > > >   { i + 1, v[1], v[2], v[3] }
> > > > >
> > > > > Fixed by
> > > > >
> > > > >      if (code != BIT_FIELD_REF)
> > > > >         {
> > > > >           /* Only allow one scalar insert.  */
> > > > >           if (nscalars != 0)
> > > > >             return false;
> > > > >
> > > > >           nscalars = 1;
> > > > >           insert = true;
> > > > >           scalar_idx = i;
> > > > >           sel.quick_push (i);
> > > > >           scalar_element = ce->value;
> > > > >           continue;
> > > > >         }
> > > > >
> > > > > > because get_prop_source_stmt will return the definition computing
> > > > > > i + 1 in this case and your code will be skipped?
> > > > > >
> > > > > > I think you can simplify the code by treating scalar_element != NULL
> > > > > > as nscalars == 1 and eliding nscalars.
> > > > >
> > > > > It works only if
> > > > >
> > > > > TYPE_VECTOR_SUBPARTS (type).to_constant ()  == (nscalars + nvectors)
> > > > >
> > > > > We need to check both nscalars and nvectors.  Eliding the nscalars
> > > > > check doesn't help much here.
> > > > >
> > > > > > -      if (conv_code == ERROR_MARK)
> > > > > > +      if (conv_code == ERROR_MARK && !insert)
> > > > > >         gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
> > > > > >                                         orig[1], op2);
> > > > > >        else
> > > > > > @@ -2148,10 +2198,25 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
> > > > > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > > > > >           orig[0] = gimple_assign_lhs (perm);
> > > > > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > > > > -         gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
> > > > > > +         gimple_assign_set_rhs_with_ops (gsi,
> > > > > > +                                         (conv_code != ERROR_MARK
> > > > > > +                                          ? conv_code
> > > > > > +                                          : NOP_EXPR),
> > > > > > +                                         orig[0],
> > > > > >                                           NULL_TREE, NULL_TREE);
> > > > > >
> > > > > > I believe you should elide the last stmt for conv_code == ERROR_MARK,
> > > > > > that is, why did you need to add the && !insert check in the guarding condition
> > > > >
> > > > > When conv_code == ERROR_MARK, we still need
> > > > >
> > > > >        gimple *perm
> > > > >             = gimple_build_assign (make_ssa_name (TREE_TYPE (orig[0])),
> > > > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > > > >           orig[0] = gimple_assign_lhs (perm);
> > > > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > > >           gimple_assign_set_rhs_with_ops (gsi,  NOP_EXPR,
> > > > >                                           orig[0],
> > > > >                                           NULL_TREE, NULL_TREE);
> > > > >
> > > > > Otherwise, scalar insert won't work.
> > > > >
> > > > > > (this path should already do the correct thing?).  Note that in all
> > > > > > cases it looks
> > > > > > that with conv_code != ERROR_MARK you may end up doing a float->int
> > > > > > or int->float conversion on a value it wasn't done on before which might
> > > > > > raise exceptions?  That is, do we need to make sure we permute a
> > > > > > value we already do convert into the place we're going to insert to?
> > > > >
> > > > > This couldn't happen:
> > > > >
> > > > >       if (type == TREE_TYPE (ref))
> > > > >          {
> > > > >            /* The RHS vector has the same type as LHS.  */
> > > > >            if (rhs_vector == NULL)
> > > > >              rhs_vector = ref;
> > > > >            /* Check if all RHS vector elements come from the same
> > > > >               vector.  */
> > > > >            if (rhs_vector == ref)
> > > > >              nvectors++;
> > > > >          }
> > > > > ...
> > > > >   if (insert
> > > > >       && (nvectors == 0
> > > > >           || (TYPE_VECTOR_SUBPARTS (type).to_constant ()
> > > > >               != (nscalars + nvectors))))
> > > > >     return false;
> > >
> > > I see - that looks like a missed case then?
> > >
> > >  { 1., (float)v[1], (float)v[2], (float)v[3] }
> > >
> > > with integer vector v?
> >
> > True.
> >
> > > I'll have a look at the full patch next week (it's GCC 10 material in any case).
> > >
>
> Now looking again.  I still don't like the new "structure" of the loop
> very much.
> A refactoring like the attached should make it easier to clearly separate the
> cases where we reach a vector def and where not.

Now attached.

> Do you want me to take over the patch?
>
> Thanks,
> Richard.

[-- Attachment #2: p --]
[-- Type: application/octet-stream, Size: 2723 bytes --]

Index: gcc/tree-ssa-forwprop.c
===================================================================
--- gcc/tree-ssa-forwprop.c	(revision 270791)
+++ gcc/tree-ssa-forwprop.c	(working copy)
@@ -1997,6 +1997,44 @@ simplify_permutation (gimple_stmt_iterat
   return 0;
 }
 
+/* Get the BIT_FIELD_REF definition of VAL, if any, looking through
+   conversions with code CONV_CODE or update it if still ERROR_MARK.
+   Return NULL_TREE if no such matching def was found.  */
+
+static tree
+get_bit_field_ref_def (tree val, enum tree_code &conv_code)
+{
+  if (TREE_CODE (val) != SSA_NAME)
+    return NULL_TREE ;
+  gimple *def_stmt = get_prop_source_stmt (val, false, NULL);
+  if (!def_stmt)
+    return NULL_TREE;
+  enum tree_code code = gimple_assign_rhs_code (def_stmt);
+  if (code == FLOAT_EXPR
+      || code == FIX_TRUNC_EXPR)
+    {
+      tree op1 = gimple_assign_rhs1 (def_stmt);
+      if (conv_code == ERROR_MARK)
+	{
+	  if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (val))),
+			GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1)))))
+	    return NULL_TREE;
+	  conv_code = code;
+	}
+      else if (conv_code != code)
+	return NULL_TREE;
+      if (TREE_CODE (op1) != SSA_NAME)
+	return NULL_TREE;
+      def_stmt = SSA_NAME_DEF_STMT (op1);
+      if (! is_gimple_assign (def_stmt))
+	return NULL_TREE;
+      code = gimple_assign_rhs_code (def_stmt);
+    }
+  if (code != BIT_FIELD_REF)
+    return NULL_TREE;
+  return gimple_assign_rhs1 (def_stmt);
+}
+
 /* Recognize a VEC_PERM_EXPR.  Returns true if there were any changes.  */
 
 static bool
@@ -2034,35 +2072,9 @@ simplify_vector_constructor (gimple_stmt
       if (i >= nelts)
 	return false;
 
-      if (TREE_CODE (elt->value) != SSA_NAME)
-	return false;
-      def_stmt = get_prop_source_stmt (elt->value, false, NULL);
-      if (!def_stmt)
-	return false;
-      code = gimple_assign_rhs_code (def_stmt);
-      if (code == FLOAT_EXPR
-	  || code == FIX_TRUNC_EXPR)
-	{
-	  op1 = gimple_assign_rhs1 (def_stmt);
-	  if (conv_code == ERROR_MARK)
-	    {
-	      if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (elt->value))),
-			    GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1)))))
-		return false;
-	      conv_code = code;
-	    }
-	  else if (conv_code != code)
-	    return false;
-	  if (TREE_CODE (op1) != SSA_NAME)
-	    return false;
-	  def_stmt = SSA_NAME_DEF_STMT (op1);
-	  if (! is_gimple_assign (def_stmt))
-	    return false;
-	  code = gimple_assign_rhs_code (def_stmt);
-	}
-      if (code != BIT_FIELD_REF)
+      op1 = get_bit_field_ref_def (elt->value, conv_code);
+      if (!op1)
 	return false;
-      op1 = gimple_assign_rhs1 (def_stmt);
       ref = TREE_OPERAND (op1, 0);
       unsigned int j;
       for (j = 0; j < 2; ++j)

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: V5 [PATCH] Optimize vector constructor
  2019-05-02 14:55                       ` Richard Biener
@ 2019-05-02 17:53                         ` H.J. Lu
  2019-05-03 16:54                           ` V6 " H.J. Lu
  0 siblings, 1 reply; 17+ messages in thread
From: H.J. Lu @ 2019-05-02 17:53 UTC (permalink / raw)
  To: Richard Biener; +Cc: Hongtao Liu, Andrew Pinski, GCC Patches

On Thu, May 2, 2019 at 7:55 AM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Thu, May 2, 2019 at 4:54 PM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Mon, Mar 11, 2019 at 8:03 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Fri, Mar 8, 2019 at 7:03 PM Richard Biener
> > > <richard.guenther@gmail.com> wrote:
> > > >
> > > > On Fri, Mar 8, 2019 at 9:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > On Thu, Mar 7, 2019 at 9:51 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > >
> > > > > > On Wed, Mar 6, 2019 at 8:33 PM Richard Biener
> > > > > > <richard.guenther@gmail.com> wrote:
> > > > > > >
> > > > > > > On Wed, Mar 6, 2019 at 8:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > >
> > > > > > > > On Tue, Mar 5, 2019 at 1:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > >
> > > > > > > > > On Mon, Mar 04, 2019 at 12:55:04PM +0100, Richard Biener wrote:
> > > > > > > > > > On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > > >
> > > > > > > > > > > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:
> > > > > > > > > > > > )
> > > > > > > > > > > > ,On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > > > > >
> > > > > > > > > > > > > For vector init constructor:
> > > > > > > > > > > > >
> > > > > > > > > > > > > ---
> > > > > > > > > > > > > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> > > > > > > > > > > > >
> > > > > > > > > > > > > __v4sf
> > > > > > > > > > > > > foo (__v4sf x, float f)
> > > > > > > > > > > > > {
> > > > > > > > > > > > >   __v4sf y = { f, x[1], x[2], x[3] };
> > > > > > > > > > > > >   return y;
> > > > > > > > > > > > > }
> > > > > > > > > > > > > ---
> > > > > > > > > > > > >
> > > > > > > > > > > > > we can optimize vector init constructor with vector copy or permute
> > > > > > > > > > > > > followed by a single scalar insert:
> > > > > > > > >
> > > > > > > > > > and you want to advance to the _1 = BIT_INSERT_EXPR here.  The easiest way
> > > > > > > > > > is to emit a new stmt for _2 = copy ...; and do the set_rhs with the
> > > > > > > > > > BIT_INSERT_EXPR.
> > > > > > > > >
> > > > > > > > > Thanks for BIT_INSERT_EXPR suggestion.  I am testing this patch.
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > H.J.
> > > > > > > > > ---
> > > > > > > > > We can optimize vector constructor with vector copy or permute followed
> > > > > > > > > by a single scalar insert:
> > > > > > > > >
> > > > > > > > >   __v4sf y;
> > > > > > > > >   __v4sf D.1930;
> > > > > > > > >   float _1;
> > > > > > > > >   float _2;
> > > > > > > > >   float _3;
> > > > > > > > >
> > > > > > > > >   <bb 2> :
> > > > > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > > > > >   y_6 = {f_5(D), _3, _2, _1};
> > > > > > > > >   return y_6;
> > > > > > > > >
> > > > > > > > > with
> > > > > > > > >
> > > > > > > > >  __v4sf y;
> > > > > > > > >   __v4sf D.1930;
> > > > > > > > >   float _1;
> > > > > > > > >   float _2;
> > > > > > > > >   float _3;
> > > > > > > > >   vector(4) float _8;
> > > > > > > > >
> > > > > > > > >   <bb 2> :
> > > > > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > > > > >   _8 = x_9(D);
> > > > > > > > >   y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
> > > > > > > > >   return y_6;
> > > > > > > > >
> > > > > > > > > gcc/
> > > > > > > > >
> > > > > > > > >         PR tree-optimization/88828
> > > > > > > > >         * tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
> > > > > > > > >         vector init constructor with vector copy or permute followed
> > > > > > > > >         by a single scalar insert.
> > > > > > > > >
> > > > > > > > > gcc/testsuite/
> > > > > > > > >
> > > > > > > > >         PR tree-optimization/88828
> > > > > > > > >         * gcc.target/i386/pr88828-1a.c: New test.
> > > > > > > > >         * gcc.target/i386/pr88828-2b.c: Likewise.
> > > > > > > > >         * gcc.target/i386/pr88828-2.c: Likewise.
> > > > > > > > >         * gcc.target/i386/pr88828-3a.c: Likewise.
> > > > > > > > >         * gcc.target/i386/pr88828-3b.c: Likewise.
> > > > > > > > >         * gcc.target/i386/pr88828-3c.c: Likewise.
> > > > > > > > >         * gcc.target/i386/pr88828-3d.c: Likewise.
> > > > > > > > >         * gcc.target/i386/pr88828-4a.c: Likewise.
> > > > > > > > >         * gcc.target/i386/pr88828-4b.c: Likewise.
> > > > > > > > >         * gcc.target/i386/pr88828-5a.c: Likewise.
> > > > > > > > >         * gcc.target/i386/pr88828-5b.c: Likewise.
> > > > > > > > >         * gcc.target/i386/pr88828-6a.c: Likewise.
> > > > > > > > >         * gcc.target/i386/pr88828-6b.c: Likewise.
> > > > > > > >
> > > > > > > > Here is the updated patch with run-time tests.
> > > > > > >
> > > > > > > -      if (TREE_CODE (elt->value) != SSA_NAME)
> > > > > > > +      if (TREE_CODE (ce->value) != SSA_NAME)
> > > > > > >         return false;
> > > > > > >
> > > > > > > hmm, so it doesn't allow { 0, v[1], v[2], v[3] }?  I think the single
> > > > > > > scalar value can be a constant as well.
> > > > > >
> > > > > > Fixed.
> > > > > >
> > > > > > >        if (!def_stmt)
> > > > > > > -       return false;
> > > > > > > +       {
> > > > > > > +         if (gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))
> > > > > > >
> > > > > > > if (SSA_NAME_IS_DEFAULT_DEF (ce->value))
> > > > > > >
> > > > > > > +           {
> > > > > > >
> > > > > > > also you seem to disallow
> > > > > > >
> > > > > > >   { i + 1, v[1], v[2], v[3] }
> > > > > >
> > > > > > Fixed by
> > > > > >
> > > > > >      if (code != BIT_FIELD_REF)
> > > > > >         {
> > > > > >           /* Only allow one scalar insert.  */
> > > > > >           if (nscalars != 0)
> > > > > >             return false;
> > > > > >
> > > > > >           nscalars = 1;
> > > > > >           insert = true;
> > > > > >           scalar_idx = i;
> > > > > >           sel.quick_push (i);
> > > > > >           scalar_element = ce->value;
> > > > > >           continue;
> > > > > >         }
> > > > > >
> > > > > > > because get_prop_source_stmt will return the definition computing
> > > > > > > i + 1 in this case and your code will be skipped?
> > > > > > >
> > > > > > > I think you can simplify the code by treating scalar_element != NULL
> > > > > > > as nscalars == 1 and eliding nscalars.
> > > > > >
> > > > > > It works only if
> > > > > >
> > > > > > TYPE_VECTOR_SUBPARTS (type).to_constant ()  == (nscalars + nvectors)
> > > > > >
> > > > > > We need to check both nscalars and nvectors.  Eliding the nscalars
> > > > > > check doesn't help much here.
> > > > > >
> > > > > > > -      if (conv_code == ERROR_MARK)
> > > > > > > +      if (conv_code == ERROR_MARK && !insert)
> > > > > > >         gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
> > > > > > >                                         orig[1], op2);
> > > > > > >        else
> > > > > > > @@ -2148,10 +2198,25 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
> > > > > > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > > > > > >           orig[0] = gimple_assign_lhs (perm);
> > > > > > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > > > > > -         gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
> > > > > > > +         gimple_assign_set_rhs_with_ops (gsi,
> > > > > > > +                                         (conv_code != ERROR_MARK
> > > > > > > +                                          ? conv_code
> > > > > > > +                                          : NOP_EXPR),
> > > > > > > +                                         orig[0],
> > > > > > >                                           NULL_TREE, NULL_TREE);
> > > > > > >
> > > > > > > I believe you should elide the last stmt for conv_code == ERROR_MARK,
> > > > > > > that is, why did you need to add the && !insert check in the guarding condition
> > > > > >
> > > > > > When conv_code == ERROR_MARK, we still need
> > > > > >
> > > > > >        gimple *perm
> > > > > >             = gimple_build_assign (make_ssa_name (TREE_TYPE (orig[0])),
> > > > > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > > > > >           orig[0] = gimple_assign_lhs (perm);
> > > > > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > > > >           gimple_assign_set_rhs_with_ops (gsi,  NOP_EXPR,
> > > > > >                                           orig[0],
> > > > > >                                           NULL_TREE, NULL_TREE);
> > > > > >
> > > > > > Otherwise, scalar insert won't work.
> > > > > >
> > > > > > > (this path should already do the correct thing?).  Note that in all
> > > > > > > cases it looks
> > > > > > > that with conv_code != ERROR_MARK you may end up doing a float->int
> > > > > > > or int->float conversion on a value it wasn't done on before which might
> > > > > > > raise exceptions?  That is, do we need to make sure we permute a
> > > > > > > value we already do convert into the place we're going to insert to?
> > > > > >
> > > > > > This couldn't happen:
> > > > > >
> > > > > >       if (type == TREE_TYPE (ref))
> > > > > >          {
> > > > > >            /* The RHS vector has the same type as LHS.  */
> > > > > >            if (rhs_vector == NULL)
> > > > > >              rhs_vector = ref;
> > > > > >            /* Check if all RHS vector elements come from the same
> > > > > >               vector.  */
> > > > > >            if (rhs_vector == ref)
> > > > > >              nvectors++;
> > > > > >          }
> > > > > > ...
> > > > > >   if (insert
> > > > > >       && (nvectors == 0
> > > > > >           || (TYPE_VECTOR_SUBPARTS (type).to_constant ()
> > > > > >               != (nscalars + nvectors))))
> > > > > >     return false;
> > > >
> > > > I see - that looks like a missed case then?
> > > >
> > > >  { 1., (float)v[1], (float)v[2], (float)v[3] }
> > > >
> > > > with integer vector v?
> > >
> > > True.
> > >
> > > > I'll have a look at the full patch next week (it's GCC 10 material in any case).
> > > >
> >
> > Now looking again.  I still don't like the new "structure" of the loop
> > very much.
> > A refactoring like the attached should make it easier to clearly separate the
> > cases where we reach a vector def and where not.
>
> Now attached.
>
> > Do you want me to take over the patch?
> >

Sure.

Thanks.


-- 
H.J.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* V6 [PATCH] Optimize vector constructor
  2019-05-02 17:53                         ` H.J. Lu
@ 2019-05-03 16:54                           ` H.J. Lu
  2019-05-08 12:04                             ` Richard Biener
  0 siblings, 1 reply; 17+ messages in thread
From: H.J. Lu @ 2019-05-03 16:54 UTC (permalink / raw)
  To: Richard Biener; +Cc: Hongtao Liu, Andrew Pinski, GCC Patches

[-- Attachment #1: Type: text/plain, Size: 11248 bytes --]

On Thu, May 2, 2019 at 10:53 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, May 2, 2019 at 7:55 AM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Thu, May 2, 2019 at 4:54 PM Richard Biener
> > <richard.guenther@gmail.com> wrote:
> > >
> > > On Mon, Mar 11, 2019 at 8:03 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Fri, Mar 8, 2019 at 7:03 PM Richard Biener
> > > > <richard.guenther@gmail.com> wrote:
> > > > >
> > > > > On Fri, Mar 8, 2019 at 9:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > >
> > > > > > On Thu, Mar 7, 2019 at 9:51 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > >
> > > > > > > On Wed, Mar 6, 2019 at 8:33 PM Richard Biener
> > > > > > > <richard.guenther@gmail.com> wrote:
> > > > > > > >
> > > > > > > > On Wed, Mar 6, 2019 at 8:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > >
> > > > > > > > > On Tue, Mar 5, 2019 at 1:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > >
> > > > > > > > > > On Mon, Mar 04, 2019 at 12:55:04PM +0100, Richard Biener wrote:
> > > > > > > > > > > On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > > > >
> > > > > > > > > > > > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:
> > > > > > > > > > > > > )
> > > > > > > > > > > > > ,On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > For vector init constructor:
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > ---
> > > > > > > > > > > > > > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > __v4sf
> > > > > > > > > > > > > > foo (__v4sf x, float f)
> > > > > > > > > > > > > > {
> > > > > > > > > > > > > >   __v4sf y = { f, x[1], x[2], x[3] };
> > > > > > > > > > > > > >   return y;
> > > > > > > > > > > > > > }
> > > > > > > > > > > > > > ---
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > we can optimize vector init constructor with vector copy or permute
> > > > > > > > > > > > > > followed by a single scalar insert:
> > > > > > > > > >
> > > > > > > > > > > and you want to advance to the _1 = BIT_INSERT_EXPR here.  The easiest way
> > > > > > > > > > > is to emit a new stmt for _2 = copy ...; and do the set_rhs with the
> > > > > > > > > > > BIT_INSERT_EXPR.
> > > > > > > > > >
> > > > > > > > > > Thanks for BIT_INSERT_EXPR suggestion.  I am testing this patch.
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > H.J.
> > > > > > > > > > ---
> > > > > > > > > > We can optimize vector constructor with vector copy or permute followed
> > > > > > > > > > by a single scalar insert:
> > > > > > > > > >
> > > > > > > > > >   __v4sf y;
> > > > > > > > > >   __v4sf D.1930;
> > > > > > > > > >   float _1;
> > > > > > > > > >   float _2;
> > > > > > > > > >   float _3;
> > > > > > > > > >
> > > > > > > > > >   <bb 2> :
> > > > > > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > > > > > >   y_6 = {f_5(D), _3, _2, _1};
> > > > > > > > > >   return y_6;
> > > > > > > > > >
> > > > > > > > > > with
> > > > > > > > > >
> > > > > > > > > >  __v4sf y;
> > > > > > > > > >   __v4sf D.1930;
> > > > > > > > > >   float _1;
> > > > > > > > > >   float _2;
> > > > > > > > > >   float _3;
> > > > > > > > > >   vector(4) float _8;
> > > > > > > > > >
> > > > > > > > > >   <bb 2> :
> > > > > > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > > > > > >   _8 = x_9(D);
> > > > > > > > > >   y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
> > > > > > > > > >   return y_6;
> > > > > > > > > >
> > > > > > > > > > gcc/
> > > > > > > > > >
> > > > > > > > > >         PR tree-optimization/88828
> > > > > > > > > >         * tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
> > > > > > > > > >         vector init constructor with vector copy or permute followed
> > > > > > > > > >         by a single scalar insert.
> > > > > > > > > >
> > > > > > > > > > gcc/testsuite/
> > > > > > > > > >
> > > > > > > > > >         PR tree-optimization/88828
> > > > > > > > > >         * gcc.target/i386/pr88828-1a.c: New test.
> > > > > > > > > >         * gcc.target/i386/pr88828-2b.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-2.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-3a.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-3b.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-3c.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-3d.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-4a.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-4b.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-5a.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-5b.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-6a.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-6b.c: Likewise.
> > > > > > > > >
> > > > > > > > > Here is the updated patch with run-time tests.
> > > > > > > >
> > > > > > > > -      if (TREE_CODE (elt->value) != SSA_NAME)
> > > > > > > > +      if (TREE_CODE (ce->value) != SSA_NAME)
> > > > > > > >         return false;
> > > > > > > >
> > > > > > > > hmm, so it doesn't allow { 0, v[1], v[2], v[3] }?  I think the single
> > > > > > > > scalar value can be a constant as well.
> > > > > > >
> > > > > > > Fixed.
> > > > > > >
> > > > > > > >        if (!def_stmt)
> > > > > > > > -       return false;
> > > > > > > > +       {
> > > > > > > > +         if (gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))
> > > > > > > >
> > > > > > > > if (SSA_NAME_IS_DEFAULT_DEF (ce->value))
> > > > > > > >
> > > > > > > > +           {
> > > > > > > >
> > > > > > > > also you seem to disallow
> > > > > > > >
> > > > > > > >   { i + 1, v[1], v[2], v[3] }
> > > > > > >
> > > > > > > Fixed by
> > > > > > >
> > > > > > >      if (code != BIT_FIELD_REF)
> > > > > > >         {
> > > > > > >           /* Only allow one scalar insert.  */
> > > > > > >           if (nscalars != 0)
> > > > > > >             return false;
> > > > > > >
> > > > > > >           nscalars = 1;
> > > > > > >           insert = true;
> > > > > > >           scalar_idx = i;
> > > > > > >           sel.quick_push (i);
> > > > > > >           scalar_element = ce->value;
> > > > > > >           continue;
> > > > > > >         }
> > > > > > >
> > > > > > > > because get_prop_source_stmt will return the definition computing
> > > > > > > > i + 1 in this case and your code will be skipped?
> > > > > > > >
> > > > > > > > I think you can simplify the code by treating scalar_element != NULL
> > > > > > > > as nscalars == 1 and eliding nscalars.
> > > > > > >
> > > > > > > It works only if
> > > > > > >
> > > > > > > TYPE_VECTOR_SUBPARTS (type).to_constant ()  == (nscalars + nvectors)
> > > > > > >
> > > > > > > > We need to check both nscalars and nvectors.  Eliding the nscalars
> > > > > > > > check doesn't help much here.
> > > > > > >
> > > > > > > > -      if (conv_code == ERROR_MARK)
> > > > > > > > +      if (conv_code == ERROR_MARK && !insert)
> > > > > > > >         gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
> > > > > > > >                                         orig[1], op2);
> > > > > > > >        else
> > > > > > > > @@ -2148,10 +2198,25 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
> > > > > > > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > > > > > > >           orig[0] = gimple_assign_lhs (perm);
> > > > > > > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > > > > > > -         gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
> > > > > > > > +         gimple_assign_set_rhs_with_ops (gsi,
> > > > > > > > +                                         (conv_code != ERROR_MARK
> > > > > > > > +                                          ? conv_code
> > > > > > > > +                                          : NOP_EXPR),
> > > > > > > > +                                         orig[0],
> > > > > > > >                                           NULL_TREE, NULL_TREE);
> > > > > > > >
> > > > > > > > I believe you should elide the last stmt for conv_code == ERROR_MARK,
> > > > > > > > that is, why did you need to add the && !insert check in the guarding condition
> > > > > > >
> > > > > > > When conv_code == ERROR_MARK, we still need
> > > > > > >
> > > > > > >        gimple *perm
> > > > > > >             = gimple_build_assign (make_ssa_name (TREE_TYPE (orig[0])),
> > > > > > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > > > > > >           orig[0] = gimple_assign_lhs (perm);
> > > > > > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > > > > >           gimple_assign_set_rhs_with_ops (gsi,  NOP_EXPR,
> > > > > > >                                           orig[0],
> > > > > > >                                           NULL_TREE, NULL_TREE);
> > > > > > >
> > > > > > > Otherwise, scalar insert won't work.
> > > > > > >
> > > > > > > > (this path should already do the correct thing?).  Note that in all
> > > > > > > > cases it looks
> > > > > > > > that with conv_code != ERROR_MARK you may end up doing a float->int
> > > > > > > > or int->float conversion on a value it wasn't done on before which might
> > > > > > > > raise exceptions?  That is, do we need to make sure we permute a
> > > > > > > > value we already do convert into the place we're going to insert to?
> > > > > > >
> > > > > > > This couldn't happen:
> > > > > > >
> > > > > > >       if (type == TREE_TYPE (ref))
> > > > > > >          {
> > > > > > >            /* The RHS vector has the same type as LHS.  */
> > > > > > >            if (rhs_vector == NULL)
> > > > > > >              rhs_vector = ref;
> > > > > > > >            /* Check if all RHS vector elements come from the same
> > > > > > >               vector.  */
> > > > > > >            if (rhs_vector == ref)
> > > > > > >              nvectors++;
> > > > > > >          }
> > > > > > > ...
> > > > > > >   if (insert
> > > > > > >       && (nvectors == 0
> > > > > > >           || (TYPE_VECTOR_SUBPARTS (type).to_constant ()
> > > > > > >               != (nscalars + nvectors))))
> > > > > > >     return false;
> > > > >
> > > > > I see - that looks like a missed case then?
> > > > >
> > > > >  { 1., (float)v[1], (float)v[2], (float)v[3] }
> > > > >
> > > > > with integer vector v?
> > > >
> > > > True.
> > > >
> > > > > I'll have a look at the full patch next week (it's GCC 10 material in any case).
> > > > >
> > >
> > > Now looking again.  I still don't like the new "structure" of the loop
> > > very much.
> > > A refactoring like the attached should make it easier to clearly separate the
> > > cases where we reach a vector def and where not.
> >
> > Now attached.
> >
> > > Do you want me to take over the patch?
> > >
>

Here is the updated patch on top of your patch plus my fix.

-- 
H.J.

[-- Attachment #2: 0001-Optimize-vector-constructor.patch --]
[-- Type: application/x-patch, Size: 35171 bytes --]

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: V6 [PATCH] Optimize vector constructor
  2019-05-03 16:54                           ` V6 " H.J. Lu
@ 2019-05-08 12:04                             ` Richard Biener
  2019-05-14  9:13                               ` Richard Biener
  0 siblings, 1 reply; 17+ messages in thread
From: Richard Biener @ 2019-05-08 12:04 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Hongtao Liu, Andrew Pinski, GCC Patches

[-- Attachment #1: Type: text/plain, Size: 15524 bytes --]

On Fri, May 3, 2019 at 6:54 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, May 2, 2019 at 10:53 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Thu, May 2, 2019 at 7:55 AM Richard Biener
> > <richard.guenther@gmail.com> wrote:
> > >
> > > On Thu, May 2, 2019 at 4:54 PM Richard Biener
> > > <richard.guenther@gmail.com> wrote:
> > > >
> > > > On Mon, Mar 11, 2019 at 8:03 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > On Fri, Mar 8, 2019 at 7:03 PM Richard Biener
> > > > > <richard.guenther@gmail.com> wrote:
> > > > > >
> > > > > > On Fri, Mar 8, 2019 at 9:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > >
> > > > > > > On Thu, Mar 7, 2019 at 9:51 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > >
> > > > > > > > On Wed, Mar 6, 2019 at 8:33 PM Richard Biener
> > > > > > > > <richard.guenther@gmail.com> wrote:
> > > > > > > > >
> > > > > > > > > On Wed, Mar 6, 2019 at 8:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > >
> > > > > > > > > > On Tue, Mar 5, 2019 at 1:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > > >
> > > > > > > > > > > On Mon, Mar 04, 2019 at 12:55:04PM +0100, Richard Biener wrote:
> > > > > > > > > > > > On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > > > > >
> > > > > > > > > > > > > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:
> > > > > > > > > > > > > > )
> > > > > > > > > > > > > > ,On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > For vector init constructor:
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > ---
> > > > > > > > > > > > > > > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > __v4sf
> > > > > > > > > > > > > > > foo (__v4sf x, float f)
> > > > > > > > > > > > > > > {
> > > > > > > > > > > > > > >   __v4sf y = { f, x[1], x[2], x[3] };
> > > > > > > > > > > > > > >   return y;
> > > > > > > > > > > > > > > }
> > > > > > > > > > > > > > > ---
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > we can optimize vector init constructor with vector copy or permute
> > > > > > > > > > > > > > > followed by a single scalar insert:
> > > > > > > > > > >
> > > > > > > > > > > > and you want to advance to the _1 = BIT_INSERT_EXPR here.  The easiest way
> > > > > > > > > > > > is to emit a new stmt for _2 = copy ...; and do the set_rhs with the
> > > > > > > > > > > > BIT_INSERT_EXPR.
> > > > > > > > > > >
> > > > > > > > > > > Thanks for BIT_INSERT_EXPR suggestion.  I am testing this patch.
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > H.J.
> > > > > > > > > > > ---
> > > > > > > > > > > We can optimize vector constructor with vector copy or permute followed
> > > > > > > > > > > by a single scalar insert:
> > > > > > > > > > >
> > > > > > > > > > >   __v4sf y;
> > > > > > > > > > >   __v4sf D.1930;
> > > > > > > > > > >   float _1;
> > > > > > > > > > >   float _2;
> > > > > > > > > > >   float _3;
> > > > > > > > > > >
> > > > > > > > > > >   <bb 2> :
> > > > > > > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > > > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > > > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > > > > > > >   y_6 = {f_5(D), _3, _2, _1};
> > > > > > > > > > >   return y_6;
> > > > > > > > > > >
> > > > > > > > > > > with
> > > > > > > > > > >
> > > > > > > > > > >  __v4sf y;
> > > > > > > > > > >   __v4sf D.1930;
> > > > > > > > > > >   float _1;
> > > > > > > > > > >   float _2;
> > > > > > > > > > >   float _3;
> > > > > > > > > > >   vector(4) float _8;
> > > > > > > > > > >
> > > > > > > > > > >   <bb 2> :
> > > > > > > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > > > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > > > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > > > > > > >   _8 = x_9(D);
> > > > > > > > > > >   y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
> > > > > > > > > > >   return y_6;
> > > > > > > > > > >
> > > > > > > > > > > gcc/
> > > > > > > > > > >
> > > > > > > > > > >         PR tree-optimization/88828
> > > > > > > > > > >         * tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
> > > > > > > > > > >         vector init constructor with vector copy or permute followed
> > > > > > > > > > >         by a single scalar insert.
> > > > > > > > > > >
> > > > > > > > > > > gcc/testsuite/
> > > > > > > > > > >
> > > > > > > > > > >         PR tree-optimization/88828
> > > > > > > > > > >         * gcc.target/i386/pr88828-1a.c: New test.
> > > > > > > > > > >         * gcc.target/i386/pr88828-2b.c: Likewise.
> > > > > > > > > > >         * gcc.target/i386/pr88828-2.c: Likewise.
> > > > > > > > > > >         * gcc.target/i386/pr88828-3a.c: Likewise.
> > > > > > > > > > >         * gcc.target/i386/pr88828-3b.c: Likewise.
> > > > > > > > > > >         * gcc.target/i386/pr88828-3c.c: Likewise.
> > > > > > > > > > >         * gcc.target/i386/pr88828-3d.c: Likewise.
> > > > > > > > > > >         * gcc.target/i386/pr88828-4a.c: Likewise.
> > > > > > > > > > >         * gcc.target/i386/pr88828-4b.c: Likewise.
> > > > > > > > > > >         * gcc.target/i386/pr88828-5a.c: Likewise.
> > > > > > > > > > >         * gcc.target/i386/pr88828-5b.c: Likewise.
> > > > > > > > > > >         * gcc.target/i386/pr88828-6a.c: Likewise.
> > > > > > > > > > >         * gcc.target/i386/pr88828-6b.c: Likewise.
> > > > > > > > > >
> > > > > > > > > > Here is the updated patch with run-time tests.
> > > > > > > > >
> > > > > > > > > -      if (TREE_CODE (elt->value) != SSA_NAME)
> > > > > > > > > +      if (TREE_CODE (ce->value) != SSA_NAME)
> > > > > > > > >         return false;
> > > > > > > > >
> > > > > > > > > hmm, so it doesn't allow { 0, v[1], v[2], v[3] }?  I think the single
> > > > > > > > > scalar value can be a constant as well.
> > > > > > > >
> > > > > > > > Fixed.
> > > > > > > >
> > > > > > > > >        if (!def_stmt)
> > > > > > > > > -       return false;
> > > > > > > > > +       {
> > > > > > > > > +         if (gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))
> > > > > > > > >
> > > > > > > > > if (SSA_NAME_IS_DEFAULT_DEF (ce->value))
> > > > > > > > >
> > > > > > > > > +           {
> > > > > > > > >
> > > > > > > > > also you seem to disallow
> > > > > > > > >
> > > > > > > > >   { i + 1, v[1], v[2], v[3] }
> > > > > > > >
> > > > > > > > Fixed by
> > > > > > > >
> > > > > > > >      if (code != BIT_FIELD_REF)
> > > > > > > >         {
> > > > > > > >           /* Only allow one scalar insert.  */
> > > > > > > >           if (nscalars != 0)
> > > > > > > >             return false;
> > > > > > > >
> > > > > > > >           nscalars = 1;
> > > > > > > >           insert = true;
> > > > > > > >           scalar_idx = i;
> > > > > > > >           sel.quick_push (i);
> > > > > > > >           scalar_element = ce->value;
> > > > > > > >           continue;
> > > > > > > >         }
> > > > > > > >
> > > > > > > > > because get_prop_source_stmt will return the definition computing
> > > > > > > > > i + 1 in this case and your code will be skipped?
> > > > > > > > >
> > > > > > > > > I think you can simplify the code by treating scalar_element != NULL
> > > > > > > > > as nscalars == 1 and eliding nscalars.
> > > > > > > >
> > > > > > > > It works only if
> > > > > > > >
> > > > > > > > TYPE_VECTOR_SUBPARTS (type).to_constant ()  == (nscalars + nvectors)
> > > > > > > >
> > > > > > > > > We need to check both nscalars and nvectors.  Eliding the nscalars
> > > > > > > > > check doesn't help much here.
> > > > > > > >
> > > > > > > > > -      if (conv_code == ERROR_MARK)
> > > > > > > > > +      if (conv_code == ERROR_MARK && !insert)
> > > > > > > > >         gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
> > > > > > > > >                                         orig[1], op2);
> > > > > > > > >        else
> > > > > > > > > @@ -2148,10 +2198,25 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
> > > > > > > > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > > > > > > > >           orig[0] = gimple_assign_lhs (perm);
> > > > > > > > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > > > > > > > -         gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
> > > > > > > > > +         gimple_assign_set_rhs_with_ops (gsi,
> > > > > > > > > +                                         (conv_code != ERROR_MARK
> > > > > > > > > +                                          ? conv_code
> > > > > > > > > +                                          : NOP_EXPR),
> > > > > > > > > +                                         orig[0],
> > > > > > > > >                                           NULL_TREE, NULL_TREE);
> > > > > > > > >
> > > > > > > > > I believe you should elide the last stmt for conv_code == ERROR_MARK,
> > > > > > > > > that is, why did you need to add the && !insert check in the guarding condition
> > > > > > > >
> > > > > > > > When conv_code == ERROR_MARK, we still need
> > > > > > > >
> > > > > > > >        gimple *perm
> > > > > > > >             = gimple_build_assign (make_ssa_name (TREE_TYPE (orig[0])),
> > > > > > > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > > > > > > >           orig[0] = gimple_assign_lhs (perm);
> > > > > > > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > > > > > >           gimple_assign_set_rhs_with_ops (gsi,  NOP_EXPR,
> > > > > > > >                                           orig[0],
> > > > > > > >                                           NULL_TREE, NULL_TREE);
> > > > > > > >
> > > > > > > > Otherwise, scalar insert won't work.
> > > > > > > >
> > > > > > > > > (this path should already do the correct thing?).  Note that in all
> > > > > > > > > cases it looks
> > > > > > > > > that with conv_code != ERROR_MARK you may end up doing a float->int
> > > > > > > > > or int->float conversion on a value it wasn't done on before which might
> > > > > > > > > raise exceptions?  That is, do we need to make sure we permute a
> > > > > > > > > value we already do convert into the place we're going to insert to?
> > > > > > > >
> > > > > > > > This couldn't happen:
> > > > > > > >
> > > > > > > >       if (type == TREE_TYPE (ref))
> > > > > > > >          {
> > > > > > > >            /* The RHS vector has the same type as LHS.  */
> > > > > > > >            if (rhs_vector == NULL)
> > > > > > > >              rhs_vector = ref;
> > > > > > > > >            /* Check if all RHS vector elements come from the same
> > > > > > > >               vector.  */
> > > > > > > >            if (rhs_vector == ref)
> > > > > > > >              nvectors++;
> > > > > > > >          }
> > > > > > > > ...
> > > > > > > >   if (insert
> > > > > > > >       && (nvectors == 0
> > > > > > > >           || (TYPE_VECTOR_SUBPARTS (type).to_constant ()
> > > > > > > >               != (nscalars + nvectors))))
> > > > > > > >     return false;
> > > > > >
> > > > > > I see - that looks like a missed case then?
> > > > > >
> > > > > >  { 1., (float)v[1], (float)v[2], (float)v[3] }
> > > > > >
> > > > > > with integer vector v?
> > > > >
> > > > > True.
> > > > >
> > > > > > I'll have a look at the full patch next week (it's GCC 10 material in any case).
> > > > > >
> > > >
> > > > Now looking again.  I still don't like the new "structure" of the loop
> > > > very much.
> > > > A refactoring like the attached should make it easier to clearly separate the
> > > > cases where we reach a vector def and where not.
> > >
> > > Now attached.
> > >
> > > > Do you want me to take over the patch?
> > > >
> >
>
> Here is the updated patch on top of your patch plus my fix.

Thanks - when doing the constant vector handling I was thinking of the
following patch to handle your cases.  It doesn't use insertion itself but
leaves that to a possible separate transform.  Instead it handles a
non-constant similarly to constants, by permuting from a uniform (splat)
vector.  Thus

__attribute__((noinline, noclone))
__v4sf
foo2 (__v4sf x, float f)
{
  __v4sf y = { f, x[1], x[2], x[3] };
  return y;
}

becomes

  _4 = {f_2(D), f_2(D), f_2(D), f_2(D)};
  y_3 = VEC_PERM_EXPR <x_5(D), _4, { 4, 1, 2, 3 }>;
  return y_3;

This allows us to handle an arbitrary number of inserts of this
single value.  It also ensures we can actually perform the
permutation, whereas for the insertion we currently do not have
a convenient way to query whether the target can perform
it efficiently (IIRC x86 needs AVX to insert to arbitrary lanes
with a single instruction?).  Similarly, if the user writes the above
in source using __builtin_shuffle we'd want to optimize it as well.

The patch as attached only passes some of your testcases;
the following FAIL:

FAIL: gcc.target/i386/pr88828-2a.c scan-assembler movss
FAIL: gcc.target/i386/pr88828-2a.c scan-assembler-not movaps
FAIL: gcc.target/i386/pr88828-2a.c scan-assembler-not movlhps
FAIL: gcc.target/i386/pr88828-2a.c scan-assembler-not unpcklps
FAIL: gcc.target/i386/pr88828-2b.c scan-assembler-times vpermilps 1
FAIL: gcc.target/i386/pr88828-2b.c scan-assembler-times vmovss 1
FAIL: gcc.target/i386/pr88828-2c.c scan-assembler movss
FAIL: gcc.target/i386/pr88828-2c.c scan-assembler-not movaps
FAIL: gcc.target/i386/pr88828-2c.c scan-assembler-not movlhps
FAIL: gcc.target/i386/pr88828-2c.c scan-assembler-not unpcklps
FAIL: gcc.target/i386/pr88828-2d.c scan-assembler-times vpermilps 1
FAIL: gcc.target/i386/pr88828-2d.c scan-assembler-times vmovss 1
FAIL: gcc.target/i386/pr88828-3a.c scan-assembler movss
FAIL: gcc.target/i386/pr88828-3a.c scan-assembler-times shufps 2
FAIL: gcc.target/i386/pr88828-3a.c scan-assembler-times movaps 1
FAIL: gcc.target/i386/pr88828-3a.c scan-assembler-not movlhps
FAIL: gcc.target/i386/pr88828-3a.c scan-assembler-not unpcklps
FAIL: gcc.target/i386/pr88828-3b.c scan-assembler-times vpermilps 1
FAIL: gcc.target/i386/pr88828-3b.c scan-assembler-times vinsertps 1
FAIL: gcc.target/i386/pr88828-3b.c scan-assembler-not vshufps
FAIL: gcc.target/i386/pr88828-3c.c scan-assembler-times vpermilps 1
FAIL: gcc.target/i386/pr88828-3c.c scan-assembler-times vinsertps 1
FAIL: gcc.target/i386/pr88828-3c.c scan-assembler-not vshufps

Making the patch emit inserts for single insert locations is of course
still possible, but you then end up with heuristics like your choice
of permuting the original lane into the lane that is later overwritten,
which might make the permute impossible or more expensive?

The original purpose of simplify_vector_constructor was to simplify
the IL, not so much optimal code-generation in the end but I wonder
if we can rely on RTL expansion or later RTL optimization to do
the optimal choices here?

I guess simplify_permutation could turn a VEC_PERM
into an insert if the remaining permutation would be a no-op,
but RTL optimization handles this case well already.

Whether code-generation for a one vector permute plus insert or
a two-vector permute is better in the end I don't know - at least
the permute expansion has a chance to see the combined
instruction.

Do you think the remaining cases above can be handled in the
backend?

Comments?

Thanks,
Richard.

2019-05-08  Richard Biener  <rguenther@suse.de>

        PR tree-optimization/88828
        * tree-ssa-forwprop.c (simplify_vector_constructor): Handle
        permuting in a single non-constant element not extracted
        from a vector.


> --
> H.J.

[-- Attachment #2: fix-pr88828-2 --]
[-- Type: application/octet-stream, Size: 4994 bytes --]

2019-05-08  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/88828
	* tree-ssa-forwprop.c (simplify_vector_constructor): Handle
	permuting in a single non-constant element not extracted
	from a vector.

Index: gcc/tree-ssa-forwprop.c
===================================================================
--- gcc/tree-ssa-forwprop.c	(revision 271001)
+++ gcc/tree-ssa-forwprop.c	(working copy)
@@ -2065,71 +2065,87 @@ simplify_vector_constructor (gimple_stmt
   conv_code = ERROR_MARK;
   maybe_ident = true;
   tree one_constant = NULL_TREE;
+  tree one_nonconstant = NULL_TREE;
   auto_vec<tree> constants;
   constants.safe_grow_cleared (nelts);
   FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, elt)
     {
       tree ref, op1;
+      unsigned int elem;
 
       if (i >= nelts)
 	return false;
 
+      /* Look for elements extracted and possibly converted from
+         another vector.  */
       op1 = get_bit_field_ref_def (elt->value, conv_code);
-      if (op1)
+      if (op1
+	  && TREE_CODE ((ref = TREE_OPERAND (op1, 0))) == SSA_NAME
+	  && VECTOR_TYPE_P (TREE_TYPE (ref))
+	  && useless_type_conversion_p (TREE_TYPE (op1),
+					TREE_TYPE (TREE_TYPE (ref)))
+	  && known_eq (bit_field_size (op1), elem_size)
+	  && constant_multiple_p (bit_field_offset (op1),
+				  elem_size, &elem))
 	{
-	  ref = TREE_OPERAND (op1, 0);
 	  unsigned int j;
 	  for (j = 0; j < 2; ++j)
 	    {
 	      if (!orig[j])
 		{
-		  if (TREE_CODE (ref) != SSA_NAME)
-		    return false;
-		  if (! VECTOR_TYPE_P (TREE_TYPE (ref))
-		      || ! useless_type_conversion_p (TREE_TYPE (op1),
-						      TREE_TYPE (TREE_TYPE (ref))))
-		    return false;
-		  if (j && !useless_type_conversion_p (TREE_TYPE (orig[0]),
-						       TREE_TYPE (ref)))
-		    return false;
-		  orig[j] = ref;
-		  break;
+		  if (j == 0
+		      || useless_type_conversion_p (TREE_TYPE (orig[0]),
+						    TREE_TYPE (ref)))
+		    break;
 		}
 	      else if (ref == orig[j])
 		break;
 	    }
-	  if (j == 2)
-	    return false;
-
-	  unsigned int elt;
-	  if (maybe_ne (bit_field_size (op1), elem_size)
-	      || !constant_multiple_p (bit_field_offset (op1), elem_size, &elt))
-	    return false;
-	  if (j)
-	    elt += nelts;
-	  if (elt != i)
-	    maybe_ident = false;
-	  sel.quick_push (elt);
+	  /* Found a suitable vector element.  */
+	  if (j < 2)
+	    {
+	      orig[j] = ref;
+	      if (j)
+		elem += nelts;
+	      if (elem != i)
+		maybe_ident = false;
+	      sel.quick_push (elem);
+	      continue;
+	    }
+	  /* Else fallthru.  */
 	}
-      else if (CONSTANT_CLASS_P (elt->value))
+      /* Handle elements not extracted from a vector.
+          1. constants by permuting with constant vector
+	  2. a unique non-constant element by permuting with a splat vector  */
+      if (orig[1]
+	  && orig[1] != error_mark_node)
+	return false;
+      orig[1] = error_mark_node;
+      if (CONSTANT_CLASS_P (elt->value))
 	{
-	  if (orig[1]
-	      && orig[1] != error_mark_node)
+	  if (one_nonconstant)
 	    return false;
-	  orig[1] = error_mark_node;
 	  if (!one_constant)
 	    one_constant = elt->value;
 	  constants[i] = elt->value;
-	  sel.quick_push (i + nelts);
-	  maybe_ident = false;
 	}
       else
-	return false;
+	{
+	  if (one_constant)
+	    return false;
+	  if (!one_nonconstant)
+	    one_nonconstant = elt->value;
+	  else if (!operand_equal_p (one_nonconstant, elt->value, 0))
+	    return false;
+	}
+      sel.quick_push (i + nelts);
+      maybe_ident = false;
     }
   if (i < nelts)
     return false;
 
-  if (! VECTOR_TYPE_P (TREE_TYPE (orig[0]))
+  if (! orig[0]
+      || ! VECTOR_TYPE_P (TREE_TYPE (orig[0]))
       || maybe_ne (TYPE_VECTOR_SUBPARTS (type),
 		   TYPE_VECTOR_SUBPARTS (TREE_TYPE (orig[0]))))
     return false;
@@ -2165,9 +2181,19 @@ simplify_vector_constructor (gimple_stmt
 		       GET_MODE_SIZE (TYPE_MODE (type))))
 	return false;
       op2 = vec_perm_indices_to_tree (mask_type, indices);
+      bool convert_orig0 = false;
       if (!orig[1])
 	orig[1] = orig[0];
-      if (orig[1] == error_mark_node)
+      else if (orig[1] == error_mark_node
+	       && one_nonconstant)
+	{
+	  gimple_seq seq = NULL;
+	  orig[1] = gimple_build_vector_from_val (&seq, UNKNOWN_LOCATION,
+						  type, one_nonconstant);
+	  gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
+	  convert_orig0 = true;
+	}
+      else if (orig[1] == error_mark_node)
 	{
 	  tree_vector_builder vec (type, nelts, 1);
 	  for (unsigned i = 0; i < nelts; ++i)
@@ -2177,11 +2203,12 @@ simplify_vector_constructor (gimple_stmt
 	      /* ??? Push a don't-care value.  */
 	      vec.quick_push (one_constant);
 	  orig[1] = vec.build ();
+	  convert_orig0 = true;
 	}
       if (conv_code == ERROR_MARK)
 	gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
 					orig[1], op2);
-      else if (TREE_CODE (orig[1]) == VECTOR_CST)
+      else if (convert_orig0)
 	{
 	  gimple *conv
 	    = gimple_build_assign (make_ssa_name (type), conv_code, orig[0]);

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: V6 [PATCH] Optimize vector constructor
  2019-05-08 12:04                             ` Richard Biener
@ 2019-05-14  9:13                               ` Richard Biener
  0 siblings, 0 replies; 17+ messages in thread
From: Richard Biener @ 2019-05-14  9:13 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Hongtao Liu, Andrew Pinski, GCC Patches

On Wed, May 8, 2019 at 2:04 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Fri, May 3, 2019 at 6:54 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Thu, May 2, 2019 at 10:53 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Thu, May 2, 2019 at 7:55 AM Richard Biener
> > > <richard.guenther@gmail.com> wrote:
> > > >
> > > > On Thu, May 2, 2019 at 4:54 PM Richard Biener
> > > > <richard.guenther@gmail.com> wrote:
> > > > >
> > > > > On Mon, Mar 11, 2019 at 8:03 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > >
> > > > > > On Fri, Mar 8, 2019 at 7:03 PM Richard Biener
> > > > > > <richard.guenther@gmail.com> wrote:
> > > > > > >
> > > > > > > On Fri, Mar 8, 2019 at 9:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > >
> > > > > > > > On Thu, Mar 7, 2019 at 9:51 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > >
> > > > > > > > > On Wed, Mar 6, 2019 at 8:33 PM Richard Biener
> > > > > > > > > <richard.guenther@gmail.com> wrote:
> > > > > > > > > >
> > > > > > > > > > On Wed, Mar 6, 2019 at 8:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > > >
> > > > > > > > > > > On Tue, Mar 5, 2019 at 1:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > > > >
> > > > > > > > > > > > On Mon, Mar 04, 2019 at 12:55:04PM +0100, Richard Biener wrote:
> > > > > > > > > > > > > On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:
> > > > > > > > > > > > > > > On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > For vector init constructor:
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > ---
> > > > > > > > > > > > > > > > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > __v4sf
> > > > > > > > > > > > > > > > foo (__v4sf x, float f)
> > > > > > > > > > > > > > > > {
> > > > > > > > > > > > > > > >   __v4sf y = { f, x[1], x[2], x[3] };
> > > > > > > > > > > > > > > >   return y;
> > > > > > > > > > > > > > > > }
> > > > > > > > > > > > > > > > ---
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > we can optimize vector init constructor with vector copy or permute
> > > > > > > > > > > > > > > > followed by a single scalar insert:
> > > > > > > > > > > >
> > > > > > > > > > > > > and you want to advance to the _1 = BIT_INSERT_EXPR here.  The easiest way
> > > > > > > > > > > > > is to emit a new stmt for _2 = copy ...; and do the set_rhs with the
> > > > > > > > > > > > > BIT_INSERT_EXPR.
> > > > > > > > > > > >
> > > > > > > > > > > > > Thanks for the BIT_INSERT_EXPR suggestion.  I am testing this patch.
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > H.J.
> > > > > > > > > > > > ---
> > > > > > > > > > > > We can optimize vector constructor with vector copy or permute followed
> > > > > > > > > > > > by a single scalar insert:
> > > > > > > > > > > >
> > > > > > > > > > > >   __v4sf y;
> > > > > > > > > > > >   __v4sf D.1930;
> > > > > > > > > > > >   float _1;
> > > > > > > > > > > >   float _2;
> > > > > > > > > > > >   float _3;
> > > > > > > > > > > >
> > > > > > > > > > > >   <bb 2> :
> > > > > > > > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > > > > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > > > > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > > > > > > > >   y_6 = {f_5(D), _3, _2, _1};
> > > > > > > > > > > >   return y_6;
> > > > > > > > > > > >
> > > > > > > > > > > > with
> > > > > > > > > > > >
> > > > > > > > > > > >  __v4sf y;
> > > > > > > > > > > >   __v4sf D.1930;
> > > > > > > > > > > >   float _1;
> > > > > > > > > > > >   float _2;
> > > > > > > > > > > >   float _3;
> > > > > > > > > > > >   vector(4) float _8;
> > > > > > > > > > > >
> > > > > > > > > > > >   <bb 2> :
> > > > > > > > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > > > > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > > > > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > > > > > > > >   _8 = x_9(D);
> > > > > > > > > > > >   y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
> > > > > > > > > > > >   return y_6;
> > > > > > > > > > > >
> > > > > > > > > > > > gcc/
> > > > > > > > > > > >
> > > > > > > > > > > >         PR tree-optimization/88828
> > > > > > > > > > > >         * tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
> > > > > > > > > > > >         vector init constructor with vector copy or permute followed
> > > > > > > > > > > >         by a single scalar insert.
> > > > > > > > > > > >
> > > > > > > > > > > > gcc/testsuite/
> > > > > > > > > > > >
> > > > > > > > > > > >         PR tree-optimization/88828
> > > > > > > > > > > >         * gcc.target/i386/pr88828-1a.c: New test.
> > > > > > > > > > > >         * gcc.target/i386/pr88828-2b.c: Likewise.
> > > > > > > > > > > >         * gcc.target/i386/pr88828-2.c: Likewise.
> > > > > > > > > > > >         * gcc.target/i386/pr88828-3a.c: Likewise.
> > > > > > > > > > > >         * gcc.target/i386/pr88828-3b.c: Likewise.
> > > > > > > > > > > >         * gcc.target/i386/pr88828-3c.c: Likewise.
> > > > > > > > > > > >         * gcc.target/i386/pr88828-3d.c: Likewise.
> > > > > > > > > > > >         * gcc.target/i386/pr88828-4a.c: Likewise.
> > > > > > > > > > > >         * gcc.target/i386/pr88828-4b.c: Likewise.
> > > > > > > > > > > >         * gcc.target/i386/pr88828-5a.c: Likewise.
> > > > > > > > > > > >         * gcc.target/i386/pr88828-5b.c: Likewise.
> > > > > > > > > > > >         * gcc.target/i386/pr88828-6a.c: Likewise.
> > > > > > > > > > > >         * gcc.target/i386/pr88828-6b.c: Likewise.
> > > > > > > > > > >
> > > > > > > > > > > Here is the updated patch with run-time tests.
> > > > > > > > > >
> > > > > > > > > > -      if (TREE_CODE (elt->value) != SSA_NAME)
> > > > > > > > > > +      if (TREE_CODE (ce->value) != SSA_NAME)
> > > > > > > > > >         return false;
> > > > > > > > > >
> > > > > > > > > > hmm, so it doesn't allow { 0, v[1], v[2], v[3] }?  I think the single
> > > > > > > > > > scalar value can be a constant as well.
> > > > > > > > >
> > > > > > > > > Fixed.
> > > > > > > > >
> > > > > > > > > >        if (!def_stmt)
> > > > > > > > > > -       return false;
> > > > > > > > > > +       {
> > > > > > > > > > +         if (gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))
> > > > > > > > > >
> > > > > > > > > > if (SSA_NAME_IS_DEFAULT_DEF (ce->value))
> > > > > > > > > >
> > > > > > > > > > +           {
> > > > > > > > > >
> > > > > > > > > > also you seem to disallow
> > > > > > > > > >
> > > > > > > > > >   { i + 1, v[1], v[2], v[3] }
> > > > > > > > >
> > > > > > > > > Fixed by
> > > > > > > > >
> > > > > > > > >      if (code != BIT_FIELD_REF)
> > > > > > > > >         {
> > > > > > > > >           /* Only allow one scalar insert.  */
> > > > > > > > >           if (nscalars != 0)
> > > > > > > > >             return false;
> > > > > > > > >
> > > > > > > > >           nscalars = 1;
> > > > > > > > >           insert = true;
> > > > > > > > >           scalar_idx = i;
> > > > > > > > >           sel.quick_push (i);
> > > > > > > > >           scalar_element = ce->value;
> > > > > > > > >           continue;
> > > > > > > > >         }
> > > > > > > > >
> > > > > > > > > > because get_prop_source_stmt will return the definition computing
> > > > > > > > > > i + 1 in this case and your code will be skipped?
> > > > > > > > > >
> > > > > > > > > > I think you can simplify the code by treating scalar_element != NULL
> > > > > > > > > > as nscalars == 1 and eliding nscalars.
> > > > > > > > >
> > > > > > > > > It works only if
> > > > > > > > >
> > > > > > > > > TYPE_VECTOR_SUBPARTS (type).to_constant ()  == (nscalars + nvectors)
> > > > > > > > >
> > > > > > > > > > We need to check both nscalars and nvectors.  Eliding the nscalars
> > > > > > > > > > check doesn't help much here.
> > > > > > > > >
> > > > > > > > > > -      if (conv_code == ERROR_MARK)
> > > > > > > > > > +      if (conv_code == ERROR_MARK && !insert)
> > > > > > > > > >         gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
> > > > > > > > > >                                         orig[1], op2);
> > > > > > > > > >        else
> > > > > > > > > > @@ -2148,10 +2198,25 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
> > > > > > > > > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > > > > > > > > >           orig[0] = gimple_assign_lhs (perm);
> > > > > > > > > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > > > > > > > > -         gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
> > > > > > > > > > +         gimple_assign_set_rhs_with_ops (gsi,
> > > > > > > > > > +                                         (conv_code != ERROR_MARK
> > > > > > > > > > +                                          ? conv_code
> > > > > > > > > > +                                          : NOP_EXPR),
> > > > > > > > > > +                                         orig[0],
> > > > > > > > > >                                           NULL_TREE, NULL_TREE);
> > > > > > > > > >
> > > > > > > > > > I believe you should elide the last stmt for conv_code == ERROR_MARK,
> > > > > > > > > > that is, why did you need to add the && !insert check in the guarding condition
> > > > > > > > >
> > > > > > > > > When conv_code == ERROR_MARK, we still need
> > > > > > > > >
> > > > > > > > >        gimple *perm
> > > > > > > > >             = gimple_build_assign (make_ssa_name (TREE_TYPE (orig[0])),
> > > > > > > > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > > > > > > > >           orig[0] = gimple_assign_lhs (perm);
> > > > > > > > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > > > > > > >           gimple_assign_set_rhs_with_ops (gsi,  NOP_EXPR,
> > > > > > > > >                                           orig[0],
> > > > > > > > >                                           NULL_TREE, NULL_TREE);
> > > > > > > > >
> > > > > > > > > Otherwise, scalar insert won't work.
> > > > > > > > >
> > > > > > > > > > (this path should already do the correct thing?).  Note that in all
> > > > > > > > > > cases it looks
> > > > > > > > > > that with conv_code != ERROR_MARK you may end up doing a float->int
> > > > > > > > > > or int->float conversion on a value it wasn't done on before which might
> > > > > > > > > > raise exceptions?  That is, do we need to make sure we permute a
> > > > > > > > > > value we already do convert into the place we're going to insert to?
> > > > > > > > >
> > > > > > > > > This couldn't happen:
> > > > > > > > >
> > > > > > > > >       if (type == TREE_TYPE (ref))
> > > > > > > > >          {
> > > > > > > > >            /* The RHS vector has the same type as LHS.  */
> > > > > > > > >            if (rhs_vector == NULL)
> > > > > > > > >              rhs_vector = ref;
> > > > > > > > > >            /* Check if all RHS vector elements come from the same
> > > > > > > > >               vector.  */
> > > > > > > > >            if (rhs_vector == ref)
> > > > > > > > >              nvectors++;
> > > > > > > > >          }
> > > > > > > > > ...
> > > > > > > > >   if (insert
> > > > > > > > >       && (nvectors == 0
> > > > > > > > >           || (TYPE_VECTOR_SUBPARTS (type).to_constant ()
> > > > > > > > >               != (nscalars + nvectors))))
> > > > > > > > >     return false;
> > > > > > >
> > > > > > > I see - that looks like a missed case then?
> > > > > > >
> > > > > > >  { 1., (float)v[1], (float)v[2], (float)v[3] }
> > > > > > >
> > > > > > > with integer vector v?
> > > > > >
> > > > > > True.
> > > > > >
> > > > > > > I'll have a look at the full patch next week (it's GCC 10 material in any case).
> > > > > > >
> > > > >
> > > > > Now looking again.  I still don't like the new "structure" of the loop
> > > > > very much.
> > > > > A refactoring like the attached should make it easier to clearly separate the
> > > > > cases where we reach a vector def and where not.
> > > >
> > > > Now attached.
> > > >
> > > > > Do you want me to take over the patch?
> > > > >
> > >
> >
> > Here is the updated patch on top of your patch plus my fix.
>
> Thanks - when doing the constant vector I was thinking of the following patch
> to handle your cases.  It doesn't use insertion but eventually leaves that
> to a separate transform.  Instead it handles non-constants similar to constants
> by permuting from a uniform vector.  Thus
>
> __attribute__((noinline, noclone))
> __v4sf
> foo2 (__v4sf x, float f)
> {
>   __v4sf y = { f, x[1], x[2], x[3] };
>   return y;
> }
>
> becomes
>
>   _4 = {f_2(D), f_2(D), f_2(D), f_2(D)};
>   y_3 = VEC_PERM_EXPR <x_5(D), _4, { 4, 1, 2, 3 }>;
>   return y_3;
>
> this allows us to handle an arbitrary number of inserts of this
> single value.  It also ensures we can actually perform the
> permutation while for the insertion we currently do not have
> a convenient way to query whether the target can perform
> it efficiently (IIRC x86 needs AVX to insert to arbitrary lanes
> with a single instruction?).  Similarly if the user writes the above
> in source using __builtin_shuffle we'd want to optimize it as well.
>
> The patch as attached only passes some of your testcases,
> the following FAIL:
>
> FAIL: gcc.target/i386/pr88828-2a.c scan-assembler movss
> FAIL: gcc.target/i386/pr88828-2a.c scan-assembler-not movaps
> FAIL: gcc.target/i386/pr88828-2a.c scan-assembler-not movlhps
> FAIL: gcc.target/i386/pr88828-2a.c scan-assembler-not unpcklps
> FAIL: gcc.target/i386/pr88828-2b.c scan-assembler-times vpermilps 1
> FAIL: gcc.target/i386/pr88828-2b.c scan-assembler-times vmovss 1
> FAIL: gcc.target/i386/pr88828-2c.c scan-assembler movss
> FAIL: gcc.target/i386/pr88828-2c.c scan-assembler-not movaps
> FAIL: gcc.target/i386/pr88828-2c.c scan-assembler-not movlhps
> FAIL: gcc.target/i386/pr88828-2c.c scan-assembler-not unpcklps
> FAIL: gcc.target/i386/pr88828-2d.c scan-assembler-times vpermilps 1
> FAIL: gcc.target/i386/pr88828-2d.c scan-assembler-times vmovss 1
> FAIL: gcc.target/i386/pr88828-3a.c scan-assembler movss
> FAIL: gcc.target/i386/pr88828-3a.c scan-assembler-times shufps 2
> FAIL: gcc.target/i386/pr88828-3a.c scan-assembler-times movaps 1
> FAIL: gcc.target/i386/pr88828-3a.c scan-assembler-not movlhps
> FAIL: gcc.target/i386/pr88828-3a.c scan-assembler-not unpcklps
> FAIL: gcc.target/i386/pr88828-3b.c scan-assembler-times vpermilps 1
> FAIL: gcc.target/i386/pr88828-3b.c scan-assembler-times vinsertps 1
> FAIL: gcc.target/i386/pr88828-3b.c scan-assembler-not vshufps
> FAIL: gcc.target/i386/pr88828-3c.c scan-assembler-times vpermilps 1
> FAIL: gcc.target/i386/pr88828-3c.c scan-assembler-times vinsertps 1
> FAIL: gcc.target/i386/pr88828-3c.c scan-assembler-not vshufps
>
> Making the patch emit inserts for single insert locations is of course
> still possible, but then you end up with heuristics like your choice
> of permuting the original lane into the later-overwritten lane, which
> might make the permute impossible or more expensive?
>
> The original purpose of simplify_vector_constructor was to simplify
> the IL, not so much optimal code-generation in the end but I wonder
> if we can rely on RTL expansion or later RTL optimization to do
> the optimal choices here?
>
> I guess simplify_permutation could perform a VEC_PERM
> into an insert if the remaining permutation would be a no-op
> but RTL optimization handles this case well already.
>
> Whether code-generation for a one vector permute plus insert or
> a two-vector permute is better in the end I don't know - at least
> the permute expansion has a chance to see the combined
> instruction.
>
> Do you think the remaining cases above can be handled in the
> backend?
>
> Comments?

I have now applied this after bootstrap / regtest on x86_64-unknown-linux-gnu
together with the part of the testcases that PASS.  Note I had to
add -fexcess-precision=standard to the -7.c one as it otherwise fails to
execute both patched and unpatched.

r271153.

I didn't check whether the remaining testcases simply need adjustments
(thus their code-gen is OK) or if there's something to do on the target
or in a GIMPLE transform.  That needs to be evaluated still.

There's also still the missed optimization of using VEC_UNPACK/PACK
codes for conversions.

Richard.

> Thanks,
> Richard.
>
> 2019-05-08  Richard Biener  <rguenther@suse.de>
>
>         PR tree-optimization/88828
>         * tree-ssa-forwprop.c (simplify_vector_constructor): Handle
>         permuting in a single non-constant element not extracted
>         from a vector.
>
>
> > --
> > H.J.

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2019-05-14  9:13 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-03-03 14:32 [PATCH] Optimize vector init constructor H.J. Lu
2019-03-03 14:40 ` Andrew Pinski
2019-03-03 21:13   ` H.J. Lu
2019-03-04 11:55     ` Richard Biener
2019-03-04 17:46       ` V2 [PATCH] Optimize vector constructor H.J. Lu
2019-03-06  7:54         ` V3 " H.J. Lu
2019-03-06 13:39           ` Richard Biener
2019-03-07  7:12             ` V4 " H.J. Lu
2019-03-08  9:56               ` V5 " H.J. Lu
2019-03-08 11:23                 ` Richard Biener
2019-03-11  7:58                   ` H.J. Lu
2019-05-02 14:54                     ` Richard Biener
2019-05-02 14:55                       ` Richard Biener
2019-05-02 17:53                         ` H.J. Lu
2019-05-03 16:54                           ` V6 " H.J. Lu
2019-05-08 12:04                             ` Richard Biener
2019-05-14  9:13                               ` Richard Biener

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).