public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [RFC PATCH] SLP vectorize calls
@ 2011-10-20 23:56 Jakub Jelinek
  2011-10-21 12:45 ` Ira Rosen
  0 siblings, 1 reply; 12+ messages in thread
From: Jakub Jelinek @ 2011-10-20 23:56 UTC (permalink / raw)
  To: Ira Rosen, Richard Guenther; +Cc: gcc-patches

Hi!

While looking at *.vect dumps from Polyhedron, I've noticed the lack
of SLP vectorization of builtin calls.

This patch is an attempt to handle at least 1 and 2 operand builtin calls
(SLP doesn't handle ternary stmts either yet), where all the types are the
same.  E.g. it can handle
extern float copysignf (float, float);
extern float sqrtf (float);
float a[8], b[8], c[8], d[8];

void
foo (void)
{
  a[0] = copysignf (b[0], c[0]) + 1.0f + sqrtf (d[0]);
  a[1] = copysignf (b[1], c[1]) + 2.0f + sqrtf (d[1]);
  a[2] = copysignf (b[2], c[2]) + 3.0f + sqrtf (d[2]);
  a[3] = copysignf (b[3], c[3]) + 4.0f + sqrtf (d[3]);
  a[4] = copysignf (b[4], c[4]) + 5.0f + sqrtf (d[4]);
  a[5] = copysignf (b[5], c[5]) + 6.0f + sqrtf (d[5]);
  a[6] = copysignf (b[6], c[6]) + 7.0f + sqrtf (d[6]);
  a[7] = copysignf (b[7], c[7]) + 8.0f + sqrtf (d[7]);
}
and compile it into:
        vmovaps .LC0(%rip), %ymm0
        vandnps b(%rip), %ymm0, %ymm1
        vandps  c(%rip), %ymm0, %ymm0
        vorps   %ymm0, %ymm1, %ymm0
        vsqrtps d(%rip), %ymm1
        vaddps  %ymm1, %ymm0, %ymm0
        vaddps  .LC1(%rip), %ymm0, %ymm0
        vmovaps %ymm0, a(%rip)
I've bootstrapped/regtested it on x86_64-linux and i686-linux, but
am not 100% sure about all the changes, e.g. that
|| PURE_SLP_STMT (stmt_info) part.

2011-10-20  Jakub Jelinek  <jakub@redhat.com>

	* tree-vect-stmts.c (vectorizable_call): Add SLP_NODE argument.
	Handle vectorization of SLP calls.
	(vect_analyze_stmt): Adjust caller, add call to it for SLP too.
	(vect_transform_stmt): Adjust vectorizable_call caller, remove
	assertion.
	* tree-vect-slp.c (vect_get_and_check_slp_defs): Handle one
	and two argument calls too.
	(vect_build_slp_tree): Allow CALL_EXPR.
	(vect_get_slp_defs): Handle calls.

--- gcc/tree-vect-stmts.c.jj	2011-10-20 14:13:34.000000000 +0200
+++ gcc/tree-vect-stmts.c	2011-10-20 18:02:43.000000000 +0200
@@ -1483,7 +1483,8 @@ vectorizable_function (gimple call, tree
    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
 
 static bool
-vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
+vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
+		   slp_tree slp_node)
 {
   tree vec_dest;
   tree scalar_dest;
@@ -1494,6 +1495,7 @@ vectorizable_call (gimple stmt, gimple_s
   int nunits_in;
   int nunits_out;
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
   tree fndecl, new_temp, def, rhs_type;
   gimple def_stmt;
   enum vect_def_type dt[3]
@@ -1505,19 +1507,12 @@ vectorizable_call (gimple stmt, gimple_s
   size_t i, nargs;
   tree lhs;
 
-  /* FORNOW: unsupported in basic block SLP.  */
-  gcc_assert (loop_vinfo);
-
-  if (!STMT_VINFO_RELEVANT_P (stmt_info))
+  if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
     return false;
 
   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
     return false;
 
-  /* FORNOW: SLP not supported.  */
-  if (STMT_SLP_TYPE (stmt_info))
-    return false;
-
   /* Is STMT a vectorizable call?   */
   if (!is_gimple_call (stmt))
     return false;
@@ -1558,7 +1553,7 @@ vectorizable_call (gimple stmt, gimple_s
       if (!rhs_type)
 	rhs_type = TREE_TYPE (op);
 
-      if (!vect_is_simple_use_1 (op, loop_vinfo, NULL,
+      if (!vect_is_simple_use_1 (op, loop_vinfo, bb_vinfo,
 				 &def_stmt, &def, &dt[i], &opvectype))
 	{
 	  if (vect_print_dump_info (REPORT_DETAILS))
@@ -1620,7 +1615,13 @@ vectorizable_call (gimple stmt, gimple_s
 
   gcc_assert (!gimple_vuse (stmt));
 
-  if (modifier == NARROW)
+  if (slp_node || PURE_SLP_STMT (stmt_info))
+    {
+      if (modifier != NONE)
+	return false;
+      ncopies = 1;
+    }
+  else if (modifier == NARROW)
     ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
   else
     ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
@@ -1659,6 +1660,43 @@ vectorizable_call (gimple stmt, gimple_s
 	  else
 	    VEC_truncate (tree, vargs, 0);
 
+	  if (slp_node)
+	    {
+	      VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
+
+	      gcc_assert (j == 0);
+	      if (nargs == 1)
+		vect_get_vec_defs (gimple_call_arg (stmt, 0), NULL_TREE, stmt,
+				   &vec_oprnds0, &vec_oprnds1, slp_node);
+	      else if (nargs == 2)
+		vect_get_vec_defs (gimple_call_arg (stmt, 0),
+				   gimple_call_arg (stmt, 1), stmt,
+				   &vec_oprnds0, &vec_oprnds1, slp_node);
+	      else
+		gcc_unreachable ();
+
+	      /* Arguments are ready.  Create the new vector stmt.  */
+	      FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, vec_oprnd0)
+		{
+		  vec_oprnd1 = nargs == 2 ? VEC_index (tree, vec_oprnds1, i)
+					  : NULL_TREE;
+		  new_stmt = gimple_build_call (fndecl, nargs, vec_oprnd0,
+						vec_oprnd1);
+		  new_temp = make_ssa_name (vec_dest, new_stmt);
+		  gimple_call_set_lhs (new_stmt, new_temp);
+		  vect_finish_stmt_generation (stmt, new_stmt, gsi);
+		  mark_symbols_for_renaming (new_stmt);
+		  VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node),
+				  new_stmt);
+		}
+
+	      VEC_free (tree, heap, vec_oprnds0);
+	      if (vec_oprnds1)
+		VEC_free (tree, heap, vec_oprnds1);
+
+	      continue;
+	    }
+
 	  for (i = 0; i < nargs; i++)
 	    {
 	      op = gimple_call_arg (stmt, i);
@@ -5099,7 +5137,7 @@ vect_analyze_stmt (gimple stmt, bool *ne
             || vectorizable_operation (stmt, NULL, NULL, NULL)
             || vectorizable_assignment (stmt, NULL, NULL, NULL)
             || vectorizable_load (stmt, NULL, NULL, NULL, NULL)
-            || vectorizable_call (stmt, NULL, NULL)
+            || vectorizable_call (stmt, NULL, NULL, NULL)
             || vectorizable_store (stmt, NULL, NULL, NULL)
             || vectorizable_reduction (stmt, NULL, NULL, NULL)
             || vectorizable_condition (stmt, NULL, NULL, NULL, 0));
@@ -5108,10 +5146,11 @@ vect_analyze_stmt (gimple stmt, bool *ne
         if (bb_vinfo)
           ok = (vectorizable_type_promotion (stmt, NULL, NULL, node)
                 || vectorizable_type_demotion (stmt, NULL, NULL, node)
-               || vectorizable_shift (stmt, NULL, NULL, node)
+                || vectorizable_shift (stmt, NULL, NULL, node)
                 || vectorizable_operation (stmt, NULL, NULL, node)
                 || vectorizable_assignment (stmt, NULL, NULL, node)
                 || vectorizable_load (stmt, NULL, NULL, node, NULL)
+                || vectorizable_call (stmt, NULL, NULL, node)
                 || vectorizable_store (stmt, NULL, NULL, node));
       }
 
@@ -5234,8 +5273,7 @@ vect_transform_stmt (gimple stmt, gimple
       break;
 
     case call_vec_info_type:
-      gcc_assert (!slp_node);
-      done = vectorizable_call (stmt, gsi, &vec_stmt);
+      done = vectorizable_call (stmt, gsi, &vec_stmt, slp_node);
       stmt = gsi_stmt (*gsi);
       break;
 
--- gcc/tree-vect-slp.c.jj	2011-10-18 23:52:07.000000000 +0200
+++ gcc/tree-vect-slp.c	2011-10-20 18:06:55.000000000 +0200
@@ -129,12 +129,30 @@ vect_get_and_check_slp_defs (loop_vec_in
   if (loop_vinfo)
     loop = LOOP_VINFO_LOOP (loop_vinfo);
 
-  rhs_class = get_gimple_rhs_class (gimple_assign_rhs_code (stmt));
-  number_of_oprnds = gimple_num_ops (stmt) - 1;	/* RHS only */
+  if (is_gimple_call (stmt))
+    {
+      number_of_oprnds = gimple_call_num_args (stmt);
+      if (number_of_oprnds != 1 && number_of_oprnds != 2)
+	{
+	  if (vect_print_dump_info (REPORT_SLP))
+	    fprintf (vect_dump, "Build SLP failed: calls with %d "
+				"operands unhandled\n", number_of_oprnds);
+	  return false;
+	}
+      rhs_class = number_of_oprnds == 1 ? GIMPLE_UNARY_RHS : GIMPLE_BINARY_RHS;
+    }
+  else
+    {
+      rhs_class = get_gimple_rhs_class (gimple_assign_rhs_code (stmt));
+      number_of_oprnds = gimple_num_ops (stmt) - 1;	/* RHS only */
+    }
 
   for (i = 0; i < number_of_oprnds; i++)
     {
-      oprnd = gimple_op (stmt, i + 1);
+      if (is_gimple_call (stmt))
+	oprnd = gimple_call_arg (stmt, i);
+      else
+	oprnd = gimple_op (stmt, i + 1);
 
       if (!vect_is_simple_use (oprnd, loop_vinfo, bb_vinfo, &def_stmt, &def[i],
                                &dt[i])
@@ -660,7 +678,8 @@ vect_build_slp_tree (loop_vec_info loop_
 
 	  /* Not memory operation.  */
 	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
-	      && TREE_CODE_CLASS (rhs_code) != tcc_unary)
+	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
+	      && rhs_code != CALL_EXPR)
 	    {
 	      if (vect_print_dump_info (REPORT_SLP))
 		{
@@ -2308,9 +2327,19 @@ vect_get_slp_defs (tree op0, tree op1, s
   if (reduc_index != -1)
     return;
 
-  code = gimple_assign_rhs_code (first_stmt);
-  if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS || !vec_oprnds1 || !op1)
+  if (!vec_oprnds1 || !op1)
     return;
+  if (is_gimple_call (first_stmt))
+    {
+      if (gimple_call_num_args (first_stmt) < 2)
+        return;
+    }
+  else
+    {
+      code = gimple_assign_rhs_code (first_stmt);
+      if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
+	return;
+    }
 
   /* The number of vector defs is determined by the number of vector statements
      in the node from which we get those statements.  */

	Jakub

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH] SLP vectorize calls
  2011-10-20 23:56 [RFC PATCH] SLP vectorize calls Jakub Jelinek
@ 2011-10-21 12:45 ` Ira Rosen
  2011-10-21 13:31   ` Jakub Jelinek
  0 siblings, 1 reply; 12+ messages in thread
From: Ira Rosen @ 2011-10-21 12:45 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Richard Guenther, gcc-patches

On 20 October 2011 23:50, Jakub Jelinek <jakub@redhat.com> wrote:
> Hi!

Hi,

>
> While looking at *.vect dumps from Polyhedron, I've noticed the lack
> of SLP vectorization of builtin calls.
>
> This patch is an attempt to handle at least 1 and 2 operand builtin calls
> (SLP doesn't handle ternary stmts either yet),

This is on the top of my todo list :).

> where all the types are the
> same.  E.g. it can handle
> extern float copysignf (float, float);
> extern float sqrtf (float);
> float a[8], b[8], c[8], d[8];
>
> void
> foo (void)
> {
>  a[0] = copysignf (b[0], c[0]) + 1.0f + sqrtf (d[0]);
>  a[1] = copysignf (b[1], c[1]) + 2.0f + sqrtf (d[1]);
>  a[2] = copysignf (b[2], c[2]) + 3.0f + sqrtf (d[2]);
>  a[3] = copysignf (b[3], c[3]) + 4.0f + sqrtf (d[3]);
>  a[4] = copysignf (b[4], c[4]) + 5.0f + sqrtf (d[4]);
>  a[5] = copysignf (b[5], c[5]) + 6.0f + sqrtf (d[5]);
>  a[6] = copysignf (b[6], c[6]) + 7.0f + sqrtf (d[6]);
>  a[7] = copysignf (b[7], c[7]) + 8.0f + sqrtf (d[7]);
> }
> and compile it into:
>        vmovaps .LC0(%rip), %ymm0
>        vandnps b(%rip), %ymm0, %ymm1
>        vandps  c(%rip), %ymm0, %ymm0
>        vorps   %ymm0, %ymm1, %ymm0
>        vsqrtps d(%rip), %ymm1
>        vaddps  %ymm1, %ymm0, %ymm0
>        vaddps  .LC1(%rip), %ymm0, %ymm0
>        vmovaps %ymm0, a(%rip)
> I've bootstrapped/regtested it on x86_64-linux and i686-linux, but
> am not 100% sure about all the changes, e.g. that
> || PURE_SLP_STMT (stmt_info) part.
>
> 2011-10-20  Jakub Jelinek  <jakub@redhat.com>
>
>        * tree-vect-stmts.c (vectorizable_call): Add SLP_NODE argument.
>        Handle vectorization of SLP calls.
>        (vect_analyze_stmt): Adjust caller, add call to it for SLP too.
>        (vect_transform_stmt): Adjust vectorizable_call caller, remove
>        assertion.
>        * tree-vect-slp.c (vect_get_and_check_slp_defs): Handle one
>        and two argument calls too.
>        (vect_build_slp_tree): Allow CALL_EXPR.
>        (vect_get_slp_defs): Handle calls.
>
> --- gcc/tree-vect-stmts.c.jj    2011-10-20 14:13:34.000000000 +0200
> +++ gcc/tree-vect-stmts.c       2011-10-20 18:02:43.000000000 +0200
> @@ -1483,7 +1483,8 @@ vectorizable_function (gimple call, tree
>    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
>
>  static bool
> -vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
> +vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
> +                  slp_tree slp_node)
>  {
>   tree vec_dest;
>   tree scalar_dest;
> @@ -1494,6 +1495,7 @@ vectorizable_call (gimple stmt, gimple_s
>   int nunits_in;
>   int nunits_out;
>   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
> +  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
>   tree fndecl, new_temp, def, rhs_type;
>   gimple def_stmt;
>   enum vect_def_type dt[3]
> @@ -1505,19 +1507,12 @@ vectorizable_call (gimple stmt, gimple_s
>   size_t i, nargs;
>   tree lhs;
>
> -  /* FORNOW: unsupported in basic block SLP.  */
> -  gcc_assert (loop_vinfo);
> -
> -  if (!STMT_VINFO_RELEVANT_P (stmt_info))
> +  if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
>     return false;
>
>   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
>     return false;
>
> -  /* FORNOW: SLP not supported.  */
> -  if (STMT_SLP_TYPE (stmt_info))
> -    return false;
> -
>   /* Is STMT a vectorizable call?   */
>   if (!is_gimple_call (stmt))
>     return false;
> @@ -1558,7 +1553,7 @@ vectorizable_call (gimple stmt, gimple_s
>       if (!rhs_type)
>        rhs_type = TREE_TYPE (op);
>
> -      if (!vect_is_simple_use_1 (op, loop_vinfo, NULL,
> +      if (!vect_is_simple_use_1 (op, loop_vinfo, bb_vinfo,
>                                 &def_stmt, &def, &dt[i], &opvectype))
>        {
>          if (vect_print_dump_info (REPORT_DETAILS))
> @@ -1620,7 +1615,13 @@ vectorizable_call (gimple stmt, gimple_s
>
>   gcc_assert (!gimple_vuse (stmt));
>
> -  if (modifier == NARROW)
> +  if (slp_node || PURE_SLP_STMT (stmt_info))
> +    {
> +      if (modifier != NONE)
> +       return false;
> +      ncopies = 1;
> +    }

If you want to bail out if it's SLP and modifier != NONE, this check
is not enough. PURE_SLP means the stmt is not used outside the SLP
instance, so for hybrid SLP stmts (those that have uses outside SLP)
this check will not work. You need

  if (modifier != NONE && STMT_SLP_TYPE (stmt_info))
     return false;

But I wonder why not allow different type sizes? I see that we fail in
such cases in vectorizable_conversion too, but I think we should
support this as well.

> +  else if (modifier == NARROW)
>     ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
>   else
>     ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
> @@ -1659,6 +1660,43 @@ vectorizable_call (gimple stmt, gimple_s
>          else
>            VEC_truncate (tree, vargs, 0);
>
> +         if (slp_node)
> +           {
> +             VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
> +
> +             gcc_assert (j == 0);
> +             if (nargs == 1)
> +               vect_get_vec_defs (gimple_call_arg (stmt, 0), NULL_TREE, stmt,
> +                                  &vec_oprnds0, &vec_oprnds1, slp_node);
> +             else if (nargs == 2)
> +               vect_get_vec_defs (gimple_call_arg (stmt, 0),
> +                                  gimple_call_arg (stmt, 1), stmt,
> +                                  &vec_oprnds0, &vec_oprnds1, slp_node);
> +             else
> +               gcc_unreachable ();
> +
> +             /* Arguments are ready.  Create the new vector stmt.  */
> +             FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, vec_oprnd0)
> +               {
> +                 vec_oprnd1 = nargs == 2 ? VEC_index (tree, vec_oprnds1, i)
> +                                         : NULL_TREE;
> +                 new_stmt = gimple_build_call (fndecl, nargs, vec_oprnd0,
> +                                               vec_oprnd1);
> +                 new_temp = make_ssa_name (vec_dest, new_stmt);
> +                 gimple_call_set_lhs (new_stmt, new_temp);
> +                 vect_finish_stmt_generation (stmt, new_stmt, gsi);
> +                 mark_symbols_for_renaming (new_stmt);
> +                 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node),
> +                                 new_stmt);
> +               }
> +
> +             VEC_free (tree, heap, vec_oprnds0);
> +             if (vec_oprnds1)
> +               VEC_free (tree, heap, vec_oprnds1);
> +
> +             continue;
> +           }
> +
>          for (i = 0; i < nargs; i++)
>            {
>              op = gimple_call_arg (stmt, i);
> @@ -5099,7 +5137,7 @@ vect_analyze_stmt (gimple stmt, bool *ne
>             || vectorizable_operation (stmt, NULL, NULL, NULL)
>             || vectorizable_assignment (stmt, NULL, NULL, NULL)
>             || vectorizable_load (stmt, NULL, NULL, NULL, NULL)
> -            || vectorizable_call (stmt, NULL, NULL)
> +            || vectorizable_call (stmt, NULL, NULL, NULL)
>             || vectorizable_store (stmt, NULL, NULL, NULL)
>             || vectorizable_reduction (stmt, NULL, NULL, NULL)
>             || vectorizable_condition (stmt, NULL, NULL, NULL, 0));
> @@ -5108,10 +5146,11 @@ vect_analyze_stmt (gimple stmt, bool *ne
>         if (bb_vinfo)
>           ok = (vectorizable_type_promotion (stmt, NULL, NULL, node)
>                 || vectorizable_type_demotion (stmt, NULL, NULL, node)
> -               || vectorizable_shift (stmt, NULL, NULL, node)
> +                || vectorizable_shift (stmt, NULL, NULL, node)
>                 || vectorizable_operation (stmt, NULL, NULL, node)
>                 || vectorizable_assignment (stmt, NULL, NULL, node)
>                 || vectorizable_load (stmt, NULL, NULL, node, NULL)
> +                || vectorizable_call (stmt, NULL, NULL, node)
>                 || vectorizable_store (stmt, NULL, NULL, node));
>       }
>
> @@ -5234,8 +5273,7 @@ vect_transform_stmt (gimple stmt, gimple
>       break;
>
>     case call_vec_info_type:
> -      gcc_assert (!slp_node);
> -      done = vectorizable_call (stmt, gsi, &vec_stmt);
> +      done = vectorizable_call (stmt, gsi, &vec_stmt, slp_node);
>       stmt = gsi_stmt (*gsi);
>       break;
>
> --- gcc/tree-vect-slp.c.jj      2011-10-18 23:52:07.000000000 +0200
> +++ gcc/tree-vect-slp.c 2011-10-20 18:06:55.000000000 +0200
> @@ -129,12 +129,30 @@ vect_get_and_check_slp_defs (loop_vec_in
>   if (loop_vinfo)
>     loop = LOOP_VINFO_LOOP (loop_vinfo);
>
> -  rhs_class = get_gimple_rhs_class (gimple_assign_rhs_code (stmt));
> -  number_of_oprnds = gimple_num_ops (stmt) - 1;        /* RHS only */
> +  if (is_gimple_call (stmt))
> +    {
> +      number_of_oprnds = gimple_call_num_args (stmt);
> +      if (number_of_oprnds != 1 && number_of_oprnds != 2)
> +       {
> +         if (vect_print_dump_info (REPORT_SLP))
> +           fprintf (vect_dump, "Build SLP failed: calls with %d "
> +                               "operands unhandled\n", number_of_oprnds);

No need for the \n.

> +         return false;
> +       }
> +      rhs_class = number_of_oprnds == 1 ? GIMPLE_UNARY_RHS : GIMPLE_BINARY_RHS;
> +    }
> +  else
> +    {
> +      rhs_class = get_gimple_rhs_class (gimple_assign_rhs_code (stmt));
> +      number_of_oprnds = gimple_num_ops (stmt) - 1;    /* RHS only */
> +    }
>
>   for (i = 0; i < number_of_oprnds; i++)
>     {
> -      oprnd = gimple_op (stmt, i + 1);
> +      if (is_gimple_call (stmt))
> +       oprnd = gimple_call_arg (stmt, i);
> +      else
> +       oprnd = gimple_op (stmt, i + 1);
>
>       if (!vect_is_simple_use (oprnd, loop_vinfo, bb_vinfo, &def_stmt, &def[i],
>                                &dt[i])

I think you forgot to check that all the calls are to the same function.

Thanks,
Ira

> @@ -660,7 +678,8 @@ vect_build_slp_tree (loop_vec_info loop_
>
>          /* Not memory operation.  */
>          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
> -             && TREE_CODE_CLASS (rhs_code) != tcc_unary)
> +             && TREE_CODE_CLASS (rhs_code) != tcc_unary
> +             && rhs_code != CALL_EXPR)
>            {
>              if (vect_print_dump_info (REPORT_SLP))
>                {
> @@ -2308,9 +2327,19 @@ vect_get_slp_defs (tree op0, tree op1, s
>   if (reduc_index != -1)
>     return;
>
> -  code = gimple_assign_rhs_code (first_stmt);
> -  if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS || !vec_oprnds1 || !op1)
> +  if (!vec_oprnds1 || !op1)
>     return;
> +  if (is_gimple_call (first_stmt))
> +    {
> +      if (gimple_call_num_args (first_stmt) < 2)
> +        return;
> +    }
> +  else
> +    {
> +      code = gimple_assign_rhs_code (first_stmt);
> +      if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
> +       return;
> +    }
>
>   /* The number of vector defs is determined by the number of vector statements
>      in the node from which we get those statements.  */
>
>        Jakub
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH] SLP vectorize calls
  2011-10-21 12:45 ` Ira Rosen
@ 2011-10-21 13:31   ` Jakub Jelinek
  2011-10-21 14:26     ` Ira Rosen
  0 siblings, 1 reply; 12+ messages in thread
From: Jakub Jelinek @ 2011-10-21 13:31 UTC (permalink / raw)
  To: Ira Rosen; +Cc: Richard Guenther, gcc-patches

On Fri, Oct 21, 2011 at 02:37:06PM +0200, Ira Rosen wrote:
> > @@ -1620,7 +1615,13 @@ vectorizable_call (gimple stmt, gimple_s
> >
> >   gcc_assert (!gimple_vuse (stmt));
> >
> > -  if (modifier == NARROW)
> > +  if (slp_node || PURE_SLP_STMT (stmt_info))
> > +    {
> > +      if (modifier != NONE)
> > +       return false;
> > +      ncopies = 1;
> > +    }
> 
> If you want to bail out if it's SLP and modifier != NONE, this check
> is not enough. PURE_SLP means the stmt is not used outside the SLP
> instance, so for hybrid SLP stmts (those that have uses outside SLP)
> this check will not work. You need
> 
>   if (modifier != NONE && STMT_SLP_TYPE (stmt_info))
>      return false;

I just blindly copied what vectorizable_operation does, without
too much understanding what PURE_SLP_STMT or STMT_SLP_TYPE etc. mean.
Didn't get that far.
But modifier != NONE && something would sometimes allow modifier != NONE
through, which at least the current code isn't prepared to handle.
Did you mean || instead?

> But I wonder why not allow different type sizes? I see that we fail in
> such cases in vectorizable_conversion too, but I think we should
> support this as well.

Merely because I don't know SLP well enough; vectorizable_operation also
handles just same size to same size, so I didn't have good examples
on how to do it.  For loops narrowing or widening operations are
handled through ncopies != 1, but for SLP it seems it is always
asserted it is 1...

> No need in \n.

Ok.

> >   for (i = 0; i < number_of_oprnds; i++)
> >     {
> > -      oprnd = gimple_op (stmt, i + 1);
> > +      if (is_gimple_call (stmt))
> > +       oprnd = gimple_call_arg (stmt, i);
> > +      else
> > +       oprnd = gimple_op (stmt, i + 1);
> >
> >       if (!vect_is_simple_use (oprnd, loop_vinfo, bb_vinfo, &def_stmt, &def[i],
> >                                &dt[i])
> 
> I think you forgot to check that all the calls are to the same function.

Right, that is easy to add, but modifier != NONE is something I have no idea
how to do currently.

	Jakub

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH] SLP vectorize calls
  2011-10-21 13:31   ` Jakub Jelinek
@ 2011-10-21 14:26     ` Ira Rosen
  2011-10-21 14:42       ` Jakub Jelinek
  2011-11-07 18:44       ` [PATCH] SLP vectorize calls (take 2) Jakub Jelinek
  0 siblings, 2 replies; 12+ messages in thread
From: Ira Rosen @ 2011-10-21 14:26 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Richard Guenther, gcc-patches

On 21 October 2011 14:52, Jakub Jelinek <jakub@redhat.com> wrote:
> On Fri, Oct 21, 2011 at 02:37:06PM +0200, Ira Rosen wrote:
>> > @@ -1620,7 +1615,13 @@ vectorizable_call (gimple stmt, gimple_s
>> >
>> >   gcc_assert (!gimple_vuse (stmt));
>> >
>> > -  if (modifier == NARROW)
>> > +  if (slp_node || PURE_SLP_STMT (stmt_info))
>> > +    {
>> > +      if (modifier != NONE)
>> > +       return false;
>> > +      ncopies = 1;
>> > +    }
>>
>> If you want to bail out if it's SLP and modifier != NONE, this check
>> is not enough. PURE_SLP means the stmt is not used outside the SLP
>> instance, so for hybrid SLP stmts (those that have uses outside SLP)
>> this check will not work. You need
>>
>>   if (modifier != NONE && STMT_SLP_TYPE (stmt_info))
>>      return false;
>
> I just blindly copied what vectorizable_operation does, without
> too much understanding what PURE_SLP_STMT or STMT_SLP_TYPE etc. mean.
> Didn't get that far.
> But modifier != NONE && something would sometimes allow modifier != NONE
> through, which at least the current code isn't prepared to handle.
> Did you mean || instead?

But it's OK to allow modifier != NONE if it's not SLP, so we need &&, no?
Something like:

if (modifier != NONE && STMT_SLP_TYPE (stmt_info))
   return false;

if (slp_node || PURE_SLP_STMT (stmt_info))
   ncopies = 1;
else if (modifier == NARROW)
   ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
else
   ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;

>
>> But I wonder why not allow different type sizes? I see that we fail in
>> such cases in vectorizable_conversion too, but I think we should
>> support this as well.
>
> Merely because I don't know SLP enough, vectorizable_operation also
> handles just same size to same size, so I didn't have good examples
> on how to do it.  For loops narrowing or widening operations are
> handled through ncopies != 1, but for SLP it seems it is always
> asserted it is 1...

There are vectorizable_type_promotion/demotion, and for the rest the
copies are "hidden" inside multiple vector operands that you get from
vect_get_vec_defs. But, of course, there is no need to handle
modifier == NARROW for SLP at the moment. I was just wondering out
loud.

Ira

>
>        Jakub
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH] SLP vectorize calls
  2011-10-21 14:26     ` Ira Rosen
@ 2011-10-21 14:42       ` Jakub Jelinek
  2011-10-21 15:51         ` Ira Rosen
  2011-11-07 18:44       ` [PATCH] SLP vectorize calls (take 2) Jakub Jelinek
  1 sibling, 1 reply; 12+ messages in thread
From: Jakub Jelinek @ 2011-10-21 14:42 UTC (permalink / raw)
  To: Ira Rosen; +Cc: Richard Guenther, gcc-patches

On Fri, Oct 21, 2011 at 03:44:11PM +0200, Ira Rosen wrote:
> But it's OK to allow modifier != NONE if it's not SLP, so we need &&, no?

Well, in my patch that check was guarded by the if (slp_node ...),
so presumably it would allow modifier == NARROW vectorization in the loops
(otherwise some testcases would fail I'd hope).

Is gcc_assert ((slp_node != NULL) == (STMT_SLP_TYPE (stmt_info) != 0));
always true?  If not, when it is not?  When would be slp_node == NULL
and PURE_SLP_STMT true?

	Jakub

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH] SLP vectorize calls
  2011-10-21 14:42       ` Jakub Jelinek
@ 2011-10-21 15:51         ` Ira Rosen
  0 siblings, 0 replies; 12+ messages in thread
From: Ira Rosen @ 2011-10-21 15:51 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Richard Guenther, gcc-patches

On 21 October 2011 16:25, Jakub Jelinek <jakub@redhat.com> wrote:
> On Fri, Oct 21, 2011 at 03:44:11PM +0200, Ira Rosen wrote:
>> But it's OK to allow modifier != NONE if it's not SLP, so we need &&, no?
>
> Well, in my patch that check was guarded by the if (slp_node ...),
> so presumably it would allow modifier == NARROW vectorization in the loops
> (otherwise some testcases would fail I'd hope).

The problem with that is that slp_node can be NULL but it can still be
an SLP stmt (as you probably have guessed judging by the following
questions ;))

>
> Is gcc_assert ((slp_node != NULL) == (STMT_SLP_TYPE (stmt_info) != 0));
> always true?

No.

> If not, when it is not?

STMT_SLP_TYPE (stmt_info) != 0 may mean HYBRID_SLP_STMT, meaning that
we are vectorizing the stmt both as SLP and as regular loop
vectorization. So during the regular loop transformation of a hybrid stmt,
(STMT_SLP_TYPE (stmt_info) != 0) doesn't entail (slp_node != NULL).

The other direction is always true.

> When would be slp_node == NULL
> and PURE_SLP_STMT true?

In the analysis of loop SLP. In loop SLP we analyze all the stmts of
the loop in their original order (and not as in BB SLP where we just
analyze SLP nodes). A stmt can belong to more than one SLP node, and
we may also need to vectorize it in a regular loop-vectorization way
at the same time. So, during the analysis we don't have stmt's SLP
node. (Note that during the analysis we need to know ncopies only to
verify that the operation is supported and for cost estimation).
And this is another case when 'if (STMT_SLP_TYPE (stmt_info) != 0)
then (slp_node != NULL)' is false.

I hope this makes sense.
Ira

>
>        Jakub
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] SLP vectorize calls (take 2)
  2011-10-21 14:26     ` Ira Rosen
  2011-10-21 14:42       ` Jakub Jelinek
@ 2011-11-07 18:44       ` Jakub Jelinek
  2011-11-08  8:00         ` Ira Rosen
  1 sibling, 1 reply; 12+ messages in thread
From: Jakub Jelinek @ 2011-11-07 18:44 UTC (permalink / raw)
  To: Ira Rosen, Richard Guenther; +Cc: gcc-patches

Hi!

On Fri, Oct 21, 2011 at 03:44:11PM +0200, Ira Rosen wrote:
> There are vectorizable_type_promotion/demotion, and for the rest the
> copies are "hidden" inside multiple vector operands that you get from
> vect_get_vec_defs. But, of course, there is not need to handle
> modifier == NARROW for SLP at the moment. I was just wondering out
> loud.

Here is an updated patch, which handles both modifier == NONE
and modifier == NARROW for SLP, after all it wasn't that hard.
Additionally it checks that the fndecls and various call flags
match, and adds some testcases.

Bootstrapped/regtested on x86_64-linux and i686-linux,
ok for trunk?

2011-11-07  Jakub Jelinek  <jakub@redhat.com>

	* tree-vect-stmts.c (vectorizable_call): Add SLP_NODE argument.
	Handle vectorization of SLP calls.
	(vect_analyze_stmt): Adjust caller, add call to it for SLP too.
	(vect_transform_stmt): Adjust vectorizable_call caller, remove
	assertion.
	* tree-vect-slp.c (vect_get_and_check_slp_defs): For calls start
	with op_idx 3.
	(vect_build_slp_tree): Allow CALL_EXPR.

	* gcc.dg/vect/fast-math-vect-call-1.c: New test.
	* gcc.dg/vect/fast-math-vect-call-2.c: New test.

--- gcc/tree-vect-slp.c.jj	2011-11-07 12:40:56.000000000 +0100
+++ gcc/tree-vect-slp.c	2011-11-07 12:45:06.000000000 +0100
@@ -202,7 +202,10 @@ vect_get_and_check_slp_defs (loop_vec_in
     loop = LOOP_VINFO_LOOP (loop_vinfo);
 
   if (is_gimple_call (stmt))
-    number_of_oprnds = gimple_call_num_args (stmt);
+    {
+      number_of_oprnds = gimple_call_num_args (stmt);
+      op_idx = 3;
+    }
   else if (is_gimple_assign (stmt))
     {
       number_of_oprnds = gimple_num_ops (stmt) - 1;
@@ -558,7 +561,25 @@ vect_build_slp_tree (loop_vec_info loop_
       ncopies = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
 
       if (is_gimple_call (stmt))
-	rhs_code = CALL_EXPR;
+	{
+	  rhs_code = CALL_EXPR;
+	  if (gimple_call_internal_p (stmt)
+	      || gimple_call_tail_p (stmt)
+	      || gimple_call_noreturn_p (stmt)
+	      || !gimple_call_nothrow_p (stmt)
+	      || gimple_call_chain (stmt))
+	    {
+	      if (vect_print_dump_info (REPORT_SLP))
+		{
+		  fprintf (vect_dump,
+			   "Build SLP failed: unsupported call type ");
+		  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+		}
+
+	      vect_free_oprnd_info (&oprnds_info, true);
+	      return false;
+	    }
+	}
       else
 	rhs_code = gimple_assign_rhs_code (stmt);
 
@@ -653,6 +674,27 @@ vect_build_slp_tree (loop_vec_info loop_
 	      vect_free_oprnd_info (&oprnds_info, true);
 	      return false;
 	    }
+
+	  if (rhs_code == CALL_EXPR)
+	    {
+	      gimple first_stmt = VEC_index (gimple, stmts, 0);
+	      if (gimple_call_num_args (stmt) != nops
+		  || !operand_equal_p (gimple_call_fn (first_stmt),
+				       gimple_call_fn (stmt), 0)
+		  || gimple_call_fntype (first_stmt)
+		     != gimple_call_fntype (stmt))
+		{
+		  if (vect_print_dump_info (REPORT_SLP))
+		    {
+		      fprintf (vect_dump,
+			       "Build SLP failed: different calls in ");
+		      print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+		    }
+
+		  vect_free_oprnd_info (&oprnds_info, true);
+		  return false;
+		}
+	    }
 	}
 
       /* Strided store or load.  */
@@ -786,7 +828,8 @@ vect_build_slp_tree (loop_vec_info loop_
 	  /* Not memory operation.  */
 	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
 	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
-              && rhs_code != COND_EXPR)
+	      && rhs_code != COND_EXPR
+	      && rhs_code != CALL_EXPR)
 	    {
 	      if (vect_print_dump_info (REPORT_SLP))
 		{
--- gcc/tree-vect-stmts.c.jj	2011-11-07 12:40:56.000000000 +0100
+++ gcc/tree-vect-stmts.c	2011-11-07 14:39:54.000000000 +0100
@@ -1505,7 +1505,8 @@ vectorizable_function (gimple call, tree
    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
 
 static bool
-vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
+vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
+		   slp_tree slp_node)
 {
   tree vec_dest;
   tree scalar_dest;
@@ -1516,6 +1517,7 @@ vectorizable_call (gimple stmt, gimple_s
   int nunits_in;
   int nunits_out;
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
   tree fndecl, new_temp, def, rhs_type;
   gimple def_stmt;
   enum vect_def_type dt[3]
@@ -1527,19 +1529,12 @@ vectorizable_call (gimple stmt, gimple_s
   size_t i, nargs;
   tree lhs;
 
-  /* FORNOW: unsupported in basic block SLP.  */
-  gcc_assert (loop_vinfo);
-
-  if (!STMT_VINFO_RELEVANT_P (stmt_info))
+  if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
     return false;
 
   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
     return false;
 
-  /* FORNOW: SLP not supported.  */
-  if (STMT_SLP_TYPE (stmt_info))
-    return false;
-
   /* Is STMT a vectorizable call?   */
   if (!is_gimple_call (stmt))
     return false;
@@ -1580,7 +1575,7 @@ vectorizable_call (gimple stmt, gimple_s
       if (!rhs_type)
 	rhs_type = TREE_TYPE (op);
 
-      if (!vect_is_simple_use_1 (op, loop_vinfo, NULL,
+      if (!vect_is_simple_use_1 (op, loop_vinfo, bb_vinfo,
 				 &def_stmt, &def, &dt[i], &opvectype))
 	{
 	  if (vect_print_dump_info (REPORT_DETAILS))
@@ -1642,7 +1637,9 @@ vectorizable_call (gimple stmt, gimple_s
 
   gcc_assert (!gimple_vuse (stmt));
 
-  if (modifier == NARROW)
+  if (slp_node || PURE_SLP_STMT (stmt_info))
+    ncopies = 1;
+  else if (modifier == NARROW)
     ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
   else
     ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
@@ -1681,6 +1678,50 @@ vectorizable_call (gimple stmt, gimple_s
 	  else
 	    VEC_truncate (tree, vargs, 0);
 
+	  if (slp_node)
+	    {
+	      VEC (slp_void_p, heap) *vec_defs
+		= VEC_alloc (slp_void_p, heap, nargs);
+	      VEC (tree, heap) *vec_oprnds0;
+
+	      for (i = 0; i < nargs; i++)
+		VEC_quick_push (tree, vargs, gimple_call_arg (stmt, i));
+	      vect_get_slp_defs (vargs, slp_node, &vec_defs, -1);
+	      vec_oprnds0
+		= (VEC (tree, heap) *) VEC_index (slp_void_p, vec_defs, 0);
+
+	      /* Arguments are ready.  Create the new vector stmt.  */
+	      FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, vec_oprnd0)
+		{
+		  size_t k;
+		  for (k = 0; k < nargs; k++)
+		    {
+		      VEC (tree, heap) *vec_oprndsk
+			= (VEC (tree, heap) *)
+			  VEC_index (slp_void_p, vec_defs, k);
+		      VEC_replace (tree, vargs, k,
+				   VEC_index (tree, vec_oprndsk, i));
+		    }
+		  new_stmt = gimple_build_call_vec (fndecl, vargs);
+		  new_temp = make_ssa_name (vec_dest, new_stmt);
+		  gimple_call_set_lhs (new_stmt, new_temp);
+		  vect_finish_stmt_generation (stmt, new_stmt, gsi);
+		  mark_symbols_for_renaming (new_stmt);
+		  VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node),
+				  new_stmt);
+		}
+
+	      for (i = 0; i < nargs; i++)
+		{
+		  VEC (tree, heap) *vec_oprndsi
+		    = (VEC (tree, heap) *)
+		      VEC_index (slp_void_p, vec_defs, i);
+		  VEC_free (tree, heap, vec_oprndsi);
+		}
+	      VEC_free (slp_void_p, heap, vec_defs);
+	      continue;
+	    }
+
 	  for (i = 0; i < nargs; i++)
 	    {
 	      op = gimple_call_arg (stmt, i);
@@ -1723,6 +1764,55 @@ vectorizable_call (gimple stmt, gimple_s
 	  else
 	    VEC_truncate (tree, vargs, 0);
 
+	  if (slp_node)
+	    {
+	      VEC (slp_void_p, heap) *vec_defs
+		= VEC_alloc (slp_void_p, heap, nargs);
+	      VEC (tree, heap) *vec_oprnds0;
+
+	      for (i = 0; i < nargs; i++)
+		VEC_quick_push (tree, vargs, gimple_call_arg (stmt, i));
+	      vect_get_slp_defs (vargs, slp_node, &vec_defs, -1);
+	      vec_oprnds0
+		= (VEC (tree, heap) *) VEC_index (slp_void_p, vec_defs, 0);
+
+	      /* Arguments are ready.  Create the new vector stmt.  */
+	      FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, vec_oprnd0)
+	      for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vec_oprnd0);
+		   i += 2)
+		{
+		  size_t k;
+		  VEC_truncate (tree, vargs, 0);
+		  for (k = 0; k < nargs; k++)
+		    {
+		      VEC (tree, heap) *vec_oprndsk
+			= (VEC (tree, heap) *)
+			  VEC_index (slp_void_p, vec_defs, k);
+		      VEC_quick_push (tree, vargs,
+				      VEC_index (tree, vec_oprndsk, i));
+		      VEC_quick_push (tree, vargs,
+				      VEC_index (tree, vec_oprndsk, i + 1));
+		    }
+		  new_stmt = gimple_build_call_vec (fndecl, vargs);
+		  new_temp = make_ssa_name (vec_dest, new_stmt);
+		  gimple_call_set_lhs (new_stmt, new_temp);
+		  vect_finish_stmt_generation (stmt, new_stmt, gsi);
+		  mark_symbols_for_renaming (new_stmt);
+		  VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node),
+				  new_stmt);
+		}
+
+	      for (i = 0; i < nargs; i++)
+		{
+		  VEC (tree, heap) *vec_oprndsi
+		    = (VEC (tree, heap) *)
+		      VEC_index (slp_void_p, vec_defs, i);
+		  VEC_free (tree, heap, vec_oprndsi);
+		}
+	      VEC_free (slp_void_p, heap, vec_defs);
+	      continue;
+	    }
+
 	  for (i = 0; i < nargs; i++)
 	    {
 	      op = gimple_call_arg (stmt, i);
@@ -1788,7 +1878,8 @@ vectorizable_call (gimple stmt, gimple_s
     lhs = gimple_call_lhs (stmt);
   new_stmt = gimple_build_assign (lhs, build_zero_cst (type));
   set_vinfo_for_stmt (new_stmt, stmt_info);
-  set_vinfo_for_stmt (stmt, NULL);
+  if (!slp_node)
+    set_vinfo_for_stmt (stmt, NULL);
   STMT_VINFO_STMT (stmt_info) = new_stmt;
   gsi_replace (gsi, new_stmt, false);
   SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
@@ -5058,7 +5149,7 @@ vect_analyze_stmt (gimple stmt, bool *ne
             || vectorizable_operation (stmt, NULL, NULL, NULL)
             || vectorizable_assignment (stmt, NULL, NULL, NULL)
             || vectorizable_load (stmt, NULL, NULL, NULL, NULL)
-            || vectorizable_call (stmt, NULL, NULL)
+	    || vectorizable_call (stmt, NULL, NULL, NULL)
             || vectorizable_store (stmt, NULL, NULL, NULL)
             || vectorizable_reduction (stmt, NULL, NULL, NULL)
             || vectorizable_condition (stmt, NULL, NULL, NULL, 0, NULL));
@@ -5070,6 +5161,7 @@ vect_analyze_stmt (gimple stmt, bool *ne
                 || vectorizable_operation (stmt, NULL, NULL, node)
                 || vectorizable_assignment (stmt, NULL, NULL, node)
                 || vectorizable_load (stmt, NULL, NULL, node, NULL)
+		|| vectorizable_call (stmt, NULL, NULL, node)
                 || vectorizable_store (stmt, NULL, NULL, node)
                 || vectorizable_condition (stmt, NULL, NULL, NULL, 0, node));
       }
@@ -5184,8 +5276,7 @@ vect_transform_stmt (gimple stmt, gimple
       break;
 
     case call_vec_info_type:
-      gcc_assert (!slp_node);
-      done = vectorizable_call (stmt, gsi, &vec_stmt);
+      done = vectorizable_call (stmt, gsi, &vec_stmt, slp_node);
       stmt = gsi_stmt (*gsi);
       break;
 
--- gcc/testsuite/gcc.dg/vect/fast-math-vect-call-1.c.jj	2011-11-07 15:05:36.000000000 +0100
+++ gcc/testsuite/gcc.dg/vect/fast-math-vect-call-1.c	2011-11-07 15:07:10.000000000 +0100
@@ -0,0 +1,100 @@
+/* { dg-do run } */
+
+#include "tree-vect.h"
+
+extern float copysignf (float, float);
+extern float sqrtf (float);
+extern float fabsf (float);
+extern void abort (void);
+float a[64], b[64], c[64], d[64];
+
+__attribute__((noinline, noclone)) void
+f1 (void)
+{
+  a[0] = copysignf (b[0], c[0]) + 1.0f + sqrtf (d[0]);
+  a[1] = copysignf (b[1], c[1]) + 2.0f + sqrtf (d[1]);
+  a[2] = copysignf (b[2], c[2]) + 3.0f + sqrtf (d[2]);
+  a[3] = copysignf (b[3], c[3]) + 4.0f + sqrtf (d[3]);
+  a[4] = copysignf (b[4], c[4]) + 5.0f + sqrtf (d[4]);
+  a[5] = copysignf (b[5], c[5]) + 6.0f + sqrtf (d[5]);
+  a[6] = copysignf (b[6], c[6]) + 7.0f + sqrtf (d[6]);
+  a[7] = copysignf (b[7], c[7]) + 8.0f + sqrtf (d[7]);
+}
+
+__attribute__((noinline, noclone)) void
+f2 (int n)
+{
+  int i;
+  for (i = 0; i < n; i++)
+    {
+      a[4 * i + 0] = copysignf (b[4 * i + 0], c[4 * i + 0]) + 1.0f + sqrtf (d[4 * i + 0]);
+      a[4 * i + 1] = copysignf (b[4 * i + 1], c[4 * i + 1]) + 2.0f + sqrtf (d[4 * i + 1]);
+      a[4 * i + 2] = copysignf (b[4 * i + 2], c[4 * i + 2]) + 3.0f + sqrtf (d[4 * i + 2]);
+      a[4 * i + 3] = copysignf (b[4 * i + 3], c[4 * i + 3]) + 4.0f + sqrtf (d[4 * i + 3]);
+    }
+}
+
+__attribute__((noinline, noclone)) void
+f3 (int n)
+{
+  int i;
+  for (i = 0; i < 2 * n; i++)
+    {
+      a[2 * i + 0] = copysignf (b[2 * i + 0], c[2 * i + 0]) + 1.0f + sqrtf (d[2 * i + 0]);
+      a[2 * i + 1] = copysignf (b[2 * i + 1], c[2 * i + 1]) + 2.0f + sqrtf (d[2 * i + 1]);
+    }
+}
+
+__attribute__((noinline, noclone)) void
+f4 (void)
+{
+  int i;
+  for (i = 0; i < 64; i++)
+    a[i] = copysignf (b[i], c[i]) + 1.0f + sqrtf (d[i]);
+}
+
+__attribute__((noinline, noclone)) int
+main1 ()
+{
+  int i;
+
+  for (i = 0; i < 64; i++)
+    {
+      asm ("");
+      b[i] = (i & 1) ? -4 * i : 4 * i;
+      c[i] = (i & 2) ? -8 * i : 8 * i;
+      d[i] = i * i;
+    }
+  f1 ();
+  for (i = 0; i < 8; i++)
+    if (fabsf (((i & 2) ? -4 * i : 4 * i) + 1 + i + i - a[i]) >= 0.0001f)
+      abort ();
+    else
+      a[i] = 131.25;
+  f2 (16);
+  for (i = 0; i < 64; i++)
+    if (fabsf (((i & 2) ? -4 * i : 4 * i) + 1 + (i & 3) + i - a[i]) >= 0.0001f)
+      abort ();
+    else
+      a[i] = 131.25;
+  f3 (16);
+  for (i = 0; i < 64; i++)
+    if (fabsf (((i & 2) ? -4 * i : 4 * i) + 1 + (i & 1) + i - a[i]) >= 0.0001f)
+      abort ();
+    else
+      a[i] = 131.25;
+  f4 ();
+  for (i = 0; i < 64; i++)
+    if (fabsf (((i & 2) ? -4 * i : 4 * i) + 1 + i - a[i]) >= 0.0001f)
+      abort ();
+  return 0;
+}
+
+int
+main ()
+{
+  check_vect ();
+  return main1 ();
+}
+
+/* { dg-final { cleanup-tree-dump "vect" } } */
--- gcc/testsuite/gcc.dg/vect/fast-math-vect-call-2.c.jj	2011-11-07 15:09:00.000000000 +0100
+++ gcc/testsuite/gcc.dg/vect/fast-math-vect-call-2.c	2011-11-07 15:11:58.000000000 +0100
@@ -0,0 +1,166 @@
+/* { dg-do run } */
+
+#include "tree-vect.h"
+
+extern long int lrint (double);
+extern void abort (void);
+long int a[64];
+double b[64];
+
+__attribute__((noinline, noclone)) void
+f1 (void)
+{
+  a[0] = lrint (b[0]) + 1;
+  a[1] = lrint (b[1]) + 2;
+  a[2] = lrint (b[2]) + 3;
+  a[3] = lrint (b[3]) + 4;
+  a[4] = lrint (b[4]) + 5;
+  a[5] = lrint (b[5]) + 6;
+  a[6] = lrint (b[6]) + 7;
+  a[7] = lrint (b[7]) + 8;
+}
+
+__attribute__((noinline, noclone)) void
+f2 (int n)
+{
+  int i;
+  for (i = 0; i < n; i++)
+    {
+      a[4 * i + 0] = lrint (b[4 * i + 0]) + 1;
+      a[4 * i + 1] = lrint (b[4 * i + 1]) + 2;
+      a[4 * i + 2] = lrint (b[4 * i + 2]) + 3;
+      a[4 * i + 3] = lrint (b[4 * i + 3]) + 4;
+    }
+}
+
+__attribute__((noinline, noclone)) void
+f3 (int n)
+{
+  int i;
+  for (i = 0; i < 2 * n; i++)
+    {
+      a[2 * i + 0] = lrint (b[2 * i + 0]) + 1;
+      a[2 * i + 1] = lrint (b[2 * i + 1]) + 2;
+    }
+}
+
+__attribute__((noinline, noclone)) void
+f4 (void)
+{
+  int i;
+  for (i = 0; i < 64; i++)
+    a[i] = lrint (b[i]) + 1;
+}
+
+__attribute__((noinline, noclone)) void
+f5 (void)
+{
+  a[0] = lrint (b[0]);
+  a[1] = lrint (b[1]);
+  a[2] = lrint (b[2]);
+  a[3] = lrint (b[3]);
+  a[4] = lrint (b[4]);
+  a[5] = lrint (b[5]);
+  a[6] = lrint (b[6]);
+  a[7] = lrint (b[7]);
+}
+
+__attribute__((noinline, noclone)) void
+f6 (int n)
+{
+  int i;
+  for (i = 0; i < n; i++)
+    {
+      a[4 * i + 0] = lrint (b[4 * i + 0]);
+      a[4 * i + 1] = lrint (b[4 * i + 1]);
+      a[4 * i + 2] = lrint (b[4 * i + 2]);
+      a[4 * i + 3] = lrint (b[4 * i + 3]);
+    }
+}
+
+__attribute__((noinline, noclone)) void
+f7 (int n)
+{
+  int i;
+  for (i = 0; i < 2 * n; i++)
+    {
+      a[2 * i + 0] = lrint (b[2 * i + 0]);
+      a[2 * i + 1] = lrint (b[2 * i + 1]);
+    }
+}
+
+__attribute__((noinline, noclone)) void
+f8 (void)
+{
+  int i;
+  for (i = 0; i < 64; i++)
+    a[i] = lrint (b[i]);
+}
+
+__attribute__((noinline, noclone)) int
+main1 ()
+{
+  int i;
+
+  for (i = 0; i < 64; i++)
+    {
+      asm ("");
+      b[i] = ((i & 1) ? -4 * i : 4 * i) + 0.25;
+    }
+  f1 ();
+  for (i = 0; i < 8; i++)
+    if (a[i] != ((i & 1) ? -4 * i : 4 * i) + 1 + i)
+      abort ();
+    else
+      a[i] = 131.25;
+  f2 (16);
+  for (i = 0; i < 64; i++)
+    if (a[i] != ((i & 1) ? -4 * i : 4 * i) + 1 + (i & 3))
+      abort ();
+    else
+      a[i] = 131.25;
+  f3 (16);
+  for (i = 0; i < 64; i++)
+    if (a[i] != ((i & 1) ? -4 * i : 4 * i) + 1 + (i & 1))
+      abort ();
+    else
+      a[i] = 131.25;
+  f4 ();
+  for (i = 0; i < 64; i++)
+    if (a[i] != ((i & 1) ? -4 * i : 4 * i) + 1)
+      abort ();
+    else
+      a[i] = 131.25;
+  f5 ();
+  for (i = 0; i < 8; i++)
+    if (a[i] != ((i & 1) ? -4 * i : 4 * i))
+      abort ();
+    else
+      a[i] = 131.25;
+  f6 (16);
+  for (i = 0; i < 64; i++)
+    if (a[i] != ((i & 1) ? -4 * i : 4 * i))
+      abort ();
+    else
+      a[i] = 131.25;
+  f7 (16);
+  for (i = 0; i < 64; i++)
+    if (a[i] != ((i & 1) ? -4 * i : 4 * i))
+      abort ();
+    else
+      a[i] = 131.25;
+  f8 ();
+  for (i = 0; i < 64; i++)
+    if (a[i] != ((i & 1) ? -4 * i : 4 * i))
+      abort ();
+  return 0;
+}
+
+int
+main ()
+{
+  check_vect ();
+  return main1 ();
+}
+
+/* { dg-final { cleanup-tree-dump "vect" } } */


	Jakub

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] SLP vectorize calls (take 2)
  2011-11-07 18:44       ` [PATCH] SLP vectorize calls (take 2) Jakub Jelinek
@ 2011-11-08  8:00         ` Ira Rosen
  2011-11-08  8:03           ` Jakub Jelinek
  0 siblings, 1 reply; 12+ messages in thread
From: Ira Rosen @ 2011-11-08  8:00 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Richard Guenther, gcc-patches

On 7 November 2011 20:35, Jakub Jelinek <jakub@redhat.com> wrote:
> Hi!

Hi,

>
> Here is an updated patch, which handles both modifier == NONE
> and modifier == NARROW for SLP, after all it wasn't that hard.
> Additionally it checks that the fndecls and various call flags
> match, and adds some testcases.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux,
> ok for trunk?


> @@ -1723,6 +1764,55 @@ vectorizable_call (gimple stmt, gimple_s
>          else
>            VEC_truncate (tree, vargs, 0);
>
> +         if (slp_node)
> +           {
> +             VEC (slp_void_p, heap) *vec_defs
> +               = VEC_alloc (slp_void_p, heap, nargs);
> +             VEC (tree, heap) *vec_oprnds0;
> +
> +             for (i = 0; i < nargs; i++)
> +               VEC_quick_push (tree, vargs, gimple_call_arg (stmt, i));
> +             vect_get_slp_defs (vargs, slp_node, &vec_defs, -1);
> +             vec_oprnds0
> +               = (VEC (tree, heap) *) VEC_index (slp_void_p, vec_defs, 0);
> +
> +             /* Arguments are ready.  Create the new vector stmt.  */
> +             FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, vec_oprnd0)

Was this line left by mistake?

> +             for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vec_oprnd0);
> +                  i += 2)
> +               {
> +                 size_t k;
> +                 VEC_truncate (tree, vargs, 0);
> +                 for (k = 0; k < nargs; k++)
> +                   {
> +                     VEC (tree, heap) *vec_oprndsk
> +                       = (VEC (tree, heap) *)
> +                         VEC_index (slp_void_p, vec_defs, k);
> +                     VEC_quick_push (tree, vargs,
> +                                     VEC_index (tree, vec_oprndsk, i));
> +                     VEC_quick_push (tree, vargs,
> +                                     VEC_index (tree, vec_oprndsk, i + 1));
> +                   }
> +                 new_stmt = gimple_build_call_vec (fndecl, vargs);
> +                 new_temp = make_ssa_name (vec_dest, new_stmt);
> +                 gimple_call_set_lhs (new_stmt, new_temp);
> +                 vect_finish_stmt_generation (stmt, new_stmt, gsi);
> +                 mark_symbols_for_renaming (new_stmt);
> +                 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node),
> +                                 new_stmt);
> +               }
> +
> +             for (i = 0; i < nargs; i++)
> +               {
> +                 VEC (tree, heap) *vec_oprndsi
> +                   = (VEC (tree, heap) *)
> +                     VEC_index (slp_void_p, vec_defs, i);
> +                 VEC_free (tree, heap, vec_oprndsi);
> +               }
> +             VEC_free (slp_void_p, heap, vec_defs);
> +             continue;
> +           }
> +
>          for (i = 0; i < nargs; i++)
>            {
>              op = gimple_call_arg (stmt, i);


Could you please rearrange the tests (separate basic blocks and loops)
and make them actually test that bbs/loops were vectorized?
Also there is no need in dg-do run.

OK otherwise.

Thanks,
Ira

> --- gcc/testsuite/gcc.dg/vect/fast-math-vect-call-1.c.jj        2011-11-07 15:05:36.000000000 +0100
> +++ gcc/testsuite/gcc.dg/vect/fast-math-vect-call-1.c   2011-11-07 15:07:10.000000000 +0100
> @@ -0,0 +1,100 @@
> +/* { dg-do run } */
> +
> +#include "tree-vect.h"
> +
> +extern float copysignf (float, float);
> +extern float sqrtf (float);
> +extern float fabsf (float);
> +extern void abort (void);
> +float a[64], b[64], c[64], d[64];
> +
> +__attribute__((noinline, noclone)) void
> +f1 (void)
> +{
> +  a[0] = copysignf (b[0], c[0]) + 1.0f + sqrtf (d[0]);
> +  a[1] = copysignf (b[1], c[1]) + 2.0f + sqrtf (d[1]);
> +  a[2] = copysignf (b[2], c[2]) + 3.0f + sqrtf (d[2]);
> +  a[3] = copysignf (b[3], c[3]) + 4.0f + sqrtf (d[3]);
> +  a[4] = copysignf (b[4], c[4]) + 5.0f + sqrtf (d[4]);
> +  a[5] = copysignf (b[5], c[5]) + 6.0f + sqrtf (d[5]);
> +  a[6] = copysignf (b[6], c[6]) + 7.0f + sqrtf (d[6]);
> +  a[7] = copysignf (b[7], c[7]) + 8.0f + sqrtf (d[7]);
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f2 (int n)
> +{
> +  int i;
> +  for (i = 0; i < n; i++)
> +    {
> +      a[4 * i + 0] = copysignf (b[4 * i + 0], c[4 * i + 0]) + 1.0f + sqrtf (d[4 * i + 0]);
> +      a[4 * i + 1] = copysignf (b[4 * i + 1], c[4 * i + 1]) + 2.0f + sqrtf (d[4 * i + 1]);
> +      a[4 * i + 2] = copysignf (b[4 * i + 2], c[4 * i + 2]) + 3.0f + sqrtf (d[4 * i + 2]);
> +      a[4 * i + 3] = copysignf (b[4 * i + 3], c[4 * i + 3]) + 4.0f + sqrtf (d[4 * i + 3]);
> +    }
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f3 (int n)
> +{
> +  int i;
> +  for (i = 0; i < 2 * n; i++)
> +    {
> +      a[2 * i + 0] = copysignf (b[2 * i + 0], c[2 * i + 0]) + 1.0f + sqrtf (d[2 * i + 0]);
> +      a[2 * i + 1] = copysignf (b[2 * i + 1], c[2 * i + 1]) + 2.0f + sqrtf (d[2 * i + 1]);
> +    }
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f4 (void)
> +{
> +  int i;
> +  for (i = 0; i < 64; i++)
> +    a[i] = copysignf (b[i], c[i]) + 1.0f + sqrtf (d[i]);
> +}
> +
> +__attribute__((noinline, noclone)) int
> +main1 ()
> +{
> +  int i;
> +
> +  for (i = 0; i < 64; i++)
> +    {
> +      asm ("");
> +      b[i] = (i & 1) ? -4 * i : 4 * i;
> +      c[i] = (i & 2) ? -8 * i : 8 * i;
> +      d[i] = i * i;
> +    }
> +  f1 ();
> +  for (i = 0; i < 8; i++)
> +    if (fabsf (((i & 2) ? -4 * i : 4 * i) + 1 + i + i - a[i]) >= 0.0001f)
> +      abort ();
> +    else
> +      a[i] = 131.25;
> +  f2 (16);
> +  for (i = 0; i < 64; i++)
> +    if (fabsf (((i & 2) ? -4 * i : 4 * i) + 1 + (i & 3) + i - a[i]) >= 0.0001f)
> +      abort ();
> +    else
> +      a[i] = 131.25;
> +  f3 (16);
> +  for (i = 0; i < 64; i++)
> +    if (fabsf (((i & 2) ? -4 * i : 4 * i) + 1 + (i & 1) + i - a[i]) >= 0.0001f)
> +      abort ();
> +    else
> +      a[i] = 131.25;
> +  f4 ();
> +  for (i = 0; i < 64; i++)
> +    if (fabsf (((i & 2) ? -4 * i : 4 * i) + 1 + i - a[i]) >= 0.0001f)
> +      abort ();
> +  return 0;
> +}
> +
> +int
> +main ()
> +{
> +  check_vect ();
> +  return main1 ();
> +}
> +
> +/* { dg-final { cleanup-tree-dump "vect" } } */
> --- gcc/testsuite/gcc.dg/vect/fast-math-vect-call-2.c.jj        2011-11-07 15:09:00.000000000 +0100
> +++ gcc/testsuite/gcc.dg/vect/fast-math-vect-call-2.c   2011-11-07 15:11:58.000000000 +0100
> @@ -0,0 +1,166 @@
> +/* { dg-do run } */
> +
> +#include "tree-vect.h"
> +
> +extern long int lrint (double);
> +extern void abort (void);
> +long int a[64];
> +double b[64];
> +
> +__attribute__((noinline, noclone)) void
> +f1 (void)
> +{
> +  a[0] = lrint (b[0]) + 1;
> +  a[1] = lrint (b[1]) + 2;
> +  a[2] = lrint (b[2]) + 3;
> +  a[3] = lrint (b[3]) + 4;
> +  a[4] = lrint (b[4]) + 5;
> +  a[5] = lrint (b[5]) + 6;
> +  a[6] = lrint (b[6]) + 7;
> +  a[7] = lrint (b[7]) + 8;
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f2 (int n)
> +{
> +  int i;
> +  for (i = 0; i < n; i++)
> +    {
> +      a[4 * i + 0] = lrint (b[4 * i + 0]) + 1;
> +      a[4 * i + 1] = lrint (b[4 * i + 1]) + 2;
> +      a[4 * i + 2] = lrint (b[4 * i + 2]) + 3;
> +      a[4 * i + 3] = lrint (b[4 * i + 3]) + 4;
> +    }
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f3 (int n)
> +{
> +  int i;
> +  for (i = 0; i < 2 * n; i++)
> +    {
> +      a[2 * i + 0] = lrint (b[2 * i + 0]) + 1;
> +      a[2 * i + 1] = lrint (b[2 * i + 1]) + 2;
> +    }
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f4 (void)
> +{
> +  int i;
> +  for (i = 0; i < 64; i++)
> +    a[i] = lrint (b[i]) + 1;
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f5 (void)
> +{
> +  a[0] = lrint (b[0]);
> +  a[1] = lrint (b[1]);
> +  a[2] = lrint (b[2]);
> +  a[3] = lrint (b[3]);
> +  a[4] = lrint (b[4]);
> +  a[5] = lrint (b[5]);
> +  a[6] = lrint (b[6]);
> +  a[7] = lrint (b[7]);
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f6 (int n)
> +{
> +  int i;
> +  for (i = 0; i < n; i++)
> +    {
> +      a[4 * i + 0] = lrint (b[4 * i + 0]);
> +      a[4 * i + 1] = lrint (b[4 * i + 1]);
> +      a[4 * i + 2] = lrint (b[4 * i + 2]);
> +      a[4 * i + 3] = lrint (b[4 * i + 3]);
> +    }
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f7 (int n)
> +{
> +  int i;
> +  for (i = 0; i < 2 * n; i++)
> +    {
> +      a[2 * i + 0] = lrint (b[2 * i + 0]);
> +      a[2 * i + 1] = lrint (b[2 * i + 1]);
> +    }
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f8 (void)
> +{
> +  int i;
> +  for (i = 0; i < 64; i++)
> +    a[i] = lrint (b[i]);
> +}
> +
> +__attribute__((noinline, noclone)) int
> +main1 ()
> +{
> +  int i;
> +
> +  for (i = 0; i < 64; i++)
> +    {
> +      asm ("");
> +      b[i] = ((i & 1) ? -4 * i : 4 * i) + 0.25;
> +    }
> +  f1 ();
> +  for (i = 0; i < 8; i++)
> +    if (a[i] != ((i & 1) ? -4 * i : 4 * i) + 1 + i)
> +      abort ();
> +    else
> +      a[i] = 131.25;
> +  f2 (16);
> +  for (i = 0; i < 64; i++)
> +    if (a[i] != ((i & 1) ? -4 * i : 4 * i) + 1 + (i & 3))
> +      abort ();
> +    else
> +      a[i] = 131.25;
> +  f3 (16);
> +  for (i = 0; i < 64; i++)
> +    if (a[i] != ((i & 1) ? -4 * i : 4 * i) + 1 + (i & 1))
> +      abort ();
> +    else
> +      a[i] = 131.25;
> +  f4 ();
> +  for (i = 0; i < 64; i++)
> +    if (a[i] != ((i & 1) ? -4 * i : 4 * i) + 1)
> +      abort ();
> +    else
> +      a[i] = 131.25;
> +  f5 ();
> +  for (i = 0; i < 8; i++)
> +    if (a[i] != ((i & 1) ? -4 * i : 4 * i))
> +      abort ();
> +    else
> +      a[i] = 131.25;
> +  f6 (16);
> +  for (i = 0; i < 64; i++)
> +    if (a[i] != ((i & 1) ? -4 * i : 4 * i))
> +      abort ();
> +    else
> +      a[i] = 131.25;
> +  f7 (16);
> +  for (i = 0; i < 64; i++)
> +    if (a[i] != ((i & 1) ? -4 * i : 4 * i))
> +      abort ();
> +    else
> +      a[i] = 131.25;
> +  f8 ();
> +  for (i = 0; i < 64; i++)
> +    if (a[i] != ((i & 1) ? -4 * i : 4 * i))
> +      abort ();
> +  return 0;
> +}
> +
> +int
> +main ()
> +{
> +  check_vect ();
> +  return main1 ();
> +}
> +
> +/* { dg-final { cleanup-tree-dump "vect" } } */
>
>
>        Jakub
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] SLP vectorize calls (take 2)
  2011-11-08  8:00         ` Ira Rosen
@ 2011-11-08  8:03           ` Jakub Jelinek
  2011-11-08  8:22             ` Ira Rosen
  0 siblings, 1 reply; 12+ messages in thread
From: Jakub Jelinek @ 2011-11-08  8:03 UTC (permalink / raw)
  To: Ira Rosen; +Cc: Richard Guenther, gcc-patches

On Tue, Nov 08, 2011 at 09:05:16AM +0200, Ira Rosen wrote:
> > +             /* Arguments are ready.  Create the new vector stmt.  */
> > +             FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, vec_oprnd0)
> 
> Was this line left by mistake?

Oops, yes.  It didn't make a difference at runtime, so passed the test, will
fix.  Thanks for catching this up.

> > +             for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vec_oprnd0);
> > +                  i += 2)
> 
> Could you please rearrange the tests (separate basic blocks and loops)
> and make them actually test that bbs/loops were vectorized?

This is more difficult for me, my familiarity with gcc.dg/vect/
infrastructure is very weak.
First of all, whether copysignf, sqrtf and/or lrint are vectorized is
very much target specific, should I guard the dg-final lines with
{ target { i?86-*-* x86_64-*-* } }
resp.
{ target { { i?86-*-* x86_64-*-* } && !lp64 } }
(the latter for lrint - we don't vectorize it on x86_64), or add
vect_call_copysignf, vect_call_sqrtf, vect_call_lrint tests in *.exp?

For the split, some fns are hybrid, so shall I split f1+f2+f3 as slp
and f4 as loop, or is f3 (hybrid) something else?  What test names
should I use?  fast-math-slp-call-*.c/fast-math-vect-call-*.c or something
else?  From what I gather for bb slp the test should start with bb-slp-*
(is that f1/f2 or just f1?), but then there is currently no way to
add -ffast-math.

> Also there is no need in dg-do run.

You mean because it is the default?  Certainly it is useful to test
that gcc doesn't miscompile the tests.

	Jakub

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] SLP vectorize calls (take 2)
  2011-11-08  8:03           ` Jakub Jelinek
@ 2011-11-08  8:22             ` Ira Rosen
  2011-11-08 10:03               ` [PATCH] SLP vectorize calls (take 3) Jakub Jelinek
  0 siblings, 1 reply; 12+ messages in thread
From: Ira Rosen @ 2011-11-08  8:22 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Richard Guenther, gcc-patches

On 8 November 2011 09:22, Jakub Jelinek <jakub@redhat.com> wrote:

> First of all, whether copysignf, sqrtf and/or lrint are vectorized is
> very much target specific, should I guard the dg-final lines with
> { target { i?86-*-* x86_64-*-* } }
> resp.
> { target { { i?86-*-* x86_64-*-* } && !lp64 } }
> (the latter for lrint - we don't vectorize it on x86_64), or add
> vect_call_copysignf, vect_call_sqrtf, vect_call_lrint tests in *.exp?

The second option would be nicer.

>
> For the split, some fns are hybrid, so shall I split f1+f2+f3 as slp
> and f4 as loop, or is f3 (hybrid) something else?
>  What test names
> should I use?  fast-math-slp-call-*.c/fast-math-vect-call-*.c or something
> else?  From what I gather for bb slp the test should start with bb-slp-*
> (is that f1/f2 or just f1?), but then there is currently no way to
> add -ffast-math.

In fast-math-vect-call-1.c, f1 is basic block SLP, f2+f3 are loop SLP,
and f4 is regular loop vectorization.
So, f1 should be in fast-math-bb-slp-call-1.c,  with

/* { dg-final { scan-tree-dump-times "basic block vectorized using
SLP" 1 "slp" } } */
/* { dg-final { cleanup-tree-dump "slp" } } */

and

# -ffast-math
set VECT_SLP_CFLAGS $SAVED_VECT_SLP_CFLAGS
lappend VECT_SLP_CFLAGS "-ffast-math"
dg-runtest [lsort [glob -nocomplain
$srcdir/$subdir/fast-math-bb-slp-*.\[cS\]]]  \
        "" $VECT_SLP_CFLAGS

in  vect.exp.

The rest can simply stay in fast-math-vect-call-1.c, but to check SLP please use

/* { dg-final { scan-tree-dump-times "vectorized 1 loops" X "vect"  } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" Y
"vect"  } } */
/* { dg-final { cleanup-tree-dump "vect" } } */

>
>> Also there is no need in dg-do run.
>
> You mean because it is the default?

Yes.

Thanks,
Ira

> Certainly it is useful to test
> that gcc doesn't miscompile the tests.
>
>        Jakub
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] SLP vectorize calls (take 3)
  2011-11-08  8:22             ` Ira Rosen
@ 2011-11-08 10:03               ` Jakub Jelinek
  2011-11-08 10:15                 ` Ira Rosen
  0 siblings, 1 reply; 12+ messages in thread
From: Jakub Jelinek @ 2011-11-08 10:03 UTC (permalink / raw)
  To: Ira Rosen; +Cc: Richard Guenther, gcc-patches

On Tue, Nov 08, 2011 at 10:03:23AM +0200, Ira Rosen wrote:
> The second option would be nicer.
...

Thanks.  Here is an updated patch, will bootstrap/regtest it now.
Ok for trunk if it passes?

2011-11-08  Jakub Jelinek  <jakub@redhat.com>

	* tree-vect-stmts.c (vectorizable_call): Add SLP_NODE argument.
	Handle vectorization of SLP calls.
	(vect_analyze_stmt): Adjust caller, add call to it for SLP too.
	(vect_transform_stmt): Adjust vectorizable_call caller, remove
	assertion.
	* tree-vect-slp.c (vect_get_and_check_slp_defs): For calls start
	with op_idx 3.
	(vect_build_slp_tree): Allow CALL_EXPR.

	* lib/target-supports.exp (check_effective_target_vect_call_sqrtf,
	check_effective_target_vect_call_copysignf,
	check_effective_target_vect_call_lrint): New procedures.
	* gcc.dg/vect/vect.exp: Run fast-math-bb-slp* tests using
	$VECT_SLP_CFLAGS with -ffast-math.
	* gcc.dg/vect/fast-math-vect-call-1.c: New test.
	* gcc.dg/vect/fast-math-vect-call-2.c: New test.
	* gcc.dg/vect/fast-math-bb-slp-call-1.c: New test.
	* gcc.dg/vect/fast-math-bb-slp-call-2.c: New test.

--- gcc/tree-vect-slp.c.jj	2011-11-07 20:32:03.000000000 +0100
+++ gcc/tree-vect-slp.c	2011-11-08 09:28:12.000000000 +0100
@@ -202,7 +202,10 @@ vect_get_and_check_slp_defs (loop_vec_in
     loop = LOOP_VINFO_LOOP (loop_vinfo);
 
   if (is_gimple_call (stmt))
-    number_of_oprnds = gimple_call_num_args (stmt);
+    {
+      number_of_oprnds = gimple_call_num_args (stmt);
+      op_idx = 3;
+    }
   else if (is_gimple_assign (stmt))
     {
       number_of_oprnds = gimple_num_ops (stmt) - 1;
@@ -558,7 +561,25 @@ vect_build_slp_tree (loop_vec_info loop_
       ncopies = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
 
       if (is_gimple_call (stmt))
-	rhs_code = CALL_EXPR;
+	{
+	  rhs_code = CALL_EXPR;
+	  if (gimple_call_internal_p (stmt)
+	      || gimple_call_tail_p (stmt)
+	      || gimple_call_noreturn_p (stmt)
+	      || !gimple_call_nothrow_p (stmt)
+	      || gimple_call_chain (stmt))
+	    {
+	      if (vect_print_dump_info (REPORT_SLP))
+		{
+		  fprintf (vect_dump,
+			   "Build SLP failed: unsupported call type ");
+		  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+		}
+
+	      vect_free_oprnd_info (&oprnds_info, true);
+	      return false;
+	    }
+	}
       else
 	rhs_code = gimple_assign_rhs_code (stmt);
 
@@ -653,6 +674,27 @@ vect_build_slp_tree (loop_vec_info loop_
 	      vect_free_oprnd_info (&oprnds_info, true);
 	      return false;
 	    }
+
+	  if (rhs_code == CALL_EXPR)
+	    {
+	      gimple first_stmt = VEC_index (gimple, stmts, 0);
+	      if (gimple_call_num_args (stmt) != nops
+		  || !operand_equal_p (gimple_call_fn (first_stmt),
+				       gimple_call_fn (stmt), 0)
+		  || gimple_call_fntype (first_stmt)
+		     != gimple_call_fntype (stmt))
+		{
+		  if (vect_print_dump_info (REPORT_SLP))
+		    {
+		      fprintf (vect_dump,
+			       "Build SLP failed: different calls in ");
+		      print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+		    }
+
+		  vect_free_oprnd_info (&oprnds_info, true);
+		  return false;
+		}
+	    }
 	}
 
       /* Strided store or load.  */
@@ -786,7 +828,8 @@ vect_build_slp_tree (loop_vec_info loop_
 	  /* Not memory operation.  */
 	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
 	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
-              && rhs_code != COND_EXPR)
+	      && rhs_code != COND_EXPR
+	      && rhs_code != CALL_EXPR)
 	    {
 	      if (vect_print_dump_info (REPORT_SLP))
 		{
--- gcc/tree-vect-stmts.c.jj	2011-11-07 20:32:09.000000000 +0100
+++ gcc/tree-vect-stmts.c	2011-11-08 09:28:55.000000000 +0100
@@ -1521,7 +1521,8 @@ vectorizable_function (gimple call, tree
    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
 
 static bool
-vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
+vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
+		   slp_tree slp_node)
 {
   tree vec_dest;
   tree scalar_dest;
@@ -1532,6 +1533,7 @@ vectorizable_call (gimple stmt, gimple_s
   int nunits_in;
   int nunits_out;
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
   tree fndecl, new_temp, def, rhs_type;
   gimple def_stmt;
   enum vect_def_type dt[3]
@@ -1543,19 +1545,12 @@ vectorizable_call (gimple stmt, gimple_s
   size_t i, nargs;
   tree lhs;
 
-  /* FORNOW: unsupported in basic block SLP.  */
-  gcc_assert (loop_vinfo);
-
-  if (!STMT_VINFO_RELEVANT_P (stmt_info))
+  if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
     return false;
 
   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
     return false;
 
-  /* FORNOW: SLP not supported.  */
-  if (STMT_SLP_TYPE (stmt_info))
-    return false;
-
   /* Is STMT a vectorizable call?   */
   if (!is_gimple_call (stmt))
     return false;
@@ -1596,7 +1591,7 @@ vectorizable_call (gimple stmt, gimple_s
       if (!rhs_type)
 	rhs_type = TREE_TYPE (op);
 
-      if (!vect_is_simple_use_1 (op, loop_vinfo, NULL,
+      if (!vect_is_simple_use_1 (op, loop_vinfo, bb_vinfo,
 				 &def_stmt, &def, &dt[i], &opvectype))
 	{
 	  if (vect_print_dump_info (REPORT_DETAILS))
@@ -1658,7 +1653,9 @@ vectorizable_call (gimple stmt, gimple_s
 
   gcc_assert (!gimple_vuse (stmt));
 
-  if (modifier == NARROW)
+  if (slp_node || PURE_SLP_STMT (stmt_info))
+    ncopies = 1;
+  else if (modifier == NARROW)
     ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
   else
     ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
@@ -1697,6 +1694,50 @@ vectorizable_call (gimple stmt, gimple_s
 	  else
 	    VEC_truncate (tree, vargs, 0);
 
+	  if (slp_node)
+	    {
+	      VEC (slp_void_p, heap) *vec_defs
+		= VEC_alloc (slp_void_p, heap, nargs);
+	      VEC (tree, heap) *vec_oprnds0;
+
+	      for (i = 0; i < nargs; i++)
+		VEC_quick_push (tree, vargs, gimple_call_arg (stmt, i));
+	      vect_get_slp_defs (vargs, slp_node, &vec_defs, -1);
+	      vec_oprnds0
+		= (VEC (tree, heap) *) VEC_index (slp_void_p, vec_defs, 0);
+
+	      /* Arguments are ready.  Create the new vector stmt.  */
+	      FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, vec_oprnd0)
+		{
+		  size_t k;
+		  for (k = 0; k < nargs; k++)
+		    {
+		      VEC (tree, heap) *vec_oprndsk
+			= (VEC (tree, heap) *)
+			  VEC_index (slp_void_p, vec_defs, k);
+		      VEC_replace (tree, vargs, k,
+				   VEC_index (tree, vec_oprndsk, i));
+		    }
+		  new_stmt = gimple_build_call_vec (fndecl, vargs);
+		  new_temp = make_ssa_name (vec_dest, new_stmt);
+		  gimple_call_set_lhs (new_stmt, new_temp);
+		  vect_finish_stmt_generation (stmt, new_stmt, gsi);
+		  mark_symbols_for_renaming (new_stmt);
+		  VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node),
+				  new_stmt);
+		}
+
+	      for (i = 0; i < nargs; i++)
+		{
+		  VEC (tree, heap) *vec_oprndsi
+		    = (VEC (tree, heap) *)
+		      VEC_index (slp_void_p, vec_defs, i);
+		  VEC_free (tree, heap, vec_oprndsi);
+		}
+	      VEC_free (slp_void_p, heap, vec_defs);
+	      continue;
+	    }
+
 	  for (i = 0; i < nargs; i++)
 	    {
 	      op = gimple_call_arg (stmt, i);
@@ -1739,6 +1780,54 @@ vectorizable_call (gimple stmt, gimple_s
 	  else
 	    VEC_truncate (tree, vargs, 0);
 
+	  if (slp_node)
+	    {
+	      VEC (slp_void_p, heap) *vec_defs
+		= VEC_alloc (slp_void_p, heap, nargs);
+	      VEC (tree, heap) *vec_oprnds0;
+
+	      for (i = 0; i < nargs; i++)
+		VEC_quick_push (tree, vargs, gimple_call_arg (stmt, i));
+	      vect_get_slp_defs (vargs, slp_node, &vec_defs, -1);
+	      vec_oprnds0
+		= (VEC (tree, heap) *) VEC_index (slp_void_p, vec_defs, 0);
+
+	      /* Arguments are ready.  Create the new vector stmt.  */
+	      for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vec_oprnd0);
+		   i += 2)
+		{
+		  size_t k;
+		  VEC_truncate (tree, vargs, 0);
+		  for (k = 0; k < nargs; k++)
+		    {
+		      VEC (tree, heap) *vec_oprndsk
+			= (VEC (tree, heap) *)
+			  VEC_index (slp_void_p, vec_defs, k);
+		      VEC_quick_push (tree, vargs,
+				      VEC_index (tree, vec_oprndsk, i));
+		      VEC_quick_push (tree, vargs,
+				      VEC_index (tree, vec_oprndsk, i + 1));
+		    }
+		  new_stmt = gimple_build_call_vec (fndecl, vargs);
+		  new_temp = make_ssa_name (vec_dest, new_stmt);
+		  gimple_call_set_lhs (new_stmt, new_temp);
+		  vect_finish_stmt_generation (stmt, new_stmt, gsi);
+		  mark_symbols_for_renaming (new_stmt);
+		  VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node),
+				  new_stmt);
+		}
+
+	      for (i = 0; i < nargs; i++)
+		{
+		  VEC (tree, heap) *vec_oprndsi
+		    = (VEC (tree, heap) *)
+		      VEC_index (slp_void_p, vec_defs, i);
+		  VEC_free (tree, heap, vec_oprndsi);
+		}
+	      VEC_free (slp_void_p, heap, vec_defs);
+	      continue;
+	    }
+
 	  for (i = 0; i < nargs; i++)
 	    {
 	      op = gimple_call_arg (stmt, i);
@@ -1804,7 +1893,8 @@ vectorizable_call (gimple stmt, gimple_s
     lhs = gimple_call_lhs (stmt);
   new_stmt = gimple_build_assign (lhs, build_zero_cst (type));
   set_vinfo_for_stmt (new_stmt, stmt_info);
-  set_vinfo_for_stmt (stmt, NULL);
+  if (!slp_node)
+    set_vinfo_for_stmt (stmt, NULL);
   STMT_VINFO_STMT (stmt_info) = new_stmt;
   gsi_replace (gsi, new_stmt, false);
   SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
@@ -5265,7 +5355,7 @@ vect_analyze_stmt (gimple stmt, bool *ne
             || vectorizable_operation (stmt, NULL, NULL, NULL)
             || vectorizable_assignment (stmt, NULL, NULL, NULL)
             || vectorizable_load (stmt, NULL, NULL, NULL, NULL)
-            || vectorizable_call (stmt, NULL, NULL)
+	    || vectorizable_call (stmt, NULL, NULL, NULL)
             || vectorizable_store (stmt, NULL, NULL, NULL)
             || vectorizable_reduction (stmt, NULL, NULL, NULL)
             || vectorizable_condition (stmt, NULL, NULL, NULL, 0, NULL));
@@ -5277,6 +5367,7 @@ vect_analyze_stmt (gimple stmt, bool *ne
                 || vectorizable_operation (stmt, NULL, NULL, node)
                 || vectorizable_assignment (stmt, NULL, NULL, node)
                 || vectorizable_load (stmt, NULL, NULL, node, NULL)
+		|| vectorizable_call (stmt, NULL, NULL, node)
                 || vectorizable_store (stmt, NULL, NULL, node)
                 || vectorizable_condition (stmt, NULL, NULL, NULL, 0, node));
       }
@@ -5391,8 +5482,7 @@ vect_transform_stmt (gimple stmt, gimple
       break;
 
     case call_vec_info_type:
-      gcc_assert (!slp_node);
-      done = vectorizable_call (stmt, gsi, &vec_stmt);
+      done = vectorizable_call (stmt, gsi, &vec_stmt, slp_node);
       stmt = gsi_stmt (*gsi);
       break;
 
--- gcc/testsuite/lib/target-supports.exp.jj	2011-11-08 09:26:58.000000000 +0100
+++ gcc/testsuite/lib/target-supports.exp	2011-11-08 10:15:38.000000000 +0100
@@ -3520,6 +3520,58 @@ proc check_effective_target_vect64 { } {
     return $et_vect64_saved
 }
 
+# Return 1 if the target supports vector copysignf calls.
+
+proc check_effective_target_vect_call_copysignf { } {
+    global et_vect_call_copysignf_saved
+
+    if [info exists et_vect_call_copysignf_saved] {
+	verbose "check_effective_target_vect_call_copysignf: using cached result" 2
+    } else {
+	set et_vect_call_copysignf_saved 0
+	if { [istarget i?86-*-*]
+	     || [istarget x86_64-*-*]
+	     || [istarget powerpc*-*-*] } {
+	   set et_vect_call_copysignf_saved 1
+	}
+    }
+
+    verbose "check_effective_target_vect_call_copysignf: returning $et_vect_call_copysignf_saved" 2
+    return $et_vect_call_copysignf_saved
+}
+
+# Return 1 if the target supports vector sqrtf calls.
+
+proc check_effective_target_vect_call_sqrtf { } {
+    global et_vect_call_sqrtf_saved
+
+    if [info exists et_vect_call_sqrtf_saved] {
+	verbose "check_effective_target_vect_call_sqrtf: using cached result" 2
+    } else {
+	set et_vect_call_sqrtf_saved 0
+	if { [istarget i?86-*-*]
+	     || [istarget x86_64-*-*]
+	     || ([istarget powerpc*-*-*] && [check_vsx_hw_available]) } {
+	    set et_vect_call_sqrtf_saved 1
+	}
+    }
+
+    verbose "check_effective_target_vect_call_sqrtf: returning $et_vect_call_sqrtf_saved" 2
+    return $et_vect_call_sqrtf_saved
+}
+
+# Return 1 if the target supports vector lrint calls.
+
+proc check_effective_target_vect_call_lrint { } {
+    set et_vect_call_lrint 0
+    if { ([istarget i?86-*-*] || [istarget x86_64-*-*]) && [check_effective_target_ilp32] } {
+	set et_vect_call_lrint 1
+    }
+
+    verbose "check_effective_target_vect_call_lrint: returning $et_vect_call_lrint" 2
+    return $et_vect_call_lrint
+}
+
 # Return 1 if the target supports section-anchors
 
 proc check_effective_target_section_anchors { } {
--- gcc/testsuite/gcc.dg/vect/vect.exp.jj	2011-10-24 12:21:08.000000000 +0200
+++ gcc/testsuite/gcc.dg/vect/vect.exp	2011-11-08 10:09:27.000000000 +0100
@@ -104,9 +104,15 @@ dg-runtest [lsort [glob -nocomplain $src
 # -ffast-math tests
 set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
 lappend DEFAULT_VECTCFLAGS "-ffast-math"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/fast-math-*.\[cS\]]]  \
+dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/fast-math-\[ipsv\]*.\[cS\]]]  \
 	"" $DEFAULT_VECTCFLAGS
 
+# -ffast-math SLP tests
+set VECT_SLP_CFLAGS $SAVED_VECT_SLP_CFLAGS
+lappend VECT_SLP_CFLAGS "-ffast-math"
+dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/fast-math-bb-slp-*.\[cS\]]]  \
+        "" $VECT_SLP_CFLAGS
+
 # -fno-fast-math tests
 set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
 lappend DEFAULT_VECTCFLAGS "-fno-fast-math"
--- gcc/testsuite/gcc.dg/vect/fast-math-vect-call-1.c.jj	2011-11-08 09:28:12.000000000 +0100
+++ gcc/testsuite/gcc.dg/vect/fast-math-vect-call-1.c	2011-11-08 09:57:19.000000000 +0100
@@ -0,0 +1,81 @@
+#include "tree-vect.h"
+
+extern float copysignf (float, float);
+extern float sqrtf (float);
+extern float fabsf (float);
+extern void abort (void);
+float a[64], b[64], c[64], d[64];
+
+__attribute__((noinline, noclone)) void
+f1 (int n)
+{
+  int i;
+  for (i = 0; i < n; i++)
+    {
+      a[4 * i + 0] = copysignf (b[4 * i + 0], c[4 * i + 0]) + 1.0f + sqrtf (d[4 * i + 0]);
+      a[4 * i + 1] = copysignf (b[4 * i + 1], c[4 * i + 1]) + 2.0f + sqrtf (d[4 * i + 1]);
+      a[4 * i + 2] = copysignf (b[4 * i + 2], c[4 * i + 2]) + 3.0f + sqrtf (d[4 * i + 2]);
+      a[4 * i + 3] = copysignf (b[4 * i + 3], c[4 * i + 3]) + 4.0f + sqrtf (d[4 * i + 3]);
+    }
+}
+
+__attribute__((noinline, noclone)) void
+f2 (int n)
+{
+  int i;
+  for (i = 0; i < 2 * n; i++)
+    {
+      a[2 * i + 0] = copysignf (b[2 * i + 0], c[2 * i + 0]) + 1.0f + sqrtf (d[2 * i + 0]);
+      a[2 * i + 1] = copysignf (b[2 * i + 1], c[2 * i + 1]) + 2.0f + sqrtf (d[2 * i + 1]);
+    }
+}
+
+__attribute__((noinline, noclone)) void
+f3 (void)
+{
+  int i;
+  for (i = 0; i < 64; i++)
+    a[i] = copysignf (b[i], c[i]) + 1.0f + sqrtf (d[i]);
+}
+
+__attribute__((noinline, noclone)) int
+main1 ()
+{
+  int i;
+
+  for (i = 0; i < 64; i++)
+    {
+      asm ("");
+      b[i] = (i & 1) ? -4 * i : 4 * i;
+      c[i] = (i & 2) ? -8 * i : 8 * i;
+      d[i] = i * i;
+    }
+  f1 (16);
+  for (i = 0; i < 64; i++)
+    if (fabsf (((i & 2) ? -4 * i : 4 * i) + 1 + (i & 3) + i - a[i]) >= 0.0001f)
+      abort ();
+    else
+      a[i] = 131.25;
+  f2 (16);
+  for (i = 0; i < 64; i++)
+    if (fabsf (((i & 2) ? -4 * i : 4 * i) + 1 + (i & 1) + i - a[i]) >= 0.0001f)
+      abort ();
+    else
+      a[i] = 131.25;
+  f3 ();
+  for (i = 0; i < 64; i++)
+    if (fabsf (((i & 2) ? -4 * i : 4 * i) + 1 + i - a[i]) >= 0.0001f)
+      abort ();
+  return 0;
+}
+
+int
+main ()
+{
+  check_vect ();
+  return main1 ();
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 3 "vect" { target { vect_call_copysignf && vect_call_sqrtf } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { vect_call_copysignf && vect_call_sqrtf } } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
--- gcc/testsuite/gcc.dg/vect/fast-math-vect-call-2.c.jj	2011-11-08 09:28:12.000000000 +0100
+++ gcc/testsuite/gcc.dg/vect/fast-math-vect-call-2.c	2011-11-08 10:03:37.000000000 +0100
@@ -0,0 +1,128 @@
+#include "tree-vect.h"
+
+extern long int lrint (double);
+extern void abort (void);
+long int a[64];
+double b[64];
+
+__attribute__((noinline, noclone)) void
+f1 (int n)
+{
+  int i;
+  for (i = 0; i < n; i++)
+    {
+      a[4 * i + 0] = lrint (b[4 * i + 0]) + 1;
+      a[4 * i + 1] = lrint (b[4 * i + 1]) + 2;
+      a[4 * i + 2] = lrint (b[4 * i + 2]) + 3;
+      a[4 * i + 3] = lrint (b[4 * i + 3]) + 4;
+    }
+}
+
+__attribute__((noinline, noclone)) void
+f2 (int n)
+{
+  int i;
+  for (i = 0; i < 2 * n; i++)
+    {
+      a[2 * i + 0] = lrint (b[2 * i + 0]) + 1;
+      a[2 * i + 1] = lrint (b[2 * i + 1]) + 2;
+    }
+}
+
+__attribute__((noinline, noclone)) void
+f3 (void)
+{
+  int i;
+  for (i = 0; i < 64; i++)
+    a[i] = lrint (b[i]) + 1;
+}
+
+__attribute__((noinline, noclone)) void
+f4 (int n)
+{
+  int i;
+  for (i = 0; i < n; i++)
+    {
+      a[4 * i + 0] = lrint (b[4 * i + 0]);
+      a[4 * i + 1] = lrint (b[4 * i + 1]);
+      a[4 * i + 2] = lrint (b[4 * i + 2]);
+      a[4 * i + 3] = lrint (b[4 * i + 3]);
+    }
+}
+
+__attribute__((noinline, noclone)) void
+f5 (int n)
+{
+  int i;
+  for (i = 0; i < 2 * n; i++)
+    {
+      a[2 * i + 0] = lrint (b[2 * i + 0]);
+      a[2 * i + 1] = lrint (b[2 * i + 1]);
+    }
+}
+
+__attribute__((noinline, noclone)) void
+f6 (void)
+{
+  int i;
+  for (i = 0; i < 64; i++)
+    a[i] = lrint (b[i]);
+}
+
+__attribute__((noinline, noclone)) int
+main1 ()
+{
+  int i;
+
+  for (i = 0; i < 64; i++)
+    {
+      asm ("");
+      b[i] = ((i & 1) ? -4 * i : 4 * i) + 0.25;
+    }
+  f1 (16);
+  for (i = 0; i < 64; i++)
+    if (a[i] != ((i & 1) ? -4 * i : 4 * i) + 1 + (i & 3))
+      abort ();
+    else
+      a[i] = 131.25;
+  f2 (16);
+  for (i = 0; i < 64; i++)
+    if (a[i] != ((i & 1) ? -4 * i : 4 * i) + 1 + (i & 1))
+      abort ();
+    else
+      a[i] = 131.25;
+  f3 ();
+  for (i = 0; i < 64; i++)
+    if (a[i] != ((i & 1) ? -4 * i : 4 * i) + 1)
+      abort ();
+    else
+      a[i] = 131.25;
+  f4 (16);
+  for (i = 0; i < 64; i++)
+    if (a[i] != ((i & 1) ? -4 * i : 4 * i))
+      abort ();
+    else
+      a[i] = 131.25;
+  f5 (16);
+  for (i = 0; i < 64; i++)
+    if (a[i] != ((i & 1) ? -4 * i : 4 * i))
+      abort ();
+    else
+      a[i] = 131.25;
+  f6 ();
+  for (i = 0; i < 64; i++)
+    if (a[i] != ((i & 1) ? -4 * i : 4 * i))
+      abort ();
+  return 0;
+}
+
+int
+main ()
+{
+  check_vect ();
+  return main1 ();
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" { target vect_call_lrint } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { target vect_call_lrint } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
--- gcc/testsuite/gcc.dg/vect/fast-math-bb-slp-call-1.c.jj	2011-11-08 09:46:00.000000000 +0100
+++ gcc/testsuite/gcc.dg/vect/fast-math-bb-slp-call-1.c	2011-11-08 09:49:49.000000000 +0100
@@ -0,0 +1,49 @@
+#include "tree-vect.h"
+
+extern float copysignf (float, float);
+extern float sqrtf (float);
+extern float fabsf (float);
+extern void abort (void);
+float a[64], b[64], c[64], d[64];
+
+__attribute__((noinline, noclone)) void
+f1 (void)
+{
+  a[0] = copysignf (b[0], c[0]) + 1.0f + sqrtf (d[0]);
+  a[1] = copysignf (b[1], c[1]) + 2.0f + sqrtf (d[1]);
+  a[2] = copysignf (b[2], c[2]) + 3.0f + sqrtf (d[2]);
+  a[3] = copysignf (b[3], c[3]) + 4.0f + sqrtf (d[3]);
+  a[4] = copysignf (b[4], c[4]) + 5.0f + sqrtf (d[4]);
+  a[5] = copysignf (b[5], c[5]) + 6.0f + sqrtf (d[5]);
+  a[6] = copysignf (b[6], c[6]) + 7.0f + sqrtf (d[6]);
+  a[7] = copysignf (b[7], c[7]) + 8.0f + sqrtf (d[7]);
+}
+
+__attribute__((noinline, noclone)) int
+main1 ()
+{
+  int i;
+
+  for (i = 0; i < 8; i++)
+    {
+      asm ("");
+      b[i] = (i & 1) ? -4 * i : 4 * i;
+      c[i] = (i & 2) ? -8 * i : 8 * i;
+      d[i] = i * i;
+    }
+  f1 ();
+  for (i = 0; i < 8; i++)
+    if (fabsf (((i & 2) ? -4 * i : 4 * i) + 1 + i + i - a[i]) >= 0.0001f)
+      abort ();
+  return 0;
+}
+
+int
+main ()
+{
+  check_vect ();
+  return main1 ();
+}
+
+/* { dg-final { scan-tree-dump-times "basic block vectorized using SLP" 1 "slp" { target { vect_call_copysignf && vect_call_sqrtf } } } } */
+/* { dg-final { cleanup-tree-dump "slp" } } */
--- gcc/testsuite/gcc.dg/vect/fast-math-bb-slp-call-2.c.jj	2011-11-08 09:46:04.000000000 +0100
+++ gcc/testsuite/gcc.dg/vect/fast-math-bb-slp-call-2.c	2011-11-08 10:11:20.000000000 +0100
@@ -0,0 +1,65 @@
+#include "tree-vect.h"
+
+extern long int lrint (double);
+extern void abort (void);
+long int a[64];
+double b[64];
+
+__attribute__((noinline, noclone)) void
+f1 (void)
+{
+  a[0] = lrint (b[0]) + 1;
+  a[1] = lrint (b[1]) + 2;
+  a[2] = lrint (b[2]) + 3;
+  a[3] = lrint (b[3]) + 4;
+  a[4] = lrint (b[4]) + 5;
+  a[5] = lrint (b[5]) + 6;
+  a[6] = lrint (b[6]) + 7;
+  a[7] = lrint (b[7]) + 8;
+}
+
+__attribute__((noinline, noclone)) void
+f2 (void)
+{
+  a[0] = lrint (b[0]);
+  a[1] = lrint (b[1]);
+  a[2] = lrint (b[2]);
+  a[3] = lrint (b[3]);
+  a[4] = lrint (b[4]);
+  a[5] = lrint (b[5]);
+  a[6] = lrint (b[6]);
+  a[7] = lrint (b[7]);
+}
+
+__attribute__((noinline, noclone)) int
+main1 ()
+{
+  int i;
+
+  for (i = 0; i < 8; i++)
+    {
+      asm ("");
+      b[i] = ((i & 1) ? -4 * i : 4 * i) + 0.25;
+    }
+  f1 ();
+  for (i = 0; i < 8; i++)
+    if (a[i] != ((i & 1) ? -4 * i : 4 * i) + 1 + i)
+      abort ();
+    else
+      a[i] = 131.25;
+  f2 ();
+  for (i = 0; i < 8; i++)
+    if (a[i] != ((i & 1) ? -4 * i : 4 * i))
+      abort ();
+  return 0;
+}
+
+int
+main ()
+{
+  check_vect ();
+  return main1 ();
+}
+
+/* { dg-final { scan-tree-dump-times "basic block vectorized using SLP" 2 "slp" { target vect_call_lrint } } } */
+/* { dg-final { cleanup-tree-dump "slp" } } */


	Jakub

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] SLP vectorize calls (take 3)
  2011-11-08 10:03               ` [PATCH] SLP vectorize calls (take 3) Jakub Jelinek
@ 2011-11-08 10:15                 ` Ira Rosen
  0 siblings, 0 replies; 12+ messages in thread
From: Ira Rosen @ 2011-11-08 10:15 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Richard Guenther, gcc-patches

On 8 November 2011 11:32, Jakub Jelinek <jakub@redhat.com> wrote:
> On Tue, Nov 08, 2011 at 10:03:23AM +0200, Ira Rosen wrote:
>> The second option would be nicer.
> ...
>
> Thanks.  Here is an updated patch, will bootstrap/regtest it now.
> Ok for trunk if it passes?

Yes.

Thanks,
Ira

>
> 2011-11-08  Jakub Jelinek  <jakub@redhat.com>
>
>        * tree-vect-stmts.c (vectorizable_call): Add SLP_NODE argument.
>        Handle vectorization of SLP calls.
>        (vect_analyze_stmt): Adjust caller, add call to it for SLP too.
>        (vect_transform_stmt): Adjust vectorizable_call caller, remove
>        assertion.
>        * tree-vect-slp.c (vect_get_and_check_slp_defs): For calls start
>        with op_idx 3.
>        (vect_build_slp_tree): Allow CALL_EXPR.
>
>        * lib/target-supports.exp (check_effective_target_vect_call_sqrtf,
>        check_effective_target_vect_call_copysignf,
>        check_effective_target_vect_call_lrint): New procedures.
>        * gcc.dg/vect/vect.exp: Run fast-math-bb-slp* tests using
>        $VECT_SLP_CFLAGS with -ffast-math.
>        * gcc.dg/vect/fast-math-vect-call-1.c: New test.
>        * gcc.dg/vect/fast-math-vect-call-2.c: New test.
>        * gcc.dg/vect/fast-math-bb-slp-call-1.c: New test.
>        * gcc.dg/vect/fast-math-bb-slp-call-2.c: New test.
>
> --- gcc/tree-vect-slp.c.jj      2011-11-07 20:32:03.000000000 +0100
> +++ gcc/tree-vect-slp.c 2011-11-08 09:28:12.000000000 +0100
> @@ -202,7 +202,10 @@ vect_get_and_check_slp_defs (loop_vec_in
>     loop = LOOP_VINFO_LOOP (loop_vinfo);
>
>   if (is_gimple_call (stmt))
> -    number_of_oprnds = gimple_call_num_args (stmt);
> +    {
> +      number_of_oprnds = gimple_call_num_args (stmt);
> +      op_idx = 3;
> +    }
>   else if (is_gimple_assign (stmt))
>     {
>       number_of_oprnds = gimple_num_ops (stmt) - 1;
> @@ -558,7 +561,25 @@ vect_build_slp_tree (loop_vec_info loop_
>       ncopies = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
>
>       if (is_gimple_call (stmt))
> -       rhs_code = CALL_EXPR;
> +       {
> +         rhs_code = CALL_EXPR;
> +         if (gimple_call_internal_p (stmt)
> +             || gimple_call_tail_p (stmt)
> +             || gimple_call_noreturn_p (stmt)
> +             || !gimple_call_nothrow_p (stmt)
> +             || gimple_call_chain (stmt))
> +           {
> +             if (vect_print_dump_info (REPORT_SLP))
> +               {
> +                 fprintf (vect_dump,
> +                          "Build SLP failed: unsupported call type ");
> +                 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
> +               }
> +
> +             vect_free_oprnd_info (&oprnds_info, true);
> +             return false;
> +           }
> +       }
>       else
>        rhs_code = gimple_assign_rhs_code (stmt);
>
> @@ -653,6 +674,27 @@ vect_build_slp_tree (loop_vec_info loop_
>              vect_free_oprnd_info (&oprnds_info, true);
>              return false;
>            }
> +
> +         if (rhs_code == CALL_EXPR)
> +           {
> +             gimple first_stmt = VEC_index (gimple, stmts, 0);
> +             if (gimple_call_num_args (stmt) != nops
> +                 || !operand_equal_p (gimple_call_fn (first_stmt),
> +                                      gimple_call_fn (stmt), 0)
> +                 || gimple_call_fntype (first_stmt)
> +                    != gimple_call_fntype (stmt))
> +               {
> +                 if (vect_print_dump_info (REPORT_SLP))
> +                   {
> +                     fprintf (vect_dump,
> +                              "Build SLP failed: different calls in ");
> +                     print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
> +                   }
> +
> +                 vect_free_oprnd_info (&oprnds_info, true);
> +                 return false;
> +               }
> +           }
>        }
>
>       /* Strided store or load.  */
> @@ -786,7 +828,8 @@ vect_build_slp_tree (loop_vec_info loop_
>          /* Not memory operation.  */
>          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
>              && TREE_CODE_CLASS (rhs_code) != tcc_unary
> -              && rhs_code != COND_EXPR)
> +             && rhs_code != COND_EXPR
> +             && rhs_code != CALL_EXPR)
>            {
>              if (vect_print_dump_info (REPORT_SLP))
>                {
> --- gcc/tree-vect-stmts.c.jj    2011-11-07 20:32:09.000000000 +0100
> +++ gcc/tree-vect-stmts.c       2011-11-08 09:28:55.000000000 +0100
> @@ -1521,7 +1521,8 @@ vectorizable_function (gimple call, tree
>    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
>
>  static bool
> -vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
> +vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
> +                  slp_tree slp_node)
>  {
>   tree vec_dest;
>   tree scalar_dest;
> @@ -1532,6 +1533,7 @@ vectorizable_call (gimple stmt, gimple_s
>   int nunits_in;
>   int nunits_out;
>   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
> +  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
>   tree fndecl, new_temp, def, rhs_type;
>   gimple def_stmt;
>   enum vect_def_type dt[3]
> @@ -1543,19 +1545,12 @@ vectorizable_call (gimple stmt, gimple_s
>   size_t i, nargs;
>   tree lhs;
>
> -  /* FORNOW: unsupported in basic block SLP.  */
> -  gcc_assert (loop_vinfo);
> -
> -  if (!STMT_VINFO_RELEVANT_P (stmt_info))
> +  if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
>     return false;
>
>   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
>     return false;
>
> -  /* FORNOW: SLP not supported.  */
> -  if (STMT_SLP_TYPE (stmt_info))
> -    return false;
> -
>   /* Is STMT a vectorizable call?   */
>   if (!is_gimple_call (stmt))
>     return false;
> @@ -1596,7 +1591,7 @@ vectorizable_call (gimple stmt, gimple_s
>       if (!rhs_type)
>        rhs_type = TREE_TYPE (op);
>
> -      if (!vect_is_simple_use_1 (op, loop_vinfo, NULL,
> +      if (!vect_is_simple_use_1 (op, loop_vinfo, bb_vinfo,
>                                 &def_stmt, &def, &dt[i], &opvectype))
>        {
>          if (vect_print_dump_info (REPORT_DETAILS))
> @@ -1658,7 +1653,9 @@ vectorizable_call (gimple stmt, gimple_s
>
>   gcc_assert (!gimple_vuse (stmt));
>
> -  if (modifier == NARROW)
> +  if (slp_node || PURE_SLP_STMT (stmt_info))
> +    ncopies = 1;
> +  else if (modifier == NARROW)
>     ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
>   else
>     ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
> @@ -1697,6 +1694,50 @@ vectorizable_call (gimple stmt, gimple_s
>          else
>            VEC_truncate (tree, vargs, 0);
>
> +         if (slp_node)
> +           {
> +             VEC (slp_void_p, heap) *vec_defs
> +               = VEC_alloc (slp_void_p, heap, nargs);
> +             VEC (tree, heap) *vec_oprnds0;
> +
> +             for (i = 0; i < nargs; i++)
> +               VEC_quick_push (tree, vargs, gimple_call_arg (stmt, i));
> +             vect_get_slp_defs (vargs, slp_node, &vec_defs, -1);
> +             vec_oprnds0
> +               = (VEC (tree, heap) *) VEC_index (slp_void_p, vec_defs, 0);
> +
> +             /* Arguments are ready.  Create the new vector stmt.  */
> +             FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, vec_oprnd0)
> +               {
> +                 size_t k;
> +                 for (k = 0; k < nargs; k++)
> +                   {
> +                     VEC (tree, heap) *vec_oprndsk
> +                       = (VEC (tree, heap) *)
> +                         VEC_index (slp_void_p, vec_defs, k);
> +                     VEC_replace (tree, vargs, k,
> +                                  VEC_index (tree, vec_oprndsk, i));
> +                   }
> +                 new_stmt = gimple_build_call_vec (fndecl, vargs);
> +                 new_temp = make_ssa_name (vec_dest, new_stmt);
> +                 gimple_call_set_lhs (new_stmt, new_temp);
> +                 vect_finish_stmt_generation (stmt, new_stmt, gsi);
> +                 mark_symbols_for_renaming (new_stmt);
> +                 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node),
> +                                 new_stmt);
> +               }
> +
> +             for (i = 0; i < nargs; i++)
> +               {
> +                 VEC (tree, heap) *vec_oprndsi
> +                   = (VEC (tree, heap) *)
> +                     VEC_index (slp_void_p, vec_defs, i);
> +                 VEC_free (tree, heap, vec_oprndsi);
> +               }
> +             VEC_free (slp_void_p, heap, vec_defs);
> +             continue;
> +           }
> +
>          for (i = 0; i < nargs; i++)
>            {
>              op = gimple_call_arg (stmt, i);
> @@ -1739,6 +1780,54 @@ vectorizable_call (gimple stmt, gimple_s
>          else
>            VEC_truncate (tree, vargs, 0);
>
> +         if (slp_node)
> +           {
> +             VEC (slp_void_p, heap) *vec_defs
> +               = VEC_alloc (slp_void_p, heap, nargs);
> +             VEC (tree, heap) *vec_oprnds0;
> +
> +             for (i = 0; i < nargs; i++)
> +               VEC_quick_push (tree, vargs, gimple_call_arg (stmt, i));
> +             vect_get_slp_defs (vargs, slp_node, &vec_defs, -1);
> +             vec_oprnds0
> +               = (VEC (tree, heap) *) VEC_index (slp_void_p, vec_defs, 0);
> +
> +             /* Arguments are ready.  Create the new vector stmt.  */
> +             for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vec_oprnd0);
> +                  i += 2)
> +               {
> +                 size_t k;
> +                 VEC_truncate (tree, vargs, 0);
> +                 for (k = 0; k < nargs; k++)
> +                   {
> +                     VEC (tree, heap) *vec_oprndsk
> +                       = (VEC (tree, heap) *)
> +                         VEC_index (slp_void_p, vec_defs, k);
> +                     VEC_quick_push (tree, vargs,
> +                                     VEC_index (tree, vec_oprndsk, i));
> +                     VEC_quick_push (tree, vargs,
> +                                     VEC_index (tree, vec_oprndsk, i + 1));
> +                   }
> +                 new_stmt = gimple_build_call_vec (fndecl, vargs);
> +                 new_temp = make_ssa_name (vec_dest, new_stmt);
> +                 gimple_call_set_lhs (new_stmt, new_temp);
> +                 vect_finish_stmt_generation (stmt, new_stmt, gsi);
> +                 mark_symbols_for_renaming (new_stmt);
> +                 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node),
> +                                 new_stmt);
> +               }
> +
> +             for (i = 0; i < nargs; i++)
> +               {
> +                 VEC (tree, heap) *vec_oprndsi
> +                   = (VEC (tree, heap) *)
> +                     VEC_index (slp_void_p, vec_defs, i);
> +                 VEC_free (tree, heap, vec_oprndsi);
> +               }
> +             VEC_free (slp_void_p, heap, vec_defs);
> +             continue;
> +           }
> +
>          for (i = 0; i < nargs; i++)
>            {
>              op = gimple_call_arg (stmt, i);
> @@ -1804,7 +1893,8 @@ vectorizable_call (gimple stmt, gimple_s
>     lhs = gimple_call_lhs (stmt);
>   new_stmt = gimple_build_assign (lhs, build_zero_cst (type));
>   set_vinfo_for_stmt (new_stmt, stmt_info);
> -  set_vinfo_for_stmt (stmt, NULL);
> +  if (!slp_node)
> +    set_vinfo_for_stmt (stmt, NULL);
>   STMT_VINFO_STMT (stmt_info) = new_stmt;
>   gsi_replace (gsi, new_stmt, false);
>   SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
> @@ -5265,7 +5355,7 @@ vect_analyze_stmt (gimple stmt, bool *ne
>             || vectorizable_operation (stmt, NULL, NULL, NULL)
>             || vectorizable_assignment (stmt, NULL, NULL, NULL)
>             || vectorizable_load (stmt, NULL, NULL, NULL, NULL)
> -            || vectorizable_call (stmt, NULL, NULL)
> +           || vectorizable_call (stmt, NULL, NULL, NULL)
>             || vectorizable_store (stmt, NULL, NULL, NULL)
>             || vectorizable_reduction (stmt, NULL, NULL, NULL)
>             || vectorizable_condition (stmt, NULL, NULL, NULL, 0, NULL));
> @@ -5277,6 +5367,7 @@ vect_analyze_stmt (gimple stmt, bool *ne
>                 || vectorizable_operation (stmt, NULL, NULL, node)
>                 || vectorizable_assignment (stmt, NULL, NULL, node)
>                 || vectorizable_load (stmt, NULL, NULL, node, NULL)
> +               || vectorizable_call (stmt, NULL, NULL, node)
>                 || vectorizable_store (stmt, NULL, NULL, node)
>                 || vectorizable_condition (stmt, NULL, NULL, NULL, 0, node));
>       }
> @@ -5391,8 +5482,7 @@ vect_transform_stmt (gimple stmt, gimple
>       break;
>
>     case call_vec_info_type:
> -      gcc_assert (!slp_node);
> -      done = vectorizable_call (stmt, gsi, &vec_stmt);
> +      done = vectorizable_call (stmt, gsi, &vec_stmt, slp_node);
>       stmt = gsi_stmt (*gsi);
>       break;
>
> --- gcc/testsuite/lib/target-supports.exp.jj    2011-11-08 09:26:58.000000000 +0100
> +++ gcc/testsuite/lib/target-supports.exp       2011-11-08 10:15:38.000000000 +0100
> @@ -3520,6 +3520,58 @@ proc check_effective_target_vect64 { } {
>     return $et_vect64_saved
>  }
>
> +# Return 1 if the target supports vector copysignf calls.
> +
> +proc check_effective_target_vect_call_copysignf { } {
> +    global et_vect_call_copysignf_saved
> +
> +    if [info exists et_vect_call_copysignf_saved] {
> +       verbose "check_effective_target_vect_call_copysignf: using cached result" 2
> +    } else {
> +       set et_vect_call_copysignf_saved 0
> +       if { [istarget i?86-*-*]
> +            || [istarget x86_64-*-*]
> +            || [istarget powerpc*-*-*] } {
> +          set et_vect_call_copysignf_saved 1
> +       }
> +    }
> +
> +    verbose "check_effective_target_vect_call_copysignf: returning $et_vect_call_copysignf_saved" 2
> +    return $et_vect_call_copysignf_saved
> +}
> +
> +# Return 1 if the target supports vector sqrtf calls.
> +
> +proc check_effective_target_vect_call_sqrtf { } {
> +    global et_vect_call_sqrtf_saved
> +
> +    if [info exists et_vect_call_sqrtf_saved] {
> +       verbose "check_effective_target_vect_call_sqrtf: using cached result" 2
> +    } else {
> +       set et_vect_call_sqrtf_saved 0
> +       if { [istarget i?86-*-*]
> +            || [istarget x86_64-*-*]
> +            || ([istarget powerpc*-*-*] && [check_vsx_hw_available]) } {
> +           set et_vect_call_sqrtf_saved 1
> +       }
> +    }
> +
> +    verbose "check_effective_target_vect_call_sqrtf: returning $et_vect_call_sqrtf_saved" 2
> +    return $et_vect_call_sqrtf_saved
> +}
> +
> +# Return 1 if the target supports vector lrint calls.
> +
> +proc check_effective_target_vect_call_lrint { } {
> +    set et_vect_call_lrint 0
> +    if { ([istarget i?86-*-*] || [istarget x86_64-*-*]) && [check_effective_target_ilp32] } {
> +       set et_vect_call_lrint 1
> +    }
> +
> +    verbose "check_effective_target_vect_call_lrint: returning $et_vect_call_lrint" 2
> +    return $et_vect_call_lrint
> +}
> +
>  # Return 1 if the target supports section-anchors
>
>  proc check_effective_target_section_anchors { } {
> --- gcc/testsuite/gcc.dg/vect/vect.exp.jj       2011-10-24 12:21:08.000000000 +0200
> +++ gcc/testsuite/gcc.dg/vect/vect.exp  2011-11-08 10:09:27.000000000 +0100
> @@ -104,9 +104,15 @@ dg-runtest [lsort [glob -nocomplain $src
>  # -ffast-math tests
>  set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
>  lappend DEFAULT_VECTCFLAGS "-ffast-math"
> -dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/fast-math-*.\[cS\]]]  \
> +dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/fast-math-\[ipsv\]*.\[cS\]]]  \
>        "" $DEFAULT_VECTCFLAGS
>
> +# -ffast-math SLP tests
> +set VECT_SLP_CFLAGS $SAVED_VECT_SLP_CFLAGS
> +lappend VECT_SLP_CFLAGS "-ffast-math"
> +dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/fast-math-bb-slp-*.\[cS\]]]  \
> +        "" $VECT_SLP_CFLAGS
> +
>  # -fno-fast-math tests
>  set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
>  lappend DEFAULT_VECTCFLAGS "-fno-fast-math"
> --- gcc/testsuite/gcc.dg/vect/fast-math-vect-call-1.c.jj        2011-11-08 09:28:12.000000000 +0100
> +++ gcc/testsuite/gcc.dg/vect/fast-math-vect-call-1.c   2011-11-08 09:57:19.000000000 +0100
> @@ -0,0 +1,81 @@
> +#include "tree-vect.h"
> +
> +extern float copysignf (float, float);
> +extern float sqrtf (float);
> +extern float fabsf (float);
> +extern void abort (void);
> +float a[64], b[64], c[64], d[64];
> +
> +__attribute__((noinline, noclone)) void
> +f1 (int n)
> +{
> +  int i;
> +  for (i = 0; i < n; i++)
> +    {
> +      a[4 * i + 0] = copysignf (b[4 * i + 0], c[4 * i + 0]) + 1.0f + sqrtf (d[4 * i + 0]);
> +      a[4 * i + 1] = copysignf (b[4 * i + 1], c[4 * i + 1]) + 2.0f + sqrtf (d[4 * i + 1]);
> +      a[4 * i + 2] = copysignf (b[4 * i + 2], c[4 * i + 2]) + 3.0f + sqrtf (d[4 * i + 2]);
> +      a[4 * i + 3] = copysignf (b[4 * i + 3], c[4 * i + 3]) + 4.0f + sqrtf (d[4 * i + 3]);
> +    }
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f2 (int n)
> +{
> +  int i;
> +  for (i = 0; i < 2 * n; i++)
> +    {
> +      a[2 * i + 0] = copysignf (b[2 * i + 0], c[2 * i + 0]) + 1.0f + sqrtf (d[2 * i + 0]);
> +      a[2 * i + 1] = copysignf (b[2 * i + 1], c[2 * i + 1]) + 2.0f + sqrtf (d[2 * i + 1]);
> +    }
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f3 (void)
> +{
> +  int i;
> +  for (i = 0; i < 64; i++)
> +    a[i] = copysignf (b[i], c[i]) + 1.0f + sqrtf (d[i]);
> +}
> +
> +__attribute__((noinline, noclone)) int
> +main1 ()
> +{
> +  int i;
> +
> +  for (i = 0; i < 64; i++)
> +    {
> +      asm ("");
> +      b[i] = (i & 1) ? -4 * i : 4 * i;
> +      c[i] = (i & 2) ? -8 * i : 8 * i;
> +      d[i] = i * i;
> +    }
> +  f1 (16);
> +  for (i = 0; i < 64; i++)
> +    if (fabsf (((i & 2) ? -4 * i : 4 * i) + 1 + (i & 3) + i - a[i]) >= 0.0001f)
> +      abort ();
> +    else
> +      a[i] = 131.25;
> +  f2 (16);
> +  for (i = 0; i < 64; i++)
> +    if (fabsf (((i & 2) ? -4 * i : 4 * i) + 1 + (i & 1) + i - a[i]) >= 0.0001f)
> +      abort ();
> +    else
> +      a[i] = 131.25;
> +  f3 ();
> +  for (i = 0; i < 64; i++)
> +    if (fabsf (((i & 2) ? -4 * i : 4 * i) + 1 + i - a[i]) >= 0.0001f)
> +      abort ();
> +  return 0;
> +}
> +
> +int
> +main ()
> +{
> +  check_vect ();
> +  return main1 ();
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 3 "vect" { target { vect_call_copysignf && vect_call_sqrtf } } } } */
> +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { vect_call_copysignf && vect_call_sqrtf } } } } */
> +/* { dg-final { cleanup-tree-dump "vect" } } */
> --- gcc/testsuite/gcc.dg/vect/fast-math-vect-call-2.c.jj        2011-11-08 09:28:12.000000000 +0100
> +++ gcc/testsuite/gcc.dg/vect/fast-math-vect-call-2.c   2011-11-08 10:03:37.000000000 +0100
> @@ -0,0 +1,128 @@
> +#include "tree-vect.h"
> +
> +extern long int lrint (double);
> +extern void abort (void);
> +long int a[64];
> +double b[64];
> +
> +__attribute__((noinline, noclone)) void
> +f1 (int n)
> +{
> +  int i;
> +  for (i = 0; i < n; i++)
> +    {
> +      a[4 * i + 0] = lrint (b[4 * i + 0]) + 1;
> +      a[4 * i + 1] = lrint (b[4 * i + 1]) + 2;
> +      a[4 * i + 2] = lrint (b[4 * i + 2]) + 3;
> +      a[4 * i + 3] = lrint (b[4 * i + 3]) + 4;
> +    }
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f2 (int n)
> +{
> +  int i;
> +  for (i = 0; i < 2 * n; i++)
> +    {
> +      a[2 * i + 0] = lrint (b[2 * i + 0]) + 1;
> +      a[2 * i + 1] = lrint (b[2 * i + 1]) + 2;
> +    }
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f3 (void)
> +{
> +  int i;
> +  for (i = 0; i < 64; i++)
> +    a[i] = lrint (b[i]) + 1;
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f4 (int n)
> +{
> +  int i;
> +  for (i = 0; i < n; i++)
> +    {
> +      a[4 * i + 0] = lrint (b[4 * i + 0]);
> +      a[4 * i + 1] = lrint (b[4 * i + 1]);
> +      a[4 * i + 2] = lrint (b[4 * i + 2]);
> +      a[4 * i + 3] = lrint (b[4 * i + 3]);
> +    }
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f5 (int n)
> +{
> +  int i;
> +  for (i = 0; i < 2 * n; i++)
> +    {
> +      a[2 * i + 0] = lrint (b[2 * i + 0]);
> +      a[2 * i + 1] = lrint (b[2 * i + 1]);
> +    }
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f6 (void)
> +{
> +  int i;
> +  for (i = 0; i < 64; i++)
> +    a[i] = lrint (b[i]);
> +}
> +
> +__attribute__((noinline, noclone)) int
> +main1 ()
> +{
> +  int i;
> +
> +  for (i = 0; i < 64; i++)
> +    {
> +      asm ("");
> +      b[i] = ((i & 1) ? -4 * i : 4 * i) + 0.25;
> +    }
> +  f1 (16);
> +  for (i = 0; i < 64; i++)
> +    if (a[i] != ((i & 1) ? -4 * i : 4 * i) + 1 + (i & 3))
> +      abort ();
> +    else
> +      a[i] = 131.25;
> +  f2 (16);
> +  for (i = 0; i < 64; i++)
> +    if (a[i] != ((i & 1) ? -4 * i : 4 * i) + 1 + (i & 1))
> +      abort ();
> +    else
> +      a[i] = 131.25;
> +  f3 ();
> +  for (i = 0; i < 64; i++)
> +    if (a[i] != ((i & 1) ? -4 * i : 4 * i) + 1)
> +      abort ();
> +    else
> +      a[i] = 131.25;
> +  f4 (16);
> +  for (i = 0; i < 64; i++)
> +    if (a[i] != ((i & 1) ? -4 * i : 4 * i))
> +      abort ();
> +    else
> +      a[i] = 131.25;
> +  f5 (16);
> +  for (i = 0; i < 64; i++)
> +    if (a[i] != ((i & 1) ? -4 * i : 4 * i))
> +      abort ();
> +    else
> +      a[i] = 131.25;
> +  f6 ();
> +  for (i = 0; i < 64; i++)
> +    if (a[i] != ((i & 1) ? -4 * i : 4 * i))
> +      abort ();
> +  return 0;
> +}
> +
> +int
> +main ()
> +{
> +  check_vect ();
> +  return main1 ();
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" { target vect_call_lrint } } } */
> +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { target vect_call_lrint } } } */
> +/* { dg-final { cleanup-tree-dump "vect" } } */
> --- gcc/testsuite/gcc.dg/vect/fast-math-bb-slp-call-1.c.jj      2011-11-08 09:46:00.000000000 +0100
> +++ gcc/testsuite/gcc.dg/vect/fast-math-bb-slp-call-1.c 2011-11-08 09:49:49.000000000 +0100
> @@ -0,0 +1,49 @@
> +#include "tree-vect.h"
> +
> +extern float copysignf (float, float);
> +extern float sqrtf (float);
> +extern float fabsf (float);
> +extern void abort (void);
> +float a[64], b[64], c[64], d[64];
> +
> +__attribute__((noinline, noclone)) void
> +f1 (void)
> +{
> +  a[0] = copysignf (b[0], c[0]) + 1.0f + sqrtf (d[0]);
> +  a[1] = copysignf (b[1], c[1]) + 2.0f + sqrtf (d[1]);
> +  a[2] = copysignf (b[2], c[2]) + 3.0f + sqrtf (d[2]);
> +  a[3] = copysignf (b[3], c[3]) + 4.0f + sqrtf (d[3]);
> +  a[4] = copysignf (b[4], c[4]) + 5.0f + sqrtf (d[4]);
> +  a[5] = copysignf (b[5], c[5]) + 6.0f + sqrtf (d[5]);
> +  a[6] = copysignf (b[6], c[6]) + 7.0f + sqrtf (d[6]);
> +  a[7] = copysignf (b[7], c[7]) + 8.0f + sqrtf (d[7]);
> +}
> +
> +__attribute__((noinline, noclone)) int
> +main1 ()
> +{
> +  int i;
> +
> +  for (i = 0; i < 8; i++)
> +    {
> +      asm ("");
> +      b[i] = (i & 1) ? -4 * i : 4 * i;
> +      c[i] = (i & 2) ? -8 * i : 8 * i;
> +      d[i] = i * i;
> +    }
> +  f1 ();
> +  for (i = 0; i < 8; i++)
> +    if (fabsf (((i & 2) ? -4 * i : 4 * i) + 1 + i + i - a[i]) >= 0.0001f)
> +      abort ();
> +  return 0;
> +}
> +
> +int
> +main ()
> +{
> +  check_vect ();
> +  return main1 ();
> +}
> +
> +/* { dg-final { scan-tree-dump-times "basic block vectorized using SLP" 1 "slp" { target { vect_call_copysignf && vect_call_sqrtf } } } } */
> +/* { dg-final { cleanup-tree-dump "slp" } } */
> --- gcc/testsuite/gcc.dg/vect/fast-math-bb-slp-call-2.c.jj      2011-11-08 09:46:04.000000000 +0100
> +++ gcc/testsuite/gcc.dg/vect/fast-math-bb-slp-call-2.c 2011-11-08 10:11:20.000000000 +0100
> @@ -0,0 +1,65 @@
> +#include "tree-vect.h"
> +
> +extern long int lrint (double);
> +extern void abort (void);
> +long int a[64];
> +double b[64];
> +
> +__attribute__((noinline, noclone)) void
> +f1 (void)
> +{
> +  a[0] = lrint (b[0]) + 1;
> +  a[1] = lrint (b[1]) + 2;
> +  a[2] = lrint (b[2]) + 3;
> +  a[3] = lrint (b[3]) + 4;
> +  a[4] = lrint (b[4]) + 5;
> +  a[5] = lrint (b[5]) + 6;
> +  a[6] = lrint (b[6]) + 7;
> +  a[7] = lrint (b[7]) + 8;
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f2 (void)
> +{
> +  a[0] = lrint (b[0]);
> +  a[1] = lrint (b[1]);
> +  a[2] = lrint (b[2]);
> +  a[3] = lrint (b[3]);
> +  a[4] = lrint (b[4]);
> +  a[5] = lrint (b[5]);
> +  a[6] = lrint (b[6]);
> +  a[7] = lrint (b[7]);
> +}
> +
> +__attribute__((noinline, noclone)) int
> +main1 ()
> +{
> +  int i;
> +
> +  for (i = 0; i < 8; i++)
> +    {
> +      asm ("");
> +      b[i] = ((i & 1) ? -4 * i : 4 * i) + 0.25;
> +    }
> +  f1 ();
> +  for (i = 0; i < 8; i++)
> +    if (a[i] != ((i & 1) ? -4 * i : 4 * i) + 1 + i)
> +      abort ();
> +    else
> +      a[i] = 131.25;
> +  f2 ();
> +  for (i = 0; i < 8; i++)
> +    if (a[i] != ((i & 1) ? -4 * i : 4 * i))
> +      abort ();
> +  return 0;
> +}
> +
> +int
> +main ()
> +{
> +  check_vect ();
> +  return main1 ();
> +}
> +
> +/* { dg-final { scan-tree-dump-times "basic block vectorized using SLP" 2 "slp" { target vect_call_lrint } } } */
> +/* { dg-final { cleanup-tree-dump "slp" } } */
>
>
>        Jakub
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2011-11-08  9:48 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-10-20 23:56 [RFC PATCH] SLP vectorize calls Jakub Jelinek
2011-10-21 12:45 ` Ira Rosen
2011-10-21 13:31   ` Jakub Jelinek
2011-10-21 14:26     ` Ira Rosen
2011-10-21 14:42       ` Jakub Jelinek
2011-10-21 15:51         ` Ira Rosen
2011-11-07 18:44       ` [PATCH] SLP vectorize calls (take 2) Jakub Jelinek
2011-11-08  8:00         ` Ira Rosen
2011-11-08  8:03           ` Jakub Jelinek
2011-11-08  8:22             ` Ira Rosen
2011-11-08 10:03               ` [PATCH] SLP vectorize calls (take 3) Jakub Jelinek
2011-11-08 10:15                 ` Ira Rosen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).