public inbox for gcc-cvs@sourceware.org
[gcc r14-2490] vect: Adjust vectorizable_load costing on VMAT_ELEMENTWISE and VMAT_STRIDED_SLP
From: Kewen Lin @ 2023-07-13  4:14 UTC
  To: gcc-cvs

https://gcc.gnu.org/g:090d83c25508c108756a534c765d2fa8c07eb261

commit r14-2490-g090d83c25508c108756a534c765d2fa8c07eb261
Author: Kewen Lin <linkw@linux.ibm.com>
Date:   Wed Jul 12 21:23:21 2023 -0500

    vect: Adjust vectorizable_load costing on VMAT_ELEMENTWISE and VMAT_STRIDED_SLP
    
    This patch adjusts the cost handling for VMAT_ELEMENTWISE
    and VMAT_STRIDED_SLP in function vectorizable_load.  We no
    longer call vect_model_load_cost for them.
    
    As PR82255 shows, we don't always need a vector construction
    there; moving the costing next to the transform lets us cost
    the vector construction only when it is actually needed (see
    the strided-loop sketch after the ChangeLog).  It also counts
    the number of loads consistently in some cases.
    
            PR tree-optimization/82255
    
    gcc/ChangeLog:
    
            * tree-vect-stmts.cc (vectorizable_load): Adjust the cost handling
            on VMAT_ELEMENTWISE and VMAT_STRIDED_SLP without calling
            vect_model_load_cost.
            (vect_model_load_cost): Assert it no longer gets VMAT_ELEMENTWISE
            and VMAT_STRIDED_SLP, and remove the related handling.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.dg/vect/costmodel/ppc/costmodel-pr82255.c: New test.
    
    2023-06-13  Bill Schmidt  <wschmidt@linux.ibm.com>
                Kewen Lin  <linkw@linux.ibm.com>
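
For illustration, the kind of access that really does need a vector
construction is a load with a loop-invariant, non-power-of-2 stride,
as in the minimal sketch below (adapted from the comment kept in the
patched code; the function name `sum_strided' is made up).  Contrast
this with the new testcase in the diff, where the fully unrolled
accesses are contiguous, a plain vector load suffices, and no
constructor is built:

    /* Each vector is assembled from scalar element loads
       ({tmp1, tmp2, ...} in the comment in the patched code),
       so costing a vec_construct here matches the generated code.  */
    int
    sum_strided (int *array, int n, int stride)
    {
      int sum = 0;
      for (int i = 0; i < n; i += stride)
        sum += array[i];
      return sum;
    }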

Diff:
---
 .../gcc.dg/vect/costmodel/ppc/costmodel-pr82255.c  |  31 ++++
 gcc/tree-vect-stmts.cc                             | 170 +++++++++++++--------
 2 files changed, 134 insertions(+), 67 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-pr82255.c b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-pr82255.c
new file mode 100644
index 00000000000..9317ee2e15b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-pr82255.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+
+/* PR82255: Ensure we don't require a vec_construct cost when we aren't
+   going to generate a strided load.  */
+
+extern int abs (int __x) __attribute__ ((__nothrow__, __leaf__))
+__attribute__ ((__const__));
+
+static int
+foo (unsigned char *w, int i, unsigned char *x, int j)
+{
+  int tot = 0;
+  for (int a = 0; a < 16; a++)
+    {
+#pragma GCC unroll 16
+      for (int b = 0; b < 16; b++)
+	tot += abs (w[b] - x[b]);
+      w += i;
+      x += j;
+    }
+  return tot;
+}
+
+void
+bar (unsigned char *w, unsigned char *x, int i, int *result)
+{
+  *result = foo (w, 16, x, i);
+}
+
+/* { dg-final { scan-tree-dump-times "vec_construct" 0 "vect" } } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 616c5293875..4622d6a04ef 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1152,7 +1152,9 @@ vect_model_load_cost (vec_info *vinfo,
 		      stmt_vector_for_cost *cost_vec)
 {
   gcc_assert ((memory_access_type != VMAT_GATHER_SCATTER || !gs_info->decl)
-	      && memory_access_type != VMAT_INVARIANT);
+	      && memory_access_type != VMAT_INVARIANT
+	      && memory_access_type != VMAT_ELEMENTWISE
+	      && memory_access_type != VMAT_STRIDED_SLP);
 
   unsigned int inside_cost = 0, prologue_cost = 0;
   bool grouped_access_p = STMT_VINFO_GROUPED_ACCESS (stmt_info);
@@ -1239,8 +1241,7 @@ vect_model_load_cost (vec_info *vinfo,
     }
 
   /* The loads themselves.  */
-  if (memory_access_type == VMAT_ELEMENTWISE
-      || memory_access_type == VMAT_GATHER_SCATTER)
+  if (memory_access_type == VMAT_GATHER_SCATTER)
     {
       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
       unsigned int assumed_nunits = vect_nunits_for_cost (vectype);
@@ -1262,10 +1263,10 @@ vect_model_load_cost (vec_info *vinfo,
 			alignment_support_scheme, misalignment, first_stmt_p,
 			&inside_cost, &prologue_cost, 
 			cost_vec, cost_vec, true);
-  if (memory_access_type == VMAT_ELEMENTWISE
-      || memory_access_type == VMAT_STRIDED_SLP
-      || (memory_access_type == VMAT_GATHER_SCATTER
-	  && gs_info->ifn == IFN_LAST && !gs_info->decl))
+
+  if (memory_access_type == VMAT_GATHER_SCATTER
+      && gs_info->ifn == IFN_LAST
+      && !gs_info->decl)
     inside_cost += record_stmt_cost (cost_vec, ncopies, vec_construct,
 				     stmt_info, 0, vect_body);
 
@@ -9830,14 +9831,6 @@ vectorizable_load (vec_info *vinfo,
   if (memory_access_type == VMAT_ELEMENTWISE
       || memory_access_type == VMAT_STRIDED_SLP)
     {
-      if (costing_p)
-	{
-	  vect_model_load_cost (vinfo, stmt_info, ncopies, vf,
-				memory_access_type, alignment_support_scheme,
-				misalignment, &gs_info, slp_node, cost_vec);
-	  return true;
-	}
-
       gimple_stmt_iterator incr_gsi;
       bool insert_after;
       tree offvar;
@@ -9849,6 +9842,7 @@ vectorizable_load (vec_info *vinfo,
       unsigned int const_nunits = nunits.to_constant ();
       unsigned HOST_WIDE_INT cst_offset = 0;
       tree dr_offset;
+      unsigned int inside_cost = 0;
 
       gcc_assert (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
       gcc_assert (!nested_in_vect_loop);
@@ -9863,6 +9857,7 @@ vectorizable_load (vec_info *vinfo,
 	  first_stmt_info = stmt_info;
 	  first_dr_info = dr_info;
 	}
+
       if (slp && grouped_load)
 	{
 	  group_size = DR_GROUP_SIZE (first_stmt_info);
@@ -9879,43 +9874,44 @@ vectorizable_load (vec_info *vinfo,
 	  ref_type = reference_alias_ptr_type (DR_REF (dr_info->dr));
 	}
 
-      dr_offset = get_dr_vinfo_offset (vinfo, first_dr_info);
-      stride_base
-	= fold_build_pointer_plus
-	    (DR_BASE_ADDRESS (first_dr_info->dr),
-	     size_binop (PLUS_EXPR,
-			 convert_to_ptrofftype (dr_offset),
-			 convert_to_ptrofftype (DR_INIT (first_dr_info->dr))));
-      stride_step = fold_convert (sizetype, DR_STEP (first_dr_info->dr));
+      if (!costing_p)
+	{
+	  dr_offset = get_dr_vinfo_offset (vinfo, first_dr_info);
+	  stride_base = fold_build_pointer_plus (
+	    DR_BASE_ADDRESS (first_dr_info->dr),
+	    size_binop (PLUS_EXPR, convert_to_ptrofftype (dr_offset),
+			convert_to_ptrofftype (DR_INIT (first_dr_info->dr))));
+	  stride_step = fold_convert (sizetype, DR_STEP (first_dr_info->dr));
 
-      /* For a load with loop-invariant (but other than power-of-2)
-         stride (i.e. not a grouped access) like so:
+	  /* For a load with loop-invariant (but other than power-of-2)
+	     stride (i.e. not a grouped access) like so:
 
-	   for (i = 0; i < n; i += stride)
-	     ... = array[i];
+	       for (i = 0; i < n; i += stride)
+		 ... = array[i];
 
-	 we generate a new induction variable and new accesses to
-	 form a new vector (or vectors, depending on ncopies):
+	     we generate a new induction variable and new accesses to
+	     form a new vector (or vectors, depending on ncopies):
 
-	   for (j = 0; ; j += VF*stride)
-	     tmp1 = array[j];
-	     tmp2 = array[j + stride];
-	     ...
-	     vectemp = {tmp1, tmp2, ...}
-         */
+	       for (j = 0; ; j += VF*stride)
+		 tmp1 = array[j];
+		 tmp2 = array[j + stride];
+		 ...
+		 vectemp = {tmp1, tmp2, ...}
+	     */
 
-      ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (stride_step), stride_step,
-			    build_int_cst (TREE_TYPE (stride_step), vf));
+	  ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (stride_step), stride_step,
+				build_int_cst (TREE_TYPE (stride_step), vf));
 
-      standard_iv_increment_position (loop, &incr_gsi, &insert_after);
+	  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
 
-      stride_base = cse_and_gimplify_to_preheader (loop_vinfo, stride_base);
-      ivstep = cse_and_gimplify_to_preheader (loop_vinfo, ivstep);
-      create_iv (stride_base, PLUS_EXPR, ivstep, NULL,
-		 loop, &incr_gsi, insert_after,
-		 &offvar, NULL);
+	  stride_base = cse_and_gimplify_to_preheader (loop_vinfo, stride_base);
+	  ivstep = cse_and_gimplify_to_preheader (loop_vinfo, ivstep);
+	  create_iv (stride_base, PLUS_EXPR, ivstep, NULL,
+		     loop, &incr_gsi, insert_after,
+		     &offvar, NULL);
 
-      stride_step = cse_and_gimplify_to_preheader (loop_vinfo, stride_step);
+	  stride_step = cse_and_gimplify_to_preheader (loop_vinfo, stride_step);
+	}
 
       running_off = offvar;
       alias_off = build_int_cst (ref_type, 0);
@@ -9982,11 +9978,23 @@ vectorizable_load (vec_info *vinfo,
       unsigned int n_groups = 0;
       for (j = 0; j < ncopies; j++)
 	{
-	  if (nloads > 1)
+	  if (nloads > 1 && !costing_p)
 	    vec_alloc (v, nloads);
 	  gimple *new_stmt = NULL;
 	  for (i = 0; i < nloads; i++)
 	    {
+	      if (costing_p)
+		{
+		  if (VECTOR_TYPE_P (ltype))
+		    vect_get_load_cost (vinfo, stmt_info, 1,
+					alignment_support_scheme, misalignment,
+					false, &inside_cost, nullptr, cost_vec,
+					cost_vec, true);
+		  else
+		    inside_cost += record_stmt_cost (cost_vec, 1, scalar_load,
+						     stmt_info, 0, vect_body);
+		  continue;
+		}
 	      tree this_off = build_int_cst (TREE_TYPE (alias_off),
 					     group_el * elsz + cst_offset);
 	      tree data_ref = build2 (MEM_REF, ltype, running_off, this_off);
@@ -10017,42 +10025,70 @@ vectorizable_load (vec_info *vinfo,
 		  group_el = 0;
 		}
 	    }
+
 	  if (nloads > 1)
 	    {
-	      tree vec_inv = build_constructor (lvectype, v);
-	      new_temp = vect_init_vector (vinfo, stmt_info,
-					   vec_inv, lvectype, gsi);
-	      new_stmt = SSA_NAME_DEF_STMT (new_temp);
-	      if (lvectype != vectype)
+	      if (costing_p)
+		inside_cost += record_stmt_cost (cost_vec, 1, vec_construct,
+						 stmt_info, 0, vect_body);
+	      else
 		{
-		  new_stmt = gimple_build_assign (make_ssa_name (vectype),
-						  VIEW_CONVERT_EXPR,
-						  build1 (VIEW_CONVERT_EXPR,
-							  vectype, new_temp));
-		  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+		  tree vec_inv = build_constructor (lvectype, v);
+		  new_temp = vect_init_vector (vinfo, stmt_info, vec_inv,
+					       lvectype, gsi);
+		  new_stmt = SSA_NAME_DEF_STMT (new_temp);
+		  if (lvectype != vectype)
+		    {
+		      new_stmt
+			= gimple_build_assign (make_ssa_name (vectype),
+					       VIEW_CONVERT_EXPR,
+					       build1 (VIEW_CONVERT_EXPR,
+						       vectype, new_temp));
+		      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt,
+						   gsi);
+		    }
 		}
 	    }
 
-	  if (slp)
+	  if (!costing_p)
 	    {
-	      if (slp_perm)
-		dr_chain.quick_push (gimple_assign_lhs (new_stmt));
+	      if (slp)
+		{
+		  if (slp_perm)
+		    dr_chain.quick_push (gimple_assign_lhs (new_stmt));
+		  else
+		    SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
+		}
 	      else
-		SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
-	    }
-	  else
-	    {
-	      if (j == 0)
-		*vec_stmt = new_stmt;
-	      STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+		{
+		  if (j == 0)
+		    *vec_stmt = new_stmt;
+		  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+		}
 	    }
 	}
       if (slp_perm)
 	{
 	  unsigned n_perms;
-	  vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, gsi, vf,
-					false, &n_perms);
+	  if (costing_p)
+	    {
+	      unsigned n_loads;
+	      vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL, vf,
+					    true, &n_perms, &n_loads);
+	      inside_cost += record_stmt_cost (cost_vec, n_perms, vec_perm,
+					       first_stmt_info, 0, vect_body);
+	    }
+	  else
+	    vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, gsi, vf,
+					  false, &n_perms);
 	}
+
+      if (costing_p && dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "vect_model_load_cost: inside_cost = %u, "
+			 "prologue_cost = 0 .\n",
+			 inside_cost);
+
       return true;
     }
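
After this patch, the behavior the new testcase checks can also be
observed by hand along the lines below (a sketch: the exact options
depend on the target, the dump file suffix varies by GCC version,
and the loop has to be vectorized for the scan to be meaningful):

    $ gcc -O2 -ftree-vectorize -fdump-tree-vect-details \
          -c costmodel-pr82255.c
    $ grep -c "vec_construct" costmodel-pr82255.c.*.vect
    0

This mirrors the dg-final directive above, which expects zero
occurrences of "vec_construct" in the vect dump.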
