From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <rguenth@sourceware.org>
Received: by sourceware.org (Postfix, from userid 1666)
 id 06242385AC19; Mon, 30 Aug 2021 12:04:18 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 06242385AC19
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Type: text/plain; charset="utf-8"
From: Richard Biener <rguenth@gcc.gnu.org>
To: gcc-cvs@gcc.gnu.org
Subject: [gcc r12-3222] tree-optimization/102128 - rework if-converted BB vect
 heuristic
X-Act-Checkin: gcc
X-Git-Author: Richard Biener <rguenther@suse.de>
X-Git-Refname: refs/heads/master
X-Git-Oldrev: 1313111fdec0d0de4228b5e839ca728b3e4b106e
X-Git-Newrev: 89f33f44addbf9853bc3e6677db1fa941713cb6c
Message-Id: <20210830120418.06242385AC19@sourceware.org>
Date: Mon, 30 Aug 2021 12:04:18 +0000 (GMT)
X-BeenThere: gcc-cvs@gcc.gnu.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Gcc-cvs mailing list <gcc-cvs.gcc.gnu.org>
List-Unsubscribe: <https://gcc.gnu.org/mailman/options/gcc-cvs>,
 <mailto:gcc-cvs-request@gcc.gnu.org?subject=unsubscribe>
List-Archive: <https://gcc.gnu.org/pipermail/gcc-cvs/>
List-Help: <mailto:gcc-cvs-request@gcc.gnu.org?subject=help>
List-Subscribe: <https://gcc.gnu.org/mailman/listinfo/gcc-cvs>,
 <mailto:gcc-cvs-request@gcc.gnu.org?subject=subscribe>
X-List-Received-Date: Mon, 30 Aug 2021 12:04:18 -0000

https://gcc.gnu.org/g:89f33f44addbf9853bc3e6677db1fa941713cb6c

commit r12-3222-g89f33f44addbf9853bc3e6677db1fa941713cb6c
Author: Richard Biener <rguenther@suse.de>
Date:   Mon Aug 30 12:56:26 2021 +0200

    tree-optimization/102128 - rework if-converted BB vect heuristic
    
    This reworks the previous attempt to avoid leaving if-converted
    scalar code around in BB vectorized loop bodies so that independent
    subgraphs are still costed separately, which should address the
    observed regression with 519.lbm_r.
    
    For this to work we now first cost all subgraphs and only after
    doing that proceed to emit vectorized code.
    
    2021-08-30  Richard Biener  <rguenther@suse.de>
    
            PR tree-optimization/102128
            * tree-vect-slp.c (vect_bb_vectorization_profitable_p):
            Move scanning for if-converted scalar code to the caller
            and instead delay clearing the visited flag for profitable
            subgraphs.
            (vect_slp_region): Cost all subgraphs before scheduling.
            For if-converted BB vectorization scan for scalar COND_EXPRs
            and do not vectorize if any are found and the cost model is
            very-cheap.

Diff:
---
 gcc/tree-vect-slp.c | 112 +++++++++++++++++++++++++++-------------------------
 1 file changed, 58 insertions(+), 54 deletions(-)
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 4d688c7a267..4ca24408249 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -5275,34 +5275,6 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
       vector_costs.safe_splice (instance->cost_vec);
       instance->cost_vec.release ();
     }
-  /* When we're vectorizing an if-converted loop body with the
-     very-cheap cost model make sure we vectorized all if-converted
-     code.  */
-  bool force_not_profitable = false;
-  if (orig_loop && flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP)
-    {
-      gcc_assert (bb_vinfo->bbs.length () == 1);
-      for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
-	   !gsi_end_p (gsi); gsi_next (&gsi))
-	{
-	  /* The costing above left us with DCEable vectorized scalar
-	     stmts having the visited flag set.  */
-	  if (gimple_visited_p (gsi_stmt (gsi)))
-	    continue;
-
-	  if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
-	    if (gimple_assign_rhs_code (ass) == COND_EXPR)
-	      {
-		force_not_profitable = true;
-		break;
-	      }
-	}
-    }
-
-  /* Unset visited flag.  */
-  stmt_info_for_cost *cost;
-  FOR_EACH_VEC_ELT (scalar_costs, i, cost)
-    gimple_set_visited  (cost->stmt_info->stmt, false);
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
@@ -5319,6 +5291,7 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
     li_scalar_costs (scalar_costs.length ());
   auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
     li_vector_costs (vector_costs.length ());
+  stmt_info_for_cost *cost;
   FOR_EACH_VEC_ELT (scalar_costs, i, cost)
     {
       unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
@@ -5341,6 +5314,7 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
   /* Now cost the portions individually.  */
   unsigned vi = 0;
   unsigned si = 0;
+  bool profitable = true;
   while (si < li_scalar_costs.length ()
 	 && vi < li_vector_costs.length ())
     {
@@ -5407,30 +5381,29 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
 	 example).  */
       if (vec_outside_cost + vec_inside_cost > scalar_cost)
 	{
-	  scalar_costs.release ();
-	  vector_costs.release ();
-	  return false;
+	  profitable = false;
+	  break;
 	}
     }
-  if (vi < li_vector_costs.length ())
+  if (profitable && vi < li_vector_costs.length ())
     {
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_NOTE, vect_location,
 			 "Excess vector cost for part in loop %d:\n",
 			 li_vector_costs[vi].first);
-      scalar_costs.release ();
-      vector_costs.release ();
-      return false;
+      profitable = false;
     }
 
-  if (dump_enabled_p () && force_not_profitable)
-    dump_printf_loc (MSG_NOTE, vect_location,
-		     "not profitable because of unprofitable if-converted "
-		     "scalar code\n");
+  /* Unset visited flag.  This is delayed when the subgraph is profitable
+     and we process the loop for remaining unvectorized if-converted code.  */
+  if (!orig_loop || !profitable)
+    FOR_EACH_VEC_ELT (scalar_costs, i, cost)
+      gimple_set_visited  (cost->stmt_info->stmt, false);
 
   scalar_costs.release ();
   vector_costs.release ();
-  return !force_not_profitable;
+
+  return profitable;
 }
 
 /* qsort comparator for lane defs.  */
@@ -5884,9 +5857,8 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
 
 	  bb_vinfo->shared->check_datarefs ();
 
-	  unsigned i;
-	  slp_instance instance;
-	  FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
+	  auto_vec<slp_instance> profitable_subgraphs;
+	  for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
 	    {
 	      if (instance->subgraph_entries.is_empty ())
 		continue;
@@ -5894,9 +5866,7 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
 	      vect_location = instance->location ();
 	      if (!unlimited_cost_model (NULL)
 		  && !vect_bb_vectorization_profitable_p
-			(bb_vinfo,
-			 orig_loop ? BB_VINFO_SLP_INSTANCES (bb_vinfo)
-			 : instance->subgraph_entries, orig_loop))
+			(bb_vinfo, instance->subgraph_entries, orig_loop))
 		{
 		  if (dump_enabled_p ())
 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -5908,15 +5878,54 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
 	      if (!dbg_cnt (vect_slp))
 		continue;
 
+	      profitable_subgraphs.safe_push (instance);
+	    }
+
+	  /* When we're vectorizing an if-converted loop body with the
+	     very-cheap cost model make sure we vectorized all if-converted
+	     code.  */
+	  if (!profitable_subgraphs.is_empty ()
+	      && orig_loop)
+	    {
+	      gcc_assert (bb_vinfo->bbs.length () == 1);
+	      for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
+		   !gsi_end_p (gsi); gsi_next (&gsi))
+		{
+		  /* The costing above left us with DCEable vectorized scalar
+		     stmts having the visited flag set on profitable
+		     subgraphs.  Do the delayed clearing of the flag here.  */
+		  if (gimple_visited_p (gsi_stmt (gsi)))
+		    {
+		      gimple_set_visited (gsi_stmt (gsi), false);
+		      continue;
+		    }
+		  if (flag_vect_cost_model != VECT_COST_MODEL_VERY_CHEAP)
+		    continue;
+
+		  if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
+		    if (gimple_assign_rhs_code (ass) == COND_EXPR)
+		      {
+			if (!profitable_subgraphs.is_empty ()
+			    && dump_enabled_p ())
+			  dump_printf_loc (MSG_NOTE, vect_location,
+					   "not profitable because of "
+					   "unprofitable if-converted scalar "
+					   "code\n");
+			profitable_subgraphs.truncate (0);
+		      }
+		}
+	    }
+
+	  /* Finally schedule the profitable subgraphs.  */
+	  for (slp_instance instance : profitable_subgraphs)
+	    {
 	      if (!vectorized && dump_enabled_p ())
 		dump_printf_loc (MSG_NOTE, vect_location,
 				 "Basic block will be vectorized "
 				 "using SLP\n");
 	      vectorized = true;
 
-	      vect_schedule_slp (bb_vinfo,
-				 orig_loop ? BB_VINFO_SLP_INSTANCES (bb_vinfo)
-				 : instance->subgraph_entries);
+	      vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
 
 	      unsigned HOST_WIDE_INT bytes;
 	      if (dump_enabled_p ())
@@ -5931,11 +5940,6 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
 				     "basic block part vectorized using "
 				     "variable length vectors\n");
 		}
-
-	      /* When we're called from loop vectorization we're considering
-		 all subgraphs at once.  */
-	      if (orig_loop)
-		break;
 	    }
 	}
       else