public inbox for gcc-patches@gcc.gnu.org
* [PATCH] Do not count unused scalar use when marking STMT_VINFO_LIVE_P [PR113091]
@ 2023-12-29 10:28 Feng Xue OS
  2024-01-10 23:42 ` PING: " Feng Xue OS
  2024-01-11  9:46 ` Richard Biener
  0 siblings, 2 replies; 6+ messages in thread
From: Feng Xue OS @ 2023-12-29 10:28 UTC (permalink / raw)
  To: gcc-patches

This patch fixes overestimation of the SLP vector-to-scalar cost for
STMT_VINFO_LIVE_P statements.  When pattern recognition is involved, a
statement whose definition is consumed in some pattern may not be
included in the final replacement pattern statements, and would be
skipped when building the SLP graph.

     * Original
      char a_c = *(char *) a;
      char b_c = *(char *) b;
      unsigned short a_s = (unsigned short) a_c;
      int a_i = (int) a_s;
      int b_i = (int) b_c;
      int r_i = a_i - b_i;

     * After pattern replacement
      a_s = (unsigned short) a_c;
      a_i = (int) a_s;

      patt_b_s = (unsigned short) b_c;    // b_i = (int) b_c
      patt_b_i = (int) patt_b_s;          // b_i = (int) b_c

      patt_r_s = widen_minus(a_c, b_c);   // r_i = a_i - b_i
      patt_r_i = (int) patt_r_s;          // r_i = a_i - b_i

The definitions of a_i (original statement) and b_i (pattern statement)
are related to, but not actually part of, the widen_minus pattern.
Vectorizing the pattern does not cause these definition statements to
be marked as PURE_SLP.  For this case, we need to recursively check
whether their uses are all absorbed into vectorized code.  But there
is an exception: a use may participate in a vectorized operation via
an external SLP node containing that use as an element.
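
For illustration, here is a hedged C sketch of that exception
(hypothetical, not taken from the PR; whether the vectorizer really
builds an external SLP node here depends on the target and on costing):

      /* d0..d3 are consumed by the widening pattern for array[], but
         are also needed as scalar elements to build the vectors stored
         to out[].  If out[]'s operand becomes an external SLP node,
         d0..d3 keep a live scalar use even though all their pattern
         uses are vectorized.  */
      void bar (char *a, char *b, unsigned *array, int *out)
      {
        int d0 = a[0] - b[0];
        int d1 = a[1] - b[1];
        int d2 = a[2] - b[2];
        int d3 = a[3] - b[3];

        array[0] = d0;
        array[1] = d1;
        array[2] = d2;
        array[3] = d3;

        out[0] = d0 * 3;
        out[1] = d1 * 3;
        out[2] = d2 * 3;
        out[3] = d3 * 3;
      }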

Feng

---
 .../gcc.target/aarch64/bb-slp-pr113091.c      |  22 ++
 gcc/tree-vect-slp.cc                          | 189 ++++++++++++++----
 2 files changed, 172 insertions(+), 39 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/bb-slp-pr113091.c

diff --git a/gcc/testsuite/gcc.target/aarch64/bb-slp-pr113091.c b/gcc/testsuite/gcc.target/aarch64/bb-slp-pr113091.c
new file mode 100644
index 00000000000..ff822e90b4a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bb-slp-pr113091.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-slp-details -ftree-slp-vectorize" } */
+
+int test(unsigned array[8]);
+
+int foo(char *a, char *b)
+{
+  unsigned array[8];
+
+  array[0] = (a[0] - b[0]);
+  array[1] = (a[1] - b[1]);
+  array[2] = (a[2] - b[2]);
+  array[3] = (a[3] - b[3]);
+  array[4] = (a[4] - b[4]);
+  array[5] = (a[5] - b[5]);
+  array[6] = (a[6] - b[6]);
+  array[7] = (a[7] - b[7]);
+
+  return test(array);
+}
+
+/* { dg-final { scan-tree-dump-times "Basic block will be vectorized using SLP" 1 "slp2" } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index a82fca45161..d36ff37114e 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -6418,6 +6418,84 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
   return res;
 }
 
+/* Given a definition DEF, analyze whether it will have any live scalar use
+   after performing SLP vectorization, whose information is represented by
+   BB_VINFO, and record the result into hash map SCALAR_USE_MAP as a cache
+   for later fast checking.  */
+
+static bool
+vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def,
+			hash_map<tree, bool> &scalar_use_map)
+{
+  imm_use_iterator use_iter;
+  gimple *use_stmt;
+
+  if (bool *res = scalar_use_map.get (def))
+    return *res;
+
+  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
+    {
+      if (is_gimple_debug (use_stmt))
+	continue;
+
+      stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
+
+      if (!use_stmt_info)
+	break;
+
+      if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
+	continue;
+
+      /* Do not step forward when encountering a PHI statement, since it may
+	 involve a cyclic reference and cause infinite recursion.  */
+      if (gimple_code (use_stmt) == GIMPLE_PHI)
+	break;
+
+      /* When pattern recognition is involved, a statement whose definition is
+	 consumed in some pattern may not be included in the final replacement
+	 pattern statements, and so would be skipped when building the SLP graph.
+
+	 * Original
+	  char a_c = *(char *) a;
+	  char b_c = *(char *) b;
+	  unsigned short a_s = (unsigned short) a_c;
+	  int a_i = (int) a_s;
+	  int b_i = (int) b_c;
+	  int r_i = a_i - b_i;
+
+	 * After pattern replacement
+	  a_s = (unsigned short) a_c;
+	  a_i = (int) a_s;
+
+	  patt_b_s = (unsigned short) b_c;    // b_i = (int) b_c
+	  patt_b_i = (int) patt_b_s;          // b_i = (int) b_c
+
+	  patt_r_s = widen_minus(a_c, b_c);   // r_i = a_i - b_i
+	  patt_r_i = (int) patt_r_s;          // r_i = a_i - b_i
+
+	 The definitions of a_i (original statement) and b_i (pattern statement)
+	 are related to, but not actually part of, the widen_minus pattern.
+	 Vectorizing the pattern does not cause these definition statements to
+	 be marked as PURE_SLP.  For this case, we need to recursively check
+	 whether their uses are all absorbed into vectorized code.  But there
+	 is an exception: a use may participate in a vectorized operation
+	 via an external SLP node containing that use as an element.  The
+	 parameter SCALAR_USE_MAP tags such SSA names as having a scalar
+	 use in advance.  */
+      tree lhs = gimple_get_lhs (use_stmt);
+
+      if (!lhs || TREE_CODE (lhs) != SSA_NAME
+	  || vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map))
+	break;
+    }
+
+  bool found = !end_imm_use_stmt_p (&use_iter);
+  bool added = scalar_use_map.put (def, found);
+
+  gcc_assert (!added);
+  return found;
+}
+
 /* Mark lanes of NODE that are live outside of the basic-block vectorized
    region and that can be vectorized using vectorizable_live_operation
    with STMT_VINFO_LIVE_P.  Not handled live operations will cause the
@@ -6427,6 +6505,7 @@ static void
 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
 			     slp_instance instance,
 			     stmt_vector_for_cost *cost_vec,
+			     hash_map<tree, bool> &scalar_use_map,
 			     hash_set<stmt_vec_info> &svisited,
 			     hash_set<slp_tree> &visited)
 {
@@ -6451,32 +6530,22 @@ vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
       def_operand_p def_p;
       FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
 	{
-	  imm_use_iterator use_iter;
-	  gimple *use_stmt;
-	  stmt_vec_info use_stmt_info;
-	  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
-	    if (!is_gimple_debug (use_stmt))
-	      {
-		use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
-		if (!use_stmt_info
-		    || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
-		  {
-		    STMT_VINFO_LIVE_P (stmt_info) = true;
-		    if (vectorizable_live_operation (bb_vinfo, stmt_info,
-						     node, instance, i,
-						     false, cost_vec))
-		      /* ???  So we know we can vectorize the live stmt
-			 from one SLP node.  If we cannot do so from all
-			 or none consistently we'd have to record which
-			 SLP node (and lane) we want to use for the live
-			 operation.  So make sure we can code-generate
-			 from all nodes.  */
-		      mark_visited = false;
-		    else
-		      STMT_VINFO_LIVE_P (stmt_info) = false;
-		    break;
-		  }
-	      }
+	  if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p),
+				      scalar_use_map))
+	    {
+	      STMT_VINFO_LIVE_P (stmt_info) = true;
+	      if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
+					       instance, i, false, cost_vec))
+		/* ???  So we know we can vectorize the live stmt from one SLP
+		   node.  If we cannot do so from all or none consistently
+		   we'd have to record which SLP node (and lane) we want to
+		   use for the live operation.  So make sure we can
+		   code-generate from all nodes.  */
+		mark_visited = false;
+	      else
+		STMT_VINFO_LIVE_P (stmt_info) = false;
+	    }
+
 	  /* We have to verify whether we can insert the lane extract
 	     before all uses.  The following is a conservative approximation.
 	     We cannot put this into vectorizable_live_operation because
@@ -6495,6 +6564,10 @@ vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
 	     from the latest stmt in a node.  So we compensate for this
 	     during code-generation, simply not replacing uses for those
 	     hopefully rare cases.  */
+	  imm_use_iterator use_iter;
+	  gimple *use_stmt;
+	  stmt_vec_info use_stmt_info;
+
 	  if (STMT_VINFO_LIVE_P (stmt_info))
 	    FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
 	      if (!is_gimple_debug (use_stmt)
@@ -6517,8 +6590,56 @@ vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
   slp_tree child;
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
     if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
-      vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
-				   cost_vec, svisited, visited);
+      vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
+				   scalar_use_map, svisited, visited);
+}
+
+/* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
+   are live outside of the basic-block vectorized region and that can be
+   vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P.  */
+
+static void
+vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
+{
+  if (bb_vinfo->slp_instances.is_empty ())
+    return;
+
+  hash_set<stmt_vec_info> svisited;
+  hash_set<slp_tree> visited;
+  hash_map<tree, bool> scalar_use_map;
+  auto_vec<slp_tree> worklist;
+
+  for (slp_instance instance : bb_vinfo->slp_instances)
+    if (!visited.add (SLP_INSTANCE_TREE (instance)))
+      worklist.safe_push (SLP_INSTANCE_TREE (instance));
+
+  do
+    {
+      slp_tree node = worklist.pop ();
+
+      if (SLP_TREE_DEF_TYPE (node) == vect_external_def)
+	{
+	  for (tree op : SLP_TREE_SCALAR_OPS (node))
+	    if (TREE_CODE (op) == SSA_NAME)
+	      scalar_use_map.put (op, true);
+	}
+      else
+	{
+	  for (slp_tree child : SLP_TREE_CHILDREN (node))
+	    if (child && !visited.add (child))
+	      worklist.safe_push (child);
+	}
+    } while (!worklist.is_empty ());
+
+  visited.empty ();
+
+  for (slp_instance instance : bb_vinfo->slp_instances)
+    {
+      vect_location = instance->location ();
+      vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
+				   instance, &instance->cost_vec,
+				   scalar_use_map, svisited, visited);
+    }
 }
 
 /* Determine whether we can vectorize the reduction epilogue for INSTANCE.  */
@@ -6684,17 +6805,7 @@ vect_slp_analyze_operations (vec_info *vinfo)
 
   /* Compute vectorizable live stmts.  */
   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
-    {
-      hash_set<stmt_vec_info> svisited;
-      hash_set<slp_tree> visited;
-      for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
-	{
-	  vect_location = instance->location ();
-	  vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
-				       instance, &instance->cost_vec, svisited,
-				       visited);
-	}
-    }
+    vect_bb_slp_mark_live_stmts (bb_vinfo);
 
   return !vinfo->slp_instances.is_empty ();
 }
-- 
2.17.1
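
(For reference, not part of the patch: from a configured GCC build
tree, the new test alone can usually be exercised with

    make check-gcc RUNTESTFLAGS="aarch64.exp=bb-slp-pr113091.c"

assuming a native aarch64 toolchain or an equivalent cross-testing
setup.)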


* PING: [PATCH] Do not count unused scalar use when marking STMT_VINFO_LIVE_P [PR113091]
  2023-12-29 10:28 [PATCH] Do not count unused scalar use when marking STMT_VINFO_LIVE_P [PR113091] Feng Xue OS
@ 2024-01-10 23:42 ` Feng Xue OS
  2024-01-11  9:46 ` Richard Biener
  1 sibling, 0 replies; 6+ messages in thread
From: Feng Xue OS @ 2024-01-10 23:42 UTC (permalink / raw)
  To: Richard Biener, gcc-patches

Hi, Richard,

  Would you please take a look at this patch?

Thanks,
Feng

________________________________________
From: Feng Xue OS <fxue@os.amperecomputing.com>
Sent: Friday, December 29, 2023 6:28 PM
To: gcc-patches@gcc.gnu.org
Subject: [PATCH] Do not count unused scalar use when marking STMT_VINFO_LIVE_P [PR113091]

[...]


* Re: [PATCH] Do not count unused scalar use when marking STMT_VINFO_LIVE_P [PR113091]
  2023-12-29 10:28 [PATCH] Do not count unused scalar use when marking STMT_VINFO_LIVE_P [PR113091] Feng Xue OS
  2024-01-10 23:42 ` PING: " Feng Xue OS
@ 2024-01-11  9:46 ` Richard Biener
  2024-01-11  9:52   ` Richard Biener
  1 sibling, 1 reply; 6+ messages in thread
From: Richard Biener @ 2024-01-11  9:46 UTC (permalink / raw)
  To: Feng Xue OS; +Cc: gcc-patches

On Fri, Dec 29, 2023 at 11:29 AM Feng Xue OS
<fxue@os.amperecomputing.com> wrote:
>
> [...]
> +      tree lhs = gimple_get_lhs (use_stmt);
> +
> +      if (!lhs || TREE_CODE (lhs) != SSA_NAME
> +         || vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map))

This looks mostly good; the only worry I have is that this recursion
will - for unvectorized uses - eventually go all the way down to the end
of the vectorized region (aka the whole function).  Since it's really
patterns we are after, it should be possible to limit the recursion to
use_stmts that are covered by a pattern, aka STMT_VINFO_IN_PATTERN_P?

PHIs should then be magically covered since we have no patterns
for them (though explicit handling is good for documentation purposes).

Thanks,
Richard.
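
A minimal sketch of the suggested guard, for illustration only; as the
follow-up below notes, STMT_VINFO_IN_PATTERN_P is not actually set on
the original statements that a pattern replaced, so this exact form
would not work as-is:

    /* Hypothetical tweak to the use-walking loop in
       vec_slp_has_scalar_use: only look through uses covered by
       pattern recognition; treat any other non-PURE_SLP use as a live
       scalar use right away.  */
    if (!STMT_VINFO_IN_PATTERN_P (use_stmt_info))
      break;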

> [...]


* Re: [PATCH] Do not count unused scalar use when marking STMT_VINFO_LIVE_P [PR113091]
  2024-01-11  9:46 ` Richard Biener
@ 2024-01-11  9:52   ` Richard Biener
  2024-01-12  6:15     ` Feng Xue OS
  0 siblings, 1 reply; 6+ messages in thread
From: Richard Biener @ 2024-01-11  9:52 UTC (permalink / raw)
  To: Feng Xue OS, Richard Sandiford; +Cc: gcc-patches

On Thu, Jan 11, 2024 at 10:46 AM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Fri, Dec 29, 2023 at 11:29 AM Feng Xue OS
> <fxue@os.amperecomputing.com> wrote:
> >
> > [...]
> > +      tree lhs = gimple_get_lhs (use_stmt);
> > +
> > +      if (!lhs || TREE_CODE (lhs) != SSA_NAME
> > +         || vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map))
>
> This looks mostly good; the only worry I have is that this recursion
> will - for unvectorized uses - eventually go all the way down to the end
> of the vectorized region (aka the whole function).  Since it's really
> patterns we are after, it should be possible to limit the recursion to
> use_stmts that are covered by a pattern, aka STMT_VINFO_IN_PATTERN_P?

Hmm, in_pattern_p doesn't seem to be set for original stmts replaced
by a pattern.  In fact we don't seem to mark those in any way?  It
should be possible to do this programmatically by looking at the root
stmt uses, maybe.  Or maybe we can instead simply limit the recursion
depth to one and only increase it when we get testcases requiring
more?  We should eventually revisit how we mark patterns in next
stage1.

> [...]

* Re: [PATCH] Do not count unused scalar use when marking STMT_VINFO_LIVE_P [PR113091]
  2024-01-11  9:52   ` Richard Biener
@ 2024-01-12  6:15     ` Feng Xue OS
  2024-01-12  8:05       ` Richard Biener
  0 siblings, 1 reply; 6+ messages in thread
From: Feng Xue OS @ 2024-01-12  6:15 UTC (permalink / raw)
  To: Richard Biener, Richard Sandiford; +Cc: gcc-patches

Add a depth parameter to limit recursion of vec_slp_has_scalar_use.

Feng
-------

 .../gcc.target/aarch64/bb-slp-pr113091.c      |  22 ++
 gcc/tree-vect-slp.cc                          | 207 ++++++++++++++----
 2 files changed, 190 insertions(+), 39 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/bb-slp-pr113091.c

diff --git a/gcc/testsuite/gcc.target/aarch64/bb-slp-pr113091.c b/gcc/testsuite/gcc.target/aarch64/bb-slp-pr113091.c
new file mode 100644
index 00000000000..ff822e90b4a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bb-slp-pr113091.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-slp-details -ftree-slp-vectorize" } */
+
+int test(unsigned array[8]);
+
+int foo(char *a, char *b)
+{
+  unsigned array[8];
+
+  array[0] = (a[0] - b[0]);
+  array[1] = (a[1] - b[1]);
+  array[2] = (a[2] - b[2]);
+  array[3] = (a[3] - b[3]);
+  array[4] = (a[4] - b[4]);
+  array[5] = (a[5] - b[5]);
+  array[6] = (a[6] - b[6]);
+  array[7] = (a[7] - b[7]);
+
+  return test(array);
+}
+
+/* { dg-final { scan-tree-dump-times "Basic block will be vectorized using SLP" 1 "slp2" } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index b6cce55ce90..086377a9ac0 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -6418,6 +6418,102 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
   return res;
 }

+/* Given a definition DEF, analyze whether it will have any live scalar use
+   after performing SLP vectorization, whose information is represented by
+   BB_VINFO, and record the result into hash map SCALAR_USE_MAP as a cache for
+   later fast checking.  If recursion DEPTH exceeds a limit, stop the analysis
+   and make a conservative assumption.  Return 0 if there is no scalar use,
+   1 if there is, and -1 if recursion was limited.  */
+
+static int
+vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def,
+                       hash_map<tree, int> &scalar_use_map,
+                       int depth = 0)
+{
+  const int depth_limit = 2;
+  imm_use_iterator use_iter;
+  gimple *use_stmt;
+
+  if (int *res = scalar_use_map.get (def))
+    return *res;
+
+  int scalar_use = 1;
+
+  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
+    {
+      if (is_gimple_debug (use_stmt))
+       continue;
+
+      stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
+
+      if (!use_stmt_info)
+       break;
+
+      if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
+       continue;
+
+      /* Do not step forward when encountering a PHI statement, since it may
+        involve a cyclic reference and cause infinite recursion.  */
+      if (gimple_code (use_stmt) == GIMPLE_PHI)
+       break;
+
+      /* When pattern recognition is involved, a statement whose definition is
+        consumed in some pattern may not be included in the final replacement
+        pattern statements, and so would be skipped when building the SLP graph.
+
+        * Original
+         char a_c = *(char *) a;
+         char b_c = *(char *) b;
+         unsigned short a_s = (unsigned short) a_c;
+         int a_i = (int) a_s;
+         int b_i = (int) b_c;
+         int r_i = a_i - b_i;
+
+        * After pattern replacement
+         a_s = (unsigned short) a_c;
+         a_i = (int) a_s;
+
+         patt_b_s = (unsigned short) b_c;    // b_i = (int) b_c
+         patt_b_i = (int) patt_b_s;          // b_i = (int) b_c
+
+         patt_r_s = widen_minus(a_c, b_c);   // r_i = a_i - b_i
+         patt_r_i = (int) patt_r_s;          // r_i = a_i - b_i
+
+        The definitions of a_i (original statement) and b_i (pattern statement)
+        are related to, but not actually part of, the widen_minus pattern.
+        Vectorizing the pattern does not cause these definition statements to
+        be marked as PURE_SLP.  For this case, we need to recursively check
+        whether their uses are all absorbed into vectorized code.  But there
+        is an exception: a use may participate in a vectorized operation
+        via an external SLP node containing that use as an element.  The
+        parameter SCALAR_USE_MAP tags such SSA names as having a scalar
+        use in advance.  */
+      tree lhs = gimple_get_lhs (use_stmt);
+
+      if (!lhs || TREE_CODE (lhs) != SSA_NAME)
+       break;
+
+      if (depth_limit && depth >= depth_limit)
+       return -1;
+
+      if ((scalar_use = vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map,
+                                               depth + 1)))
+       break;
+    }
+
+  if (end_imm_use_stmt_p (&use_iter))
+    scalar_use = 0;
+
+  /* If recursion was limited, do not cache the result for non-root defs.  */
+  if (!depth || scalar_use >= 0)
+    {
+      bool added = scalar_use_map.put (def, scalar_use);
+      gcc_assert (!added);
+    }
+
+  return scalar_use;
+}
+
 /* Mark lanes of NODE that are live outside of the basic-block vectorized
    region and that can be vectorized using vectorizable_live_operation
    with STMT_VINFO_LIVE_P.  Not handled live operations will cause the
@@ -6427,6 +6523,7 @@ static void
 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
                             slp_instance instance,
                             stmt_vector_for_cost *cost_vec,
+                            hash_map<tree, int> &scalar_use_map,
                             hash_set<stmt_vec_info> &svisited,
                             hash_set<slp_tree> &visited)
 {
@@ -6451,32 +6548,22 @@ vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
       def_operand_p def_p;
       FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
        {
-         imm_use_iterator use_iter;
-         gimple *use_stmt;
-         stmt_vec_info use_stmt_info;
-         FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
-           if (!is_gimple_debug (use_stmt))
-             {
-               use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
-               if (!use_stmt_info
-                   || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
-                 {
-                   STMT_VINFO_LIVE_P (stmt_info) = true;
-                   if (vectorizable_live_operation (bb_vinfo, stmt_info,
-                                                    node, instance, i,
-                                                    false, cost_vec))
-                     /* ???  So we know we can vectorize the live stmt
-                        from one SLP node.  If we cannot do so from all
-                        or none consistently we'd have to record which
-                        SLP node (and lane) we want to use for the live
-                        operation.  So make sure we can code-generate
-                        from all nodes.  */
-                     mark_visited = false;
-                   else
-                     STMT_VINFO_LIVE_P (stmt_info) = false;
-                   break;
-                 }
-             }
+         if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p),
+                                     scalar_use_map))
+           {
+             STMT_VINFO_LIVE_P (stmt_info) = true;
+             if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
+                                              instance, i, false, cost_vec))
+               /* ???  So we know we can vectorize the live stmt from one SLP
+                  node.  If we cannot do so from all or none consistently
+                  we'd have to record which SLP node (and lane) we want to
+                  use for the live operation.  So make sure we can
+                  code-generate from all nodes.  */
+               mark_visited = false;
+             else
+               STMT_VINFO_LIVE_P (stmt_info) = false;
+           }
+
          /* We have to verify whether we can insert the lane extract
             before all uses.  The following is a conservative approximation.
             We cannot put this into vectorizable_live_operation because
@@ -6495,6 +6582,10 @@ vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
             from the latest stmt in a node.  So we compensate for this
             during code-generation, simply not replacing uses for those
             hopefully rare cases.  */
+         imm_use_iterator use_iter;
+         gimple *use_stmt;
+         stmt_vec_info use_stmt_info;
+
          if (STMT_VINFO_LIVE_P (stmt_info))
            FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
              if (!is_gimple_debug (use_stmt)
@@ -6517,8 +6608,56 @@ vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
   slp_tree child;
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
     if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
-      vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
-                                  cost_vec, svisited, visited);
+      vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
+                                  scalar_use_map, svisited, visited);
+}
+
+/* Traverse all SLP instances of BB_VINFO, and mark lanes of every node that
+   are live outside of the basic-block vectorized region and that can be
+   vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P.  */
+
+static void
+vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
+{
+  if (bb_vinfo->slp_instances.is_empty ())
+    return;
+
+  hash_set<stmt_vec_info> svisited;
+  hash_set<slp_tree> visited;
+  hash_map<tree, int> scalar_use_map;
+  auto_vec<slp_tree> worklist;
+
+  for (slp_instance instance : bb_vinfo->slp_instances)
+    if (!visited.add (SLP_INSTANCE_TREE (instance)))
+      worklist.safe_push (SLP_INSTANCE_TREE (instance));
+
+  do
+    {
+      slp_tree node = worklist.pop ();
+
+      if (SLP_TREE_DEF_TYPE (node) == vect_external_def)
+       {
+         for (tree op : SLP_TREE_SCALAR_OPS (node))
+           if (TREE_CODE (op) == SSA_NAME)
+             scalar_use_map.put (op, 1);
+       }
+      else
+       {
+         for (slp_tree child : SLP_TREE_CHILDREN (node))
+           if (child && !visited.add (child))
+             worklist.safe_push (child);
+       }
+    } while (!worklist.is_empty ());
+
+  visited.empty ();
+
+  for (slp_instance instance : bb_vinfo->slp_instances)
+    {
+      vect_location = instance->location ();
+      vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
+                                  instance, &instance->cost_vec,
+                                  scalar_use_map, svisited, visited);
+    }
 }

 /* Determine whether we can vectorize the reduction epilogue for INSTANCE.  */
@@ -6684,17 +6823,7 @@ vect_slp_analyze_operations (vec_info *vinfo)

   /* Compute vectorizable live stmts.  */
   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
-    {
-      hash_set<stmt_vec_info> svisited;
-      hash_set<slp_tree> visited;
-      for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
-       {
-         vect_location = instance->location ();
-         vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
-                                      instance, &instance->cost_vec, svisited,
-                                      visited);
-       }
-    }
+    vect_bb_slp_mark_live_stmts (bb_vinfo);

   return !vinfo->slp_instances.is_empty ();
 }
--
2.17.1
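
For reference outside of the GCC tree, the core idea of vec_slp_has_scalar_use
can be sketched as a memoized, depth-limited walk over a use graph with a
tri-state result.  All names and types below (use_info, use_graph,
has_scalar_use) are illustrative stand-ins rather than GCC internals:

  /* Hypothetical stand-ins for the GCC data structures: each SSA name maps
     to the statements using it; a use is either absorbed into vectorized
     code (pure_slp), a PHI, or defines another SSA name to follow.  */
  #include <string>
  #include <unordered_map>
  #include <vector>

  struct use_info
  {
    bool pure_slp;    /* Use is absorbed into vectorized code.  */
    bool is_phi;      /* PHI uses stop the walk conservatively.  */
    std::string lhs;  /* SSA name the use statement defines, "" if none.  */
  };

  typedef std::unordered_map<std::string, std::vector<use_info>> use_graph;

  /* Return 0 if DEF has no live scalar use, 1 if it has one, and -1 if the
     walk was cut off by the depth limit.  Definitive results are cached.  */

  static int
  has_scalar_use (const use_graph &g, const std::string &def,
                  std::unordered_map<std::string, int> &cache, int depth = 0)
  {
    const int depth_limit = 2;

    auto cached = cache.find (def);
    if (cached != cache.end ())
      return cached->second;

    int scalar_use = 0;
    auto uses = g.find (def);
    if (uses != g.end ())
      for (const use_info &u : uses->second)
        {
          if (u.pure_slp)
            continue;                 /* Absorbed, keep scanning.  */
          if (u.is_phi || u.lhs.empty ())
            {
              scalar_use = 1;         /* Conservatively a scalar use.  */
              break;
            }
          if (depth >= depth_limit)
            return -1;                /* Give up without caching.  */
          if ((scalar_use = has_scalar_use (g, u.lhs, cache, depth + 1)))
            break;                    /* 1 or -1 propagates upward.  */
        }

    /* Cache only definitive answers for non-root defs; the root query is
       always cached, mirroring the patch.  */
    if (depth == 0 || scalar_use >= 0)
      cache[def] = scalar_use;
    return scalar_use;
  }

Callers treat both 1 and -1 as "has a scalar use", so hitting the depth limit
errs on the conservative (costed) side, exactly as in the patch.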

________________________________________
From: Richard Biener <richard.guenther@gmail.com>
Sent: Thursday, January 11, 2024 5:52 PM
To: Feng Xue OS; Richard Sandiford
Cc: gcc-patches@gcc.gnu.org
Subject: Re: [PATCH] Do not count unused scalar use when marking STMT_VINFO_LIVE_P [PR113091]

On Thu, Jan 11, 2024 at 10:46 AM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Fri, Dec 29, 2023 at 11:29 AM Feng Xue OS
> <fxue@os.amperecomputing.com> wrote:
> >
> > This patch is meant to fix over-estimation about SLP vector-to-scalar cost for
> > STMT_VINFO_LIVE_P statement. When pattern recognition is involved, a
> > statement whose definition is consumed in some pattern, may not be
> > included in the final replacement pattern statements, and would be skipped
> > when building SLP graph.
> >
> >      * Original
> >       char a_c = *(char *) a;
> >       char b_c = *(char *) b;
> >       unsigned short a_s = (unsigned short) a_c;
> >       int a_i = (int) a_s;
> >       int b_i = (int) b_c;
> >       int r_i = a_i - b_i;
> >
> >      * After pattern replacement
> >       a_s = (unsigned short) a_c;
> >       a_i = (int) a_s;
> >
> >       patt_b_s = (unsigned short) b_c;    // b_i = (int) b_c
> >       patt_b_i = (int) patt_b_s;          // b_i = (int) b_c
> >
> >       patt_r_s = widen_minus(a_c, b_c);   // r_i = a_i - b_i
> >       patt_r_i = (int) patt_r_s;          // r_i = a_i - b_i
> >
> > The definitions of a_i(original statement) and b_i(pattern statement)
> > are related to, but actually not part of widen_minus pattern.
> > Vectorizing the pattern does not cause these definition statements to
> > be marked as PURE_SLP.  For this case, we need to recursively check
> > whether their uses are all absorbed into vectorized code.  But there
> > is an exception that some use may participate in an vectorized
> > operation via an external SLP node containing that use as an element.
> >
> > Feng
> >
> > ---
> >  .../gcc.target/aarch64/bb-slp-pr113091.c      |  22 ++
> >  gcc/tree-vect-slp.cc                          | 189 ++++++++++++++----
> >  2 files changed, 172 insertions(+), 39 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/bb-slp-pr113091.c
> >
> > diff --git a/gcc/testsuite/gcc.target/aarch64/bb-slp-pr113091.c b/gcc/testsuite/gcc.target/aarch64/bb-slp-pr113091.c
> > new file mode 100644
> > index 00000000000..ff822e90b4a
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/bb-slp-pr113091.c
> > @@ -0,0 +1,22 @@
> > +/* { dg-do compile } */
> > +/* { dg-additional-options "-O3 -fdump-tree-slp-details -ftree-slp-vectorize" } */
> > +
> > +int test(unsigned array[8]);
> > +
> > +int foo(char *a, char *b)
> > +{
> > +  unsigned array[8];
> > +
> > +  array[0] = (a[0] - b[0]);
> > +  array[1] = (a[1] - b[1]);
> > +  array[2] = (a[2] - b[2]);
> > +  array[3] = (a[3] - b[3]);
> > +  array[4] = (a[4] - b[4]);
> > +  array[5] = (a[5] - b[5]);
> > +  array[6] = (a[6] - b[6]);
> > +  array[7] = (a[7] - b[7]);
> > +
> > +  return test(array);
> > +}
> > +
> > +/* { dg-final { scan-tree-dump-times "Basic block will be vectorized using SLP" 1 "slp2" } } */
> > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> > index a82fca45161..d36ff37114e 100644
> > --- a/gcc/tree-vect-slp.cc
> > +++ b/gcc/tree-vect-slp.cc
> > @@ -6418,6 +6418,84 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
> >    return res;
> >  }
> >
> > +/* Given a definition DEF, analyze if it will have any live scalar use after
> > +   performing SLP vectorization whose information is represented by BB_VINFO,
> > +   and record result into hash map SCALAR_USE_MAP as cache for later fast
> > +   check.  */
> > +
> > +static bool
> > +vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def,
> > +                       hash_map<tree, bool> &scalar_use_map)
> > +{
> > +  imm_use_iterator use_iter;
> > +  gimple *use_stmt;
> > +
> > +  if (bool *res = scalar_use_map.get (def))
> > +    return *res;
> > +
> > +  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
> > +    {
> > +      if (is_gimple_debug (use_stmt))
> > +       continue;
> > +
> > +      stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
> > +
> > +      if (!use_stmt_info)
> > +       break;
> > +
> > +      if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
> > +       continue;
> > +
> > +      /* Do not step forward when encounter PHI statement, since it may
> > +        involve cyclic reference and cause infinite recursive invocation.  */
> > +      if (gimple_code (use_stmt) == GIMPLE_PHI)
> > +       break;
> > +
> > +      /* When pattern recognition is involved, a statement whose definition is
> > +        consumed in some pattern, may not be included in the final replacement
> > +        pattern statements, so would be skipped when building SLP graph.
> > +
> > +        * Original
> > +         char a_c = *(char *) a;
> > +         char b_c = *(char *) b;
> > +         unsigned short a_s = (unsigned short) a_c;
> > +         int a_i = (int) a_s;
> > +         int b_i = (int) b_c;
> > +         int r_i = a_i - b_i;
> > +
> > +        * After pattern replacement
> > +         a_s = (unsigned short) a_c;
> > +         a_i = (int) a_s;
> > +
> > +         patt_b_s = (unsigned short) b_c;    // b_i = (int) b_c
> > +         patt_b_i = (int) patt_b_s;          // b_i = (int) b_c
> > +
> > +         patt_r_s = widen_minus(a_c, b_c);   // r_i = a_i - b_i
> > +         patt_r_i = (int) patt_r_s;          // r_i = a_i - b_i
> > +
> > +        The definitions of a_i(original statement) and b_i(pattern statement)
> > +        are related to, but actually not part of widen_minus pattern.
> > +        Vectorizing the pattern does not cause these definition statements to
> > +        be marked as PURE_SLP.  For this case, we need to recursively check
> > +        whether their uses are all absorbed into vectorized code.  But there
> > +        is an exception that some use may participate in an vectorized
> > +        operation via an external SLP node containing that use as an element.
> > +        The parameter "scalar_use_map" tags such kind of SSA as having scalar
> > +        use in advance.  */
> > +      tree lhs = gimple_get_lhs (use_stmt);
> > +
> > +      if (!lhs || TREE_CODE (lhs) != SSA_NAME
> > +         || vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map))
>
> This looks mostly good, the only worry I have is that this recursion
> will - for unvectorized uses - eventually go all the way down to the end
> of the vectorized region (aka the whole function).  Since it's really
> patterns we are after it should be possible to limit the recursion to
> use_stmt which are covered by a pattern, aka STMT_VINFO_IN_PATTERN_P?

Hmm, in_pattern_p doesn't seem to be set for original stmts replaced
by a pattern.
In fact we don't seem to mark those in any way?  It should be possible to
programmatically do this looking at the root stmt uses maybe.  Maybe we can
instead simply limit recursion depth to one and only increase when we get
testcases requiring more?  We should eventually revisit how we mark patterns
in next stage1.
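
(Tracing the example from the patch comment, roughly: starting from b_c, the
walk meets the original b_i statement, which is not PURE_SLP, and recurses
once into b_i; b_i's only use is r_i, whose vectorized form is the PURE_SLP
pattern statement, so the walk ends with "no scalar use" at depth 1.  A limit
of two thus gives one extra level of headroom over what this testcase needs.)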

> PHIs should be then magically covered since we have no patterns
> for them (though explicitly handling is good for documentation purposes).
>
> Thanks,
> Richard.


* Re: [PATCH] Do not count unused scalar use when marking STMT_VINFO_LIVE_P [PR113091]
  2024-01-12  6:15     ` Feng Xue OS
@ 2024-01-12  8:05       ` Richard Biener
  0 siblings, 0 replies; 6+ messages in thread
From: Richard Biener @ 2024-01-12  8:05 UTC (permalink / raw)
  To: Feng Xue OS; +Cc: Richard Sandiford, gcc-patches

On Fri, Jan 12, 2024 at 7:15 AM Feng Xue OS <fxue@os.amperecomputing.com> wrote:
>
> Add a depth parameter to limit recursion of vec_slp_has_scalar_use.

OK.

