public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r13-1728] Improve common reduction vs builtin code generation in loop distribution
@ 2022-07-18 11:19 Richard Biener
  0 siblings, 0 replies; only message in thread
From: Richard Biener @ 2022-07-18 11:19 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:ce92603fbe3b4870e0a38efee1ee766d62942065

commit r13-1728-gce92603fbe3b4870e0a38efee1ee766d62942065
Author: Richard Biener <rguenther@suse.de>
Date:   Mon Jul 18 12:06:00 2022 +0200

    Improve common reduction vs builtin code generation in loop distribution
    
    loop distribution currently cannot handle the situation when the
    last partition is a builtin but there's a common reduction in all
    partitions (like the final IV value).  The following lifts this
    restriction by making the last non-builtin partition provide the
    definitions for the loop-closed PHI nodes.  Since we have heuristics
    in place to avoid code-generating builtins last, writing a testcase
    is difficult (but I ran into a case with other pending patches that
    made the heuristic ineffective).  What remains is the inability
    to preserve common reductions when all partitions could be builtins
    (in some cases final value replacement could come to the rescue here).
    
            * tree-loop-distribution.cc (copy_loop_before): Add
            the ability to replace the original LC PHI defs.
            (generate_loops_for_partition): Pass through a flag
            whether to redirect original LC PHI defs.
            (generate_code_for_partition): Likewise.
            (loop_distribution::distribute_loop): Compute the partition
            that should provide the LC PHI defs for common reductions
            and pass that down.

Diff:
---
 gcc/tree-loop-distribution.cc | 64 ++++++++++++++++++++++++++++++-------------
 1 file changed, 45 insertions(+), 19 deletions(-)

diff --git a/gcc/tree-loop-distribution.cc b/gcc/tree-loop-distribution.cc
index ed7f432f322..0714bc41a43 100644
--- a/gcc/tree-loop-distribution.cc
+++ b/gcc/tree-loop-distribution.cc
@@ -942,7 +942,7 @@ stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt)
 /* Return a copy of LOOP placed before LOOP.  */
 
 static class loop *
-copy_loop_before (class loop *loop)
+copy_loop_before (class loop *loop, bool redirect_lc_phi_defs)
 {
   class loop *res;
   edge preheader = loop_preheader_edge (loop);
@@ -950,6 +950,24 @@ copy_loop_before (class loop *loop)
   initialize_original_copy_tables ();
   res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader);
   gcc_assert (res != NULL);
+
+  /* When a partition that is not the last one is supposed to keep the LC
+     PHIs computed, adjust their definitions.  */
+  if (redirect_lc_phi_defs)
+    {
+      edge exit = single_exit (loop);
+      for (gphi_iterator si = gsi_start_phis (exit->dest); !gsi_end_p (si);
+	   gsi_next (&si))
+	{
+	  gphi *phi = si.phi ();
+	  if (virtual_operand_p (gimple_phi_result (phi)))
+	    continue;
+	  use_operand_p use_p = PHI_ARG_DEF_PTR_FROM_EDGE (phi, exit);
+	  tree new_def = get_current_def (USE_FROM_PTR (use_p));
+	  SET_USE (use_p, new_def);
+	}
+    }
+
   free_original_copy_tables ();
   delete_update_ssa ();
 
@@ -977,7 +995,7 @@ create_bb_after_loop (class loop *loop)
 
 static void
 generate_loops_for_partition (class loop *loop, partition *partition,
-			      bool copy_p)
+			      bool copy_p, bool keep_lc_phis_p)
 {
   unsigned i;
   basic_block *bbs;
@@ -985,7 +1003,7 @@ generate_loops_for_partition (class loop *loop, partition *partition,
   if (copy_p)
     {
       int orig_loop_num = loop->orig_loop_num;
-      loop = copy_loop_before (loop);
+      loop = copy_loop_before (loop, keep_lc_phis_p);
       gcc_assert (loop != NULL);
       loop->orig_loop_num = orig_loop_num;
       create_preheader (loop, CP_SIMPLE_PREHEADERS);
@@ -1336,7 +1354,8 @@ destroy_loop (class loop *loop)
 
 static bool 
 generate_code_for_partition (class loop *loop,
-			     partition *partition, bool copy_p)
+			     partition *partition, bool copy_p,
+			     bool keep_lc_phis_p)
 {
   switch (partition->kind)
     {
@@ -1345,7 +1364,8 @@ generate_code_for_partition (class loop *loop,
       /* Reductions all have to be in the last partition.  */
       gcc_assert (!partition_reduction_p (partition)
 		  || !copy_p);
-      generate_loops_for_partition (loop, partition, copy_p);
+      generate_loops_for_partition (loop, partition, copy_p,
+				    keep_lc_phis_p);
       return false;
 
     case PKIND_MEMSET:
@@ -3013,6 +3033,7 @@ loop_distribution::distribute_loop (class loop *loop,
 
   bool any_builtin = false;
   bool reduction_in_all = false;
+  int reduction_partition_num = -1;
   FOR_EACH_VEC_ELT (partitions, i, partition)
     {
       reduction_in_all
@@ -3092,10 +3113,13 @@ loop_distribution::distribute_loop (class loop *loop,
     }
 
   /* Put a non-builtin partition last if we need to preserve a reduction.
-     ???  This is a workaround that makes sort_partitions_by_post_order do
-     the correct thing while in reality it should sort each component
-     separately and then put the component with a reduction or a non-builtin
-     last.  */
+     In most cases this helps to keep a normal partition last, which avoids
+     spilling a reduction result across builtin calls.
+     ???  The proper way would be to use dependences to see whether we
+     can move builtin partitions earlier during merge_dep_scc_partitions
+     and its sort_partitions_by_post_order.  Especially when the
+     dependence graph is composed of multiple independent subgraphs the
+     heuristic does not work reliably.  */
   if (reduction_in_all
       && partition_builtin_p (partitions.last()))
     FOR_EACH_VEC_ELT (partitions, i, partition)
@@ -3126,19 +3150,20 @@ loop_distribution::distribute_loop (class loop *loop,
 
   finalize_partitions (loop, &partitions, &alias_ddrs);
 
-  /* If there is a reduction in all partitions make sure the last one
-     is not classified for builtin code generation.  */
+  /* If there is a reduction in all partitions make sure the last
+     non-builtin partition provides the LC PHI defs.  */
   if (reduction_in_all)
     {
-      partition = partitions.last ();
-      if (only_patterns_p
-	  && partition_builtin_p (partition)
-	  && !partition_builtin_p (partitions[0]))
+      FOR_EACH_VEC_ELT (partitions, i, partition)
+	if (!partition_builtin_p (partition))
+	  reduction_partition_num = i;
+      if (reduction_partition_num == -1)
 	{
-	  nbp = 0;
-	  goto ldist_done;
+	  /* If all partitions are builtin, force the last one to
+	     be code generated as normal partition.  */
+	  partition = partitions.last ();
+	  partition->kind = PKIND_NORMAL;
 	}
-      partition->kind = PKIND_NORMAL;
     }
 
   nbp = partitions.length ();
@@ -3164,7 +3189,8 @@ loop_distribution::distribute_loop (class loop *loop,
     {
       if (partition_builtin_p (partition))
 	(*nb_calls)++;
-      *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1);
+      *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1,
+						 i == reduction_partition_num);
     }
 
  ldist_done:


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2022-07-18 11:19 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-07-18 11:19 [gcc r13-1728] Improve common reduction vs builtin code generation in loop distribution Richard Biener

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).