public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
@ 2015-06-30  8:34 Ajit Kumar Agarwal
  2015-06-30 10:38 ` Bernhard Reutner-Fischer
                   ` (3 more replies)
  0 siblings, 4 replies; 72+ messages in thread
From: Ajit Kumar Agarwal @ 2015-06-30  8:34 UTC (permalink / raw)
  To: law, GCC Patches
  Cc: Vinod Kathail, Shail Aditya Gupta, Vidhumouli Hunsigida, Nagaraju Mekala

[-- Attachment #1: Type: text/plain, Size: 24721 bytes --]

All:

The patch below adds a new path splitting optimization pass on the tree SSA representation. When the join block of an
if-then-else is also the loop latch (the source of the loop's back edge), the pass copies that join block into each of its
predecessors so that it can be merged with them, while preserving the SSA representation.

The patch was tested on the Microblaze and i386 targets. The EEMBC/Mibench benchmarks were run on the Microblaze target,
and a performance gain of 9.15% was seen for rgbcmy01_lite (EEMBC benchmarks). The DejaGnu tests were run for the Microblaze
target; no regressions were seen and the new attached testcases pass.

For i386, bootstrap completes fine and the SPEC CPU2000 benchmarks were run with this patch. The following results
were observed with the SPEC CPU2000 benchmarks.

Ratio of path splitting change vs Ratio of not having path splitting change is 3653.353 vs 3652.14 for INT benchmarks.
Ratio of path splitting change vs Ratio of not having path splitting change is  4353.812 vs 4345.351 for FP benchmarks.

Based on the comments on the RFC patch, the following changes were made.

1. Added a new pass for path splitting changes.
2. Placed the new path splitting optimization pass before the copy propagation pass.
3. The join block that is also the loop latch is wired into its predecessors so that the CFG cleanup pass will merge the
blocks that are wired together.
4. The copy propagation routines added for the path splitting changes are not needed, as suggested by Jeff. They are removed in this patch because
the copy propagation in the copied join blocks will be done by the existing copy propagation pass and the update_ssa pass.
5. Only the propagation of the phi result of the join block with the phi arguments is done, since this will not be done by the existing update_ssa
or copy propagation pass on the tree SSA representation.
6. Added 2 tests.
    a) compilation check  tests.
   b) execution tests.
7. Refactored the code for the feasibility check and for finding the join block that is also the loop latch node.

    [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation.
    
    Add a new path splitting pass on the tree SSA representation. The path
    splitting optimization is a CFG transformation in which the join block of
    an if-then-else that is also the loop latch node is moved into, and merged
    with, its predecessor blocks while preserving the SSA representation.
    
    ChangeLog:
    2015-06-30  Ajit Agarwal  <ajitkum@xilinx.com>
    
        * gcc/Makefile.in: Add the build of the new file
        tree-ssa-path-split.c
        * gcc/common.opt: Add the new flag ftree-path-split.
        * gcc/opts.c: Add an entry for Path splitting pass
        with optimization flag greater and equal to O2.
        * gcc/passes.def: Enable and add new pass path splitting.
        * gcc/timevar.def: Add the new entry for TV_TREE_PATH_SPLIT.
        * gcc/tree-pass.h: Extern Declaration of make_pass_path_split.
        * gcc/tree-ssa-path-split.c: New file for path splitting pass.
        * gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c: New testcase.
        * gcc/testsuite/gcc.dg/path-split-1.c: New testcase.
    
    Signed-off-by:Ajit Agarwal ajitkum@xilinx.com.

gcc/Makefile.in                              |   1 +
 gcc/common.opt                               |   4 +
 gcc/opts.c                                   |   1 +
 gcc/passes.def                               |   1 +
 gcc/testsuite/gcc.dg/path-split-1.c          |  65 ++++
 gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c |  62 ++++
 gcc/timevar.def                              |   1 +
 gcc/tree-pass.h                              |   1 +
 gcc/tree-ssa-path-split.c                    | 462 +++++++++++++++++++++++++++
 9 files changed, 598 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/path-split-1.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
 create mode 100644 gcc/tree-ssa-path-split.c

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 5f9261f..35ac363 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1476,6 +1476,7 @@ OBJS = \
 	tree-vect-slp.o \
 	tree-vectorizer.o \
 	tree-vrp.o \
+        tree-ssa-path-split.o \
 	tree.o \
 	valtrack.o \
 	value-prof.o \
diff --git a/gcc/common.opt b/gcc/common.opt
index e104269..c63b100 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -2328,6 +2328,10 @@ ftree-vrp
 Common Report Var(flag_tree_vrp) Init(0) Optimization
 Perform Value Range Propagation on trees
 
+ftree-path-split
+Common Report Var(flag_tree_path_split) Init(0) Optimization
+Perform Path Splitting
+
 funit-at-a-time
 Common Report Var(flag_unit_at_a_time) Init(1) Optimization
 Compile whole compilation unit at a time
diff --git a/gcc/opts.c b/gcc/opts.c
index 8a16116..31947ff 100644
--- a/gcc/opts.c
+++ b/gcc/opts.c
@@ -508,6 +508,7 @@ static const struct default_options default_options_table[] =
     { OPT_LEVELS_2_PLUS, OPT_fisolate_erroneous_paths_dereference, NULL, 1 },
     { OPT_LEVELS_2_PLUS, OPT_fipa_ra, NULL, 1 },
     { OPT_LEVELS_2_PLUS, OPT_flra_remat, NULL, 1 },
+    { OPT_LEVELS_2_PLUS, OPT_ftree_path_split, NULL, 1 },
 
     /* -O3 optimizations.  */
     { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 },
diff --git a/gcc/passes.def b/gcc/passes.def
index c0ddee4..43618eb 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -155,6 +155,7 @@ along with GCC; see the file COPYING3.  If not see
       NEXT_PASS (pass_ccp);
       /* After CCP we rewrite no longer addressed locals into SSA
 	 form if possible.  */
+      NEXT_PASS (pass_path_split);
       NEXT_PASS (pass_copy_prop);
       NEXT_PASS (pass_complete_unrolli);
       NEXT_PASS (pass_phiprop);
diff --git a/gcc/testsuite/gcc.dg/path-split-1.c b/gcc/testsuite/gcc.dg/path-split-1.c
new file mode 100644
index 0000000..075dc87
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/path-split-1.c
@@ -0,0 +1,65 @@
+/* { dg-do run } */
+/* { dg-options "-O2 " } */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define RGBMAX 255
+
+/* Convert 100 RGB triples to CMYK; return the sum of the K bytes mod 256.  */
+int
+test()
+{
+  int i;
+  unsigned char sum = 0;
+  unsigned char xr, xg, xb;
+  unsigned char xc, xm, xy, xk;
+  unsigned char *ReadPtr, *EritePtr;
+
+  /* Each of the 100 iterations reads 3 bytes and stores 4 bytes.  */
+  ReadPtr = ( unsigned char *) malloc (sizeof (unsigned char) * 300);
+  EritePtr = ( unsigned char *) malloc (sizeof (unsigned char) * 400);
+
+  for (i = 0; i < 300; i++)
+    ReadPtr[i] = (unsigned char) (100 - i);
+
+  for (i = 0; i < 100; i++)
+     {
+       xr = *ReadPtr++;
+       xg = *ReadPtr++;
+       xb = *ReadPtr++;
+
+       xc = (unsigned char) (RGBMAX - xr);
+       xm = (unsigned char) (RGBMAX - xg);
+       xy = (unsigned char) (RGBMAX - xb);
+
+       if (xc < xm)
+         {
+           xk = (unsigned char) (xc < xy ? xc : xy);
+         }
+       else
+        {
+          xk = (unsigned char) (xm < xy ? xm : xy);
+        }
+
+       xc = (unsigned char) (xc - xk);
+       xm = (unsigned char) (xm - xk);
+       xy = (unsigned char) (xy - xk);
+
+       *EritePtr++ = xc;
+       *EritePtr++ = xm;
+       *EritePtr++ = xy;
+       *EritePtr++ = xk;
+       sum += EritePtr[-1];  /* The K byte that was just stored.  */
+    }
+  return sum;
+}
+
+int
+main()
+{
+  if (test() != 144)  /* Sum of K over all 100 pixels, modulo 256.  */
+    abort();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c b/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
new file mode 100644
index 0000000..19f277c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
@@ -0,0 +1,62 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-path_split" } */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define RGBMAX 255
+
+int
+test()
+{
+  int i, Pels;  /* Pels is never used.  */
+  unsigned char sum = 0;
+  unsigned char xr, xg, xb;
+  unsigned char xc, xm, xy, xk;
+  unsigned char *ReadPtr, *EritePtr;
+  /* Fixture for the path_split dump scans; both buffers hold 100 bytes.  */
+  ReadPtr = (unsigned char *) malloc (sizeof (unsigned char) * 100);
+  EritePtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
+
+  for (i = 0; i < 100;i++)
+     {
+       ReadPtr[i] = 100 - i;
+     }
+  /* NOTE(review): 100 iterations read 300 bytes and store 400 bytes.  */
+  for (i = 0; i < 100; i++)
+     {
+       xr = *ReadPtr++;
+       xg = *ReadPtr++;
+       xb = *ReadPtr++;
+
+       xc = ( unsigned char) (RGBMAX - xr);
+       xm = ( unsigned char) (RGBMAX - xg);
+       xy = ( unsigned char) (RGBMAX - xb);
+       /* xk becomes the smallest of xc, xm and xy.  */
+       if (xc < xm)
+         {
+           xk = ( unsigned char) (xc < xy ? xc : xy);
+         }
+       else
+         {
+           xk = ( unsigned char) (xm < xy ? xm : xy);
+         }
+
+       xc = (unsigned char) (xc - xk);
+       xm = (unsigned char) (xm - xk);
+       xy = (unsigned char) (xy - xk);
+
+       *EritePtr++ = xc;
+       *EritePtr++ = xm;
+       *EritePtr++ = xy;
+       *EritePtr++ = xk;
+       sum += *EritePtr;  /* Reads the byte after the last store.  */
+    }
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump "xc_[0-9][0-9]* -> { xc_[0-9][0-9]* }" "path_split"} } */
+/* { dg-final { scan-tree-dump "xm_[0-9][0-9]* -> { xm_[0-9][0-9]* }" "path_split"} } */
+/* { dg-final { scan-tree-dump "xy_[0-9][0-9]* -> { xy_[0-9][0-9]* }" "path_split"} } */
+/* { dg-final { scan-tree-dump "Merging blocks" "path_split"} } */
+/* { dg-final { cleanup-tree-dump "path_split" } } */
diff --git a/gcc/timevar.def b/gcc/timevar.def
index 711bbed..6217a8e 100644
--- a/gcc/timevar.def
+++ b/gcc/timevar.def
@@ -288,3 +288,4 @@ DEFTIMEVAR (TV_JIT_REPLAY	     , "replay of JIT client activity")
 DEFTIMEVAR (TV_ASSEMBLE	     , "assemble JIT code")
 DEFTIMEVAR (TV_LINK		     , "link JIT code")
 DEFTIMEVAR (TV_LOAD		     , "load JIT result")
+DEFTIMEVAR (TV_TREE_PATH_SPLIT  , "tree path_split")
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 398ab83..e00639e 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -379,6 +379,7 @@ extern gimple_opt_pass *make_pass_iv_optimize (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_tree_loop_done (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_ch (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_ccp (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_path_split (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_phi_only_cprop (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_build_ssa (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_build_alias (gcc::context *ctxt);
diff --git a/gcc/tree-ssa-path-split.c b/gcc/tree-ssa-path-split.c
new file mode 100644
index 0000000..3da7791
--- /dev/null
+++ b/gcc/tree-ssa-path-split.c
@@ -0,0 +1,462 @@
+/* Support routines for Path Splitting.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   Contributed by Ajit Kumar Agarwal <ajitkum@xilinx.com>.
+ 
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "flags.h"
+#include "tree.h"
+#include "stor-layout.h"
+#include "calls.h"
+#include "predict.h"
+#include "vec.h"
+#include "hashtab.h"
+#include "hash-set.h"
+#include "machmode.h"
+#include "hard-reg-set.h"
+#include "input.h"
+#include "function.h"
+#include "dominance.h"
+#include "cfg.h"
+#include "cfganal.h"
+#include "basic-block.h"
+#include "tree-ssa-alias.h"
+#include "internal-fn.h"
+#include "gimple-fold.h"
+#include "tree-eh.h"
+#include "gimple-expr.h"
+#include "is-a.h"
+#include "gimple.h"
+#include "gimple-iterator.h"
+#include "gimple-walk.h"
+#include "gimple-ssa.h"
+#include "tree-cfg.h"
+#include "tree-phinodes.h"
+#include "ssa-iterators.h"
+#include "stringpool.h"
+#include "tree-ssanames.h"
+#include "tree-ssa-loop-manip.h"
+#include "tree-ssa-loop-niter.h"
+#include "tree-ssa-loop.h"
+#include "tree-into-ssa.h"
+#include "tree-ssa.h"
+#include "tree-pass.h"
+#include "tree-dump.h"
+#include "gimple-pretty-print.h"
+#include "diagnostic-core.h"
+#include "intl.h"
+#include "cfgloop.h"
+#include "tree-scalar-evolution.h"
+#include "tree-ssa-propagate.h"
+#include "tree-chrec.h"
+#include "tree-ssa-threadupdate.h"
+#include "expr.h"
+#include "insn-codes.h"
+#include "optabs.h"
+#include "tree-ssa-threadedge.h"
+#include "wide-int.h"
+
+/* Replace every use of USE1 in the statements of basic block B with
+   USE2.  If USE1 is NULL_TREE, every SSA use operand in B is replaced.
+   This is called from replace_uses_phi on the copy of the join block
+   that must use the second phi argument, after all uses of the phi
+   result have been rewritten to the first phi argument.  */
+
+static void
+replace_use_phi_operand1_with_operand2 (basic_block b,
+                                        tree use1,
+                                        tree use2)
+{
+  use_operand_p use;
+  ssa_op_iter iter;
+  gimple_stmt_iterator gsi;
+
+  for (gsi = gsi_start_bb (b); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gimple stmt = gsi_stmt (gsi);
+      bool changed = false;
+
+      FOR_EACH_SSA_USE_OPERAND (use, stmt, iter, SSA_OP_USE)
+        if (use1 == USE_FROM_PTR (use) || use1 == NULL_TREE)
+          {
+            propagate_value (use, use2);
+            changed = true;
+          }
+      if (changed)  /* Rebuild operands only after the walk is done.  */
+        update_stmt (stmt);
+    }
+}
+
+/* Rewrite every use of the result of the first phi node of join block B
+   to the first phi argument, then rewrite the statements of TEMP_BB (the
+   copy of the join block wired into the other predecessor) to use the
+   second phi argument instead.  Do nothing if B has no phi node.  */
+ 
+static void
+replace_uses_phi (basic_block b, basic_block temp_bb)
+{
+  gimple phi = gimple_seq_first_stmt (phi_nodes (b));
+  if (!phi)
+    return;
+
+  tree def = gimple_phi_result (phi), use = gimple_phi_arg_def (phi,0);
+  tree use2 = gimple_phi_arg_def (phi,1);
+
+  if (virtual_operand_p (def))
+    {
+      imm_use_iterator iter;
+      use_operand_p use_p;
+      gimple stmt;
+
+      FOR_EACH_IMM_USE_STMT (stmt, iter, def)
+        FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
+          SET_USE (use_p, use);
+      if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (def))
+        SSA_NAME_OCCURS_IN_ABNORMAL_PHI (use) = 1;
+    }
+   else
+     replace_uses_by (def, use);
+   replace_use_phi_operand1_with_operand2 (temp_bb, use, use2);
+}
+
+/* Return true if basic block BB contains a label statement or a call
+   statement; return false otherwise.  */
+
+static bool
+is_block_has_label_call (basic_block bb)
+{
+  gimple_stmt_iterator it = gsi_start_bb (bb);
+
+  while (!gsi_end_p (it))
+    {
+      gimple cur = gsi_stmt (it);
+
+      /* Either kind of statement is enough to answer yes.  */
+      if (dyn_cast <glabel *> (cur) || is_gimple_call (cur))
+        return true;
+
+      gsi_next (&it);
+    }
+  return false;
+}
+
+/* Perform the feasibility tests for path splitting.  JOIN_NODE is
+   the candidate join block and PRED1 and PRED2 are its predecessors.
+   Return true if path splitting can be performed and false otherwise.
+   The following feasibility tests are performed.
+ 
+   1. Return false if the join block has call gimple statements.
+   2. Return false if the join block has rhs casting for assign
+      gimple statements.
+   3. Return false unless the join block has exactly one phi node
+      and the result of that phi node is not a virtual operand.
+   4. Return false if PRED1 or PRED2 is NULL, or if the join block
+      has fewer than 3 non-debug statements.
+   5. Return false if either predecessor block has a label or a
+      call statement.
+   6. Return false if the result of a phi node of the join block
+      is used outside the join block.
+   7. Otherwise return true.  */
+
+static bool
+is_feasible_path_splitting (basic_block join_node, basic_block pred1,
+                           basic_block pred2)
+{
+  int num_stmt = 0, num_phis = 0;
+  gimple_stmt_iterator psi, gsi;
+
+  for (gsi = gsi_start_bb (join_node); !gsi_end_p (gsi); gsi_next (&gsi))
+     {
+       gimple stmt = gsi_stmt(gsi);
+
+       if (gimple_assign_cast_p (stmt))
+         return false;
+
+       if (is_gimple_call (stmt))
+         return false;
+
+       if (!is_gimple_debug(stmt))
+         {
+           num_stmt++;
+         }
+     }
+
+   if (pred1 && pred2 && (num_stmt > 2))
+     {
+       bool found_virtual_result = false;
+
+       for (psi = gsi_start_phis (join_node); !gsi_end_p (psi); )
+          {
+            use_operand_p use_p;
+            imm_use_iterator iter;
+            gimple stmt = gsi_stmt(psi);
+
+            if (!virtual_operand_p (gimple_phi_result (stmt)))
+              num_phis++;
+            else
+              found_virtual_result = true;
+
+            FOR_EACH_IMM_USE_FAST (use_p, iter, gimple_phi_result (stmt))
+            {
+              gimple use_stmt = USE_STMT (use_p);
+
+              if (gimple_bb (use_stmt) != join_node)
+                return false;
+            }
+
+            gsi_next(&psi);
+         }
+       /* The transformation reads exactly one non-virtual phi node.  */
+       if (num_phis != 1 || found_virtual_result)
+          return false;
+
+       if(is_block_has_label_call(pred1) || is_block_has_label_call(pred2))
+         return false;
+
+       return true;
+    }
+  return false;
+}
+
+/* Set B as the containing basic block of every statement in B.  Used
+   after statements from another block have been inserted into B.  */
+
+static void
+update_stmt_bb (basic_block b)
+{
+  gimple_stmt_iterator it = gsi_start_bb (b);
+  while (!gsi_end_p (it))
+    {
+      gimple_set_bb (gsi_stmt (it), b);
+      gsi_next (&it);
+    }
+}
+
+/* Look through the blocks dominated by BB for a block that has an
+   outgoing back edge (the source of a loop latch edge) and exactly
+   two predecessors; return it, or NULL if there is none.  PRED1 and
+   PRED2 receive its first and second non-back-edge predecessor, each
+   only if that predecessor has a single fallthru successor edge.  */
+
+static basic_block
+get_join_blk_same_as_loop_latch (basic_block bb,
+                                 basic_block &pred1,
+                                 basic_block &pred2)
+{
+  vec<basic_block> bbs;
+  basic_block bb1;
+  unsigned int i;
+  edge_iterator ei;
+  edge e1;
+  bool found = false ,found1;
+  bbs = get_all_dominated_blocks (CDI_DOMINATORS,
+                                  bb );
+  FOR_EACH_VEC_ELT (bbs, i, bb1)
+  {
+    found1 = false;  /* Set if BB1 is a direct successor of BB.  */
+    FOR_EACH_EDGE (e1, ei, bb->succs)
+    {
+      if ( bb1 == e1->dest)
+        {
+          found = true;
+          found1 = true;
+        }
+    }
+    if (!found1 && found)  /* Not a successor; FOUND carries over.  */
+      {
+        found = false;  /* Reused: does BB1 have an outgoing back edge?  */
+        FOR_EACH_EDGE (e1, ei, bb1->succs)
+        {
+          if (e1->flags & (EDGE_DFS_BACK))
+            found = true;
+        }
+        /* Only a latch source with exactly two predecessors qualifies.  */
+        if (found && EDGE_COUNT(bb1->preds) == 2)
+          {
+            unsigned int k = 0;  /* Counts the non-back-edge predecessors.  */
+            FOR_EACH_EDGE (e1, ei, bb1->preds)
+            {
+              if ((e1->flags & (EDGE_DFS_BACK)))
+                continue;
+              /* The second such predecessor is PRED2, the first PRED1.  */
+              if ( k == 1)
+                {
+                  if (single_succ_p(e1->src) &&
+                      single_succ_edge (e1->src)->flags & EDGE_FALLTHRU)
+                    {
+                      pred2 = e1->src;
+                    }
+                }
+                else
+                  {
+                    if (single_succ_p(e1->src) &&
+                        single_succ_edge (e1->src)->flags & EDGE_FALLTHRU)
+                      {
+                        pred1 = e1->src;
+                      }
+                  }
+                k++;
+            }
+            bbs.release();  /* BBS is released on both return paths.  */
+            return bb1;
+          }
+       }
+   }
+   bbs.release();
+   return NULL;
+}
+
+/* This is the core function to perform path splitting for BB.  The
+   join block that is also the source of a loop latch edge is
+   identified along with its predecessors.  If the feasibility tests
+   pass, the join block is duplicated, the successor edge of each
+   predecessor is split, and the statements of the join block and of
+   its duplicate are moved into the two new blocks.  The phi result
+   is then replaced by the corresponding phi argument in each of the
+   two copies.
+ 
+   The pass requests TODO_cleanup_cfg and TODO_update_ssa, which are
+   expected to merge the new blocks into the predecessors and to
+   update the SSA representation afterwards.  */
+ 
+static void
+perform_path_splitting (basic_block bb)
+{
+  basic_block pred1 = NULL, pred2 = NULL, join_block = NULL;
+
+  join_block = get_join_blk_same_as_loop_latch (bb, pred1, pred2);
+
+  if (join_block  && 
+      is_feasible_path_splitting (join_block, pred1, pred2))
+    {
+      basic_block new_bb1 = NULL, new_bb2 = NULL;
+      gimple_stmt_iterator last;
+      basic_block temp_bb = NULL;
+      edge_iterator ei;
+      edge e1;
+
+      temp_bb = duplicate_block (join_block, NULL, NULL);
+      /* Put a new block on the outgoing edge(s) of each predecessor.  */
+      FOR_EACH_EDGE (e1, ei, pred1->succs)
+        new_bb1 = split_edge (e1);
+
+      FOR_EACH_EDGE (e1, ei, pred2->succs)
+        new_bb2 = split_edge (e1);
+      /* Insert the original statements in NEW_BB1, the copies in NEW_BB2.  */
+      last = gsi_start_bb (new_bb1);
+      gsi_insert_seq_after (&last, bb_seq (join_block), GSI_NEW_STMT);
+      last = gsi_start_bb (new_bb2);
+      gsi_insert_seq_after (&last, bb_seq (temp_bb), GSI_NEW_STMT);
+      update_stmt_bb (new_bb1);
+      update_stmt_bb (new_bb2);
+      /* Rewrite uses of the phi result; NEW_BB2 gets the second argument.  */
+      replace_uses_phi (join_block, new_bb2);
+      /* Clear both statement sequences, then delete the duplicate block.  */
+      set_bb_seq (join_block, NULL);
+      set_bb_seq(temp_bb,NULL);
+      delete_basic_block (temp_bb);
+      return;
+    }
+}
+
+/* Main entry point of the path splitting pass.  Walk the basic blocks
+   of the current function and attempt path splitting for every block
+   that ends in a GIMPLE_COND comparing an integral or pointer SSA name
+   against an SSA name or an invariant.  Always return 0.  */
+
+static unsigned int
+execute_path_split (void)
+{
+  basic_block bb;
+
+  loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS);
+  initialize_original_copy_tables();
+
+  /* Only dominators are consulted by this pass, so post-dominators
+     are not computed.  */
+  calculate_dominance_info (CDI_DOMINATORS);
+
+  /* Set EDGE_DFS_BACK on back edges; it is used to find loop latches.  */
+  mark_dfs_back_edges ();
+
+  FOR_EACH_BB_FN (bb, cfun)
+  {
+    /* We only care about blocks ending in a GIMPLE_COND.  */
+    gimple last = gsi_stmt (gsi_last_bb (bb));
+
+    /* We're looking for a conditional with integral or pointer type
+       arguments.  Note the type of the second argument will be the
+       same as the first argument, so no need to check it explicitly.  */
+    if (last
+        && gimple_code (last) == GIMPLE_COND
+        && TREE_CODE (gimple_cond_lhs (last)) == SSA_NAME
+        && (INTEGRAL_TYPE_P (TREE_TYPE (gimple_cond_lhs (last)))
+            || POINTER_TYPE_P (TREE_TYPE (gimple_cond_lhs (last))))
+        && (TREE_CODE (gimple_cond_rhs (last)) == SSA_NAME
+            || is_gimple_min_invariant (gimple_cond_rhs (last))))
+      {
+        /* NOTE(review): this changes the CFG while it is being walked.  */
+        perform_path_splitting (bb);
+      }
+  }
+
+  loop_optimizer_finalize ();
+  free_original_copy_tables ();
+  free_dominance_info (CDI_DOMINATORS);
+  return 0;
+}
+
+namespace {
+/* Static description of the path_split pass.  */
+const pass_data pass_data_path_split =
+{
+   GIMPLE_PASS, /* type */
+   "path_split", /* name */
+    OPTGROUP_NONE, /* optinfo_flags */
+    TV_TREE_PATH_SPLIT, /* tv_id */
+    PROP_ssa, /* properties_required */
+    0, /* properties_provided */
+    0, /* properties_destroyed */
+    0, /* todo_flags_start */
+    ( TODO_cleanup_cfg | TODO_update_ssa ), /* todo_flags_finish */
+};
+/* The pass object; it runs only when flag_tree_path_split is nonzero.  */
+class pass_path_split : public gimple_opt_pass
+{
+   public:
+    pass_path_split (gcc::context *ctxt)
+      : gimple_opt_pass (pass_data_path_split, ctxt)
+    {}
+ 
+   /* opt_pass methods: clone, gate on the flag, run execute_path_split.  */
+   opt_pass * clone () { return new pass_path_split (m_ctxt); }
+   virtual bool gate (function *) { return flag_tree_path_split != 0; }
+   virtual unsigned int execute (function *) { return execute_path_split (); }
+ 
+}; // class pass_path_split
+
+} // anon namespace
+/* Create a new instance of the path_split pass for context CTXT.  */
+gimple_opt_pass *
+make_pass_path_split (gcc::context *ctxt)
+{
+  return new pass_path_split (ctxt);
+}
-- 
1.8.2.1

Thanks & Regards
Ajit

[-- Attachment #2: 0001-Patch-tree-optimization-Add-new-path-Splitting-pass-.patch --]
[-- Type: application/octet-stream, Size: 22044 bytes --]

From 676a3fb64b5b13a6ae0e4a22dadce85a8bf80bae Mon Sep 17 00:00:00 2001
From: Ajit Kumar Agarwal <ajitkum@xgolinux2.xilinx.com>
Date: Tue, 30 Jun 2015 09:39:29 +0200
Subject: [PATCH] [Patch,tree-optimization]: Add new path Splitting pass on
 tree ssa representation.

Add a new path splitting pass on the tree SSA representation. The path
splitting optimization is a CFG transformation in which the join block of
an if-then-else that is also the loop latch node is moved into, and merged
with, its predecessor blocks while preserving the SSA representation.

ChangeLog:
2015-06-30  Ajit Agarwal  <ajitkum@xilinx.com>

	* gcc/Makefile.in: Add the build of the new file
	tree-ssa-path-split.c
	* gcc/common.opt: Add the new flag ftree-path-split.
	* gcc/opts.c: Add an entry for Path splitting pass
	with optimization flag greater and equal to O2.
	* gcc/passes.def: Enable and add new pass path splitting.
	* gcc/timevar.def: Add the new entry for TV_TREE_PATH_SPLIT.
	* gcc/tree-pass.h: Extern Declaration of make_pass_path_split.
	* gcc/tree-ssa-path-split.c: New file for path splitting pass.
	* gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c: New testcase.
	* gcc/testsuite/gcc.dg/path-split-1.c: New testcase.

Signed-off-by:Ajit Agarwal ajitkum@xilinx.com.
---
 gcc/Makefile.in                              |   1 +
 gcc/common.opt                               |   4 +
 gcc/opts.c                                   |   1 +
 gcc/passes.def                               |   1 +
 gcc/testsuite/gcc.dg/path-split-1.c          |  65 ++++
 gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c |  62 ++++
 gcc/timevar.def                              |   1 +
 gcc/tree-pass.h                              |   1 +
 gcc/tree-ssa-path-split.c                    | 462 +++++++++++++++++++++++++++
 9 files changed, 598 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/path-split-1.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
 create mode 100644 gcc/tree-ssa-path-split.c

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 5f9261f..35ac363 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1476,6 +1476,7 @@ OBJS = \
 	tree-vect-slp.o \
 	tree-vectorizer.o \
 	tree-vrp.o \
+        tree-ssa-path-split.o \
 	tree.o \
 	valtrack.o \
 	value-prof.o \
diff --git a/gcc/common.opt b/gcc/common.opt
index e104269..c63b100 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -2328,6 +2328,10 @@ ftree-vrp
 Common Report Var(flag_tree_vrp) Init(0) Optimization
 Perform Value Range Propagation on trees
 
+ftree-path-split
+Common Report Var(flag_tree_path_split) Init(0) Optimization
+Perform Path Splitting
+
 funit-at-a-time
 Common Report Var(flag_unit_at_a_time) Init(1) Optimization
 Compile whole compilation unit at a time
diff --git a/gcc/opts.c b/gcc/opts.c
index 8a16116..31947ff 100644
--- a/gcc/opts.c
+++ b/gcc/opts.c
@@ -508,6 +508,7 @@ static const struct default_options default_options_table[] =
     { OPT_LEVELS_2_PLUS, OPT_fisolate_erroneous_paths_dereference, NULL, 1 },
     { OPT_LEVELS_2_PLUS, OPT_fipa_ra, NULL, 1 },
     { OPT_LEVELS_2_PLUS, OPT_flra_remat, NULL, 1 },
+    { OPT_LEVELS_2_PLUS, OPT_ftree_path_split, NULL, 1 },
 
     /* -O3 optimizations.  */
     { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 },
diff --git a/gcc/passes.def b/gcc/passes.def
index c0ddee4..43618eb 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -155,6 +155,7 @@ along with GCC; see the file COPYING3.  If not see
       NEXT_PASS (pass_ccp);
       /* After CCP we rewrite no longer addressed locals into SSA
 	 form if possible.  */
+      NEXT_PASS (pass_path_split);
       NEXT_PASS (pass_copy_prop);
       NEXT_PASS (pass_complete_unrolli);
       NEXT_PASS (pass_phiprop);
diff --git a/gcc/testsuite/gcc.dg/path-split-1.c b/gcc/testsuite/gcc.dg/path-split-1.c
new file mode 100644
index 0000000..075dc87
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/path-split-1.c
@@ -0,0 +1,65 @@
+/* { dg-do run } */
+/* { dg-options "-O2 " } */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define RGBMAX 255
+
+/* Convert 100 RGB triples to CMYK; return the sum of the K bytes mod 256.  */
+int
+test()
+{
+  int i;
+  unsigned char sum = 0;
+  unsigned char xr, xg, xb;
+  unsigned char xc, xm, xy, xk;
+  unsigned char *ReadPtr, *EritePtr;
+
+  /* Each of the 100 iterations reads 3 bytes and stores 4 bytes.  */
+  ReadPtr = ( unsigned char *) malloc (sizeof (unsigned char) * 300);
+  EritePtr = ( unsigned char *) malloc (sizeof (unsigned char) * 400);
+
+  for (i = 0; i < 300; i++)
+    ReadPtr[i] = (unsigned char) (100 - i);
+
+  for (i = 0; i < 100; i++)
+     {
+       xr = *ReadPtr++;
+       xg = *ReadPtr++;
+       xb = *ReadPtr++;
+
+       xc = (unsigned char) (RGBMAX - xr);
+       xm = (unsigned char) (RGBMAX - xg);
+       xy = (unsigned char) (RGBMAX - xb);
+
+       if (xc < xm)
+         {
+           xk = (unsigned char) (xc < xy ? xc : xy);
+         }
+       else
+        {
+          xk = (unsigned char) (xm < xy ? xm : xy);
+        }
+
+       xc = (unsigned char) (xc - xk);
+       xm = (unsigned char) (xm - xk);
+       xy = (unsigned char) (xy - xk);
+
+       *EritePtr++ = xc;
+       *EritePtr++ = xm;
+       *EritePtr++ = xy;
+       *EritePtr++ = xk;
+       sum += EritePtr[-1];  /* The K byte that was just stored.  */
+    }
+  return sum;
+}
+
+int
+main()
+{
+  if (test() != 144)  /* Sum of K over all 100 pixels, modulo 256.  */
+    abort();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c b/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
new file mode 100644
index 0000000..19f277c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
@@ -0,0 +1,62 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-path_split" } */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define RGBMAX 255
+
+int
+test()
+{
+  int i, Pels;  /* Pels is never used.  */
+  unsigned char sum = 0;
+  unsigned char xr, xg, xb;
+  unsigned char xc, xm, xy, xk;
+  unsigned char *ReadPtr, *EritePtr;
+  /* Fixture for the path_split dump scans; both buffers hold 100 bytes.  */
+  ReadPtr = (unsigned char *) malloc (sizeof (unsigned char) * 100);
+  EritePtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
+
+  for (i = 0; i < 100;i++)
+     {
+       ReadPtr[i] = 100 - i;
+     }
+  /* NOTE(review): 100 iterations read 300 bytes and store 400 bytes.  */
+  for (i = 0; i < 100; i++)
+     {
+       xr = *ReadPtr++;
+       xg = *ReadPtr++;
+       xb = *ReadPtr++;
+
+       xc = ( unsigned char) (RGBMAX - xr);
+       xm = ( unsigned char) (RGBMAX - xg);
+       xy = ( unsigned char) (RGBMAX - xb);
+       /* xk becomes the smallest of xc, xm and xy.  */
+       if (xc < xm)
+         {
+           xk = ( unsigned char) (xc < xy ? xc : xy);
+         }
+       else
+         {
+           xk = ( unsigned char) (xm < xy ? xm : xy);
+         }
+
+       xc = (unsigned char) (xc - xk);
+       xm = (unsigned char) (xm - xk);
+       xy = (unsigned char) (xy - xk);
+
+       *EritePtr++ = xc;
+       *EritePtr++ = xm;
+       *EritePtr++ = xy;
+       *EritePtr++ = xk;
+       sum += *EritePtr;  /* Reads the byte after the last store.  */
+    }
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump "xc_[0-9][0-9]* -> { xc_[0-9][0-9]* }" "path_split"} } */
+/* { dg-final { scan-tree-dump "xm_[0-9][0-9]* -> { xm_[0-9][0-9]* }" "path_split"} } */
+/* { dg-final { scan-tree-dump "xy_[0-9][0-9]* -> { xy_[0-9][0-9]* }" "path_split"} } */
+/* { dg-final { scan-tree-dump "Merging blocks" "path_split"} } */
+/* { dg-final { cleanup-tree-dump "path_split" } } */
diff --git a/gcc/timevar.def b/gcc/timevar.def
index 711bbed..6217a8e 100644
--- a/gcc/timevar.def
+++ b/gcc/timevar.def
@@ -288,3 +288,4 @@ DEFTIMEVAR (TV_JIT_REPLAY	     , "replay of JIT client activity")
 DEFTIMEVAR (TV_ASSEMBLE	     , "assemble JIT code")
 DEFTIMEVAR (TV_LINK		     , "link JIT code")
 DEFTIMEVAR (TV_LOAD		     , "load JIT result")
+DEFTIMEVAR (TV_TREE_PATH_SPLIT  , "tree path_split")
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 398ab83..e00639e 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -379,6 +379,7 @@ extern gimple_opt_pass *make_pass_iv_optimize (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_tree_loop_done (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_ch (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_ccp (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_path_split (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_phi_only_cprop (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_build_ssa (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_build_alias (gcc::context *ctxt);
diff --git a/gcc/tree-ssa-path-split.c b/gcc/tree-ssa-path-split.c
new file mode 100644
index 0000000..3da7791
--- /dev/null
+++ b/gcc/tree-ssa-path-split.c
@@ -0,0 +1,462 @@
+/* Support routines for Path Splitting.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   Contributed by Ajit Kumar Agarwal <ajitkum@xilinx.com>.
+ 
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "flags.h"
+#include "tree.h"
+#include "stor-layout.h"
+#include "calls.h"
+#include "predict.h"
+#include "vec.h"
+#include "hashtab.h"
+#include "hash-set.h"
+#include "machmode.h"
+#include "hard-reg-set.h"
+#include "input.h"
+#include "function.h"
+#include "dominance.h"
+#include "cfg.h"
+#include "cfganal.h"
+#include "basic-block.h"
+#include "tree-ssa-alias.h"
+#include "internal-fn.h"
+#include "gimple-fold.h"
+#include "tree-eh.h"
+#include "gimple-expr.h"
+#include "is-a.h"
+#include "gimple.h"
+#include "gimple-iterator.h"
+#include "gimple-walk.h"
+#include "gimple-ssa.h"
+#include "tree-cfg.h"
+#include "tree-phinodes.h"
+#include "ssa-iterators.h"
+#include "stringpool.h"
+#include "tree-ssanames.h"
+#include "tree-ssa-loop-manip.h"
+#include "tree-ssa-loop-niter.h"
+#include "tree-ssa-loop.h"
+#include "tree-into-ssa.h"
+#include "tree-ssa.h"
+#include "tree-pass.h"
+#include "tree-dump.h"
+#include "gimple-pretty-print.h"
+#include "diagnostic-core.h"
+#include "intl.h"
+#include "cfgloop.h"
+#include "tree-scalar-evolution.h"
+#include "tree-ssa-propagate.h"
+#include "tree-chrec.h"
+#include "tree-ssa-threadupdate.h"
+#include "expr.h"
+#include "insn-codes.h"
+#include "optabs.h"
+#include "tree-ssa-threadedge.h"
+#include "wide-int.h"
+
+/* Replace every use of USE1 by USE2 in the statements of basic block B.
+   If USE1 is NULL_TREE, every SSA use operand in B is replaced by USE2.
+
+   This is called from replace_uses_phi on the copy of the join block
+   wired into the second predecessor, to turn the uses of the first phi
+   argument into uses of the second phi argument.  */
+
+static void
+replace_use_phi_operand1_with_operand2 (basic_block b,
+                                        tree use1,
+                                        tree use2)
+{
+  gimple_stmt_iterator gsi;
+
+  for (gsi = gsi_start_bb (b); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gimple stmt = gsi_stmt (gsi);
+      use_operand_p use;
+      ssa_op_iter iter;
+      bool changed = false;
+
+      FOR_EACH_SSA_USE_OPERAND (use, stmt, iter, SSA_OP_USE)
+        {
+          if (use1 == NULL_TREE || USE_FROM_PTR (use) == use1)
+            {
+              propagate_value (use, use2);
+              changed = true;
+            }
+        }
+
+      /* update_stmt rebuilds the operand list of STMT, so it must not
+         run while ITER is still walking that list.  Call it once, after
+         all operands of the statement have been visited.  */
+      if (changed)
+        update_stmt (stmt);
+    }
+}
+
+/* Propagate the arguments of the first phi node of join block B into
+   the uses of its result.  Every use of the phi result is first
+   replaced by the first phi argument; then, in TEMP_BB (the copy of the
+   join block wired into the other predecessor), the uses of the first
+   argument are replaced by the second phi argument.  */
+
+static void
+replace_uses_phi (basic_block b, basic_block temp_bb)
+{
+  gimple phi = gimple_seq_first_stmt (phi_nodes (b));
+  tree result = gimple_phi_result (phi);
+  tree arg0 = gimple_phi_arg_def (phi, 0);
+  tree arg1 = gimple_phi_arg_def (phi, 1);
+
+  if (!virtual_operand_p (result))
+    replace_uses_by (result, arg0);
+  else
+    {
+      /* Virtual operands are rewritten by hand, one immediate use at a
+         time.  */
+      imm_use_iterator imm_iter;
+      use_operand_p use_p;
+      gimple use_stmt;
+
+      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, result)
+        FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
+          SET_USE (use_p, arg0);
+
+      /* Carry the abnormal-phi marking over to the replacement name.  */
+      if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (result))
+        SSA_NAME_OCCURS_IN_ABNORMAL_PHI (arg0) = 1;
+    }
+
+  replace_use_phi_operand1_with_operand2 (temp_bb, arg0, arg1);
+}
+
+/* Return true if basic block BB contains a label or a call statement,
+   false otherwise.  */
+
+static bool
+is_block_has_label_call (basic_block bb)
+{
+  gimple_stmt_iterator gsi = gsi_start_bb (bb);
+
+  while (!gsi_end_p (gsi))
+    {
+      gimple stmt = gsi_stmt (gsi);
+
+      if (dyn_cast <glabel *> (stmt) || is_gimple_call (stmt))
+        return true;
+
+      gsi_next (&gsi);
+    }
+
+  return false;
+}
+
+/* Return true if path splitting is feasible for JOIN_NODE with
+   predecessors PRED1 and PRED2, false otherwise.  False is returned
+   when any of the following holds:
+
+   1. JOIN_NODE contains a call statement.
+   2. JOIN_NODE contains an assignment whose rhs is a cast.
+   3. PRED1 or PRED2 is NULL.
+   4. JOIN_NODE has 2 or fewer non-debug statements.
+   5. The result of a phi node of JOIN_NODE is used by a statement
+      outside JOIN_NODE.
+   6. JOIN_NODE has more than one phi node with a non-virtual result,
+      or any phi node with a virtual result.
+   7. PRED1 or PRED2 contains a label or a call statement.  */
+
+static bool
+is_feasible_path_splitting (basic_block join_node, basic_block pred1,
+                           basic_block pred2)
+{
+  gimple_stmt_iterator gsi;
+  int num_stmt = 0;
+  int num_phis = 0;
+  bool found_virtual_result = false;
+
+  /* Scan the statements of the join block, counting the non-debug
+     ones.  */
+  for (gsi = gsi_start_bb (join_node); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gimple stmt = gsi_stmt (gsi);
+
+      if (gimple_assign_cast_p (stmt) || is_gimple_call (stmt))
+        return false;
+
+      if (!is_gimple_debug (stmt))
+        num_stmt++;
+    }
+
+  if (!pred1 || !pred2 || num_stmt <= 2)
+    return false;
+
+  /* Scan the phi nodes of the join block.  */
+  for (gsi = gsi_start_phis (join_node); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      tree result = gimple_phi_result (gsi_stmt (gsi));
+      use_operand_p use_p;
+      imm_use_iterator iter;
+
+      if (virtual_operand_p (result))
+        found_virtual_result = true;
+      else
+        num_phis++;
+
+      /* Every use of the phi result must be inside the join block.  */
+      FOR_EACH_IMM_USE_FAST (use_p, iter, result)
+        {
+          if (gimple_bb (USE_STMT (use_p)) != join_node)
+            return false;
+        }
+    }
+
+  if (num_phis > 1 || found_virtual_result)
+    return false;
+
+  return !is_block_has_label_call (pred1)
+         && !is_block_has_label_call (pred2);
+}
+
+/* Record B as the containing basic block of every statement in the
+   statement sequence of B.  */
+
+static void
+update_stmt_bb (basic_block b)
+{
+  gimple_stmt_iterator gsi = gsi_start_bb (b);
+
+  while (!gsi_end_p (gsi))
+    {
+      gimple_set_bb (gsi_stmt (gsi), b);
+      gsi_next (&gsi);
+    }
+}
+
+/* Search the blocks dominated by BB for a join block that is also the
+   source of a loop back edge, and return it, or NULL if there is none.
+
+   Blocks that are direct successors of BB are skipped.  The first
+   later block that has an outgoing DFS back edge and exactly two
+   predecessors is returned.  For each of its non-back-edge
+   predecessors that has a single fallthru successor edge, the
+   predecessor is stored into PRED1 (first such edge) or PRED2 (second
+   such edge); PRED1 and PRED2 are left untouched otherwise.  */
+
+static basic_block
+get_join_blk_same_as_loop_latch (basic_block bb,
+                                 basic_block &pred1,
+                                 basic_block &pred2)
+{
+  vec<basic_block> dominated
+    = get_all_dominated_blocks (CDI_DOMINATORS, bb);
+  basic_block cand;
+  unsigned int i;
+  edge_iterator ei;
+  edge e;
+  /* True once a successor of BB has been seen; afterwards it tracks
+     whether the last examined candidate had an outgoing back edge.  */
+  bool armed = false;
+
+  FOR_EACH_VEC_ELT (dominated, i, cand)
+    {
+      bool is_succ = false;
+      bool has_back_edge = false;
+
+      FOR_EACH_EDGE (e, ei, bb->succs)
+        {
+          if (e->dest == cand)
+            is_succ = true;
+        }
+
+      if (is_succ)
+        {
+          armed = true;
+          continue;
+        }
+
+      if (!armed)
+        continue;
+
+      FOR_EACH_EDGE (e, ei, cand->succs)
+        {
+          if (e->flags & EDGE_DFS_BACK)
+            has_back_edge = true;
+        }
+      armed = has_back_edge;
+
+      if (!has_back_edge || EDGE_COUNT (cand->preds) != 2)
+        continue;
+
+      /* CAND is the join block.  Record its forward predecessors.  */
+      unsigned int k = 0;
+      FOR_EACH_EDGE (e, ei, cand->preds)
+        {
+          if (e->flags & EDGE_DFS_BACK)
+            continue;
+
+          if (single_succ_p (e->src)
+              && (single_succ_edge (e->src)->flags & EDGE_FALLTHRU))
+            {
+              if (k == 1)
+                pred2 = e->src;
+              else
+                pred1 = e->src;
+            }
+          k++;
+        }
+
+      dominated.release ();
+      return cand;
+    }
+
+  dominated.release ();
+  return NULL;
+}
+
+/* Core of the path splitting transformation for the conditional block
+   BB.  The join block that is also the source of the loop latch edge is
+   looked up together with its two predecessors.  If the feasibility
+   tests pass, the statements of the join block are moved into a new
+   block on the edge leaving the first predecessor, and the statements
+   of a duplicate of the join block into a new block on the edge leaving
+   the second predecessor.  The phi result of the join block is then
+   replaced by the matching phi argument in each copy.
+
+   CFG cleanup is expected to merge the new blocks with the
+   predecessors, and update-ssa to fix up the SSA form afterwards; both
+   are requested through the todo flags of the pass.  */
+
+static void
+perform_path_splitting (basic_block bb)
+{
+  basic_block pred1 = NULL;
+  basic_block pred2 = NULL;
+  basic_block join_block
+    = get_join_blk_same_as_loop_latch (bb, pred1, pred2);
+
+  if (!join_block
+      || !is_feasible_path_splitting (join_block, pred1, pred2))
+    return;
+
+  basic_block new_bb1 = NULL;
+  basic_block new_bb2 = NULL;
+  edge_iterator ei;
+  edge e;
+
+  /* The duplicate supplies the statements for the second path.  */
+  basic_block join_copy = duplicate_block (join_block, NULL, NULL);
+
+  FOR_EACH_EDGE (e, ei, pred1->succs)
+    new_bb1 = split_edge (e);
+
+  FOR_EACH_EDGE (e, ei, pred2->succs)
+    new_bb2 = split_edge (e);
+
+  /* Move the original statements onto the first path and the copied
+     statements onto the second path.  */
+  gimple_stmt_iterator where = gsi_start_bb (new_bb1);
+  gsi_insert_seq_after (&where, bb_seq (join_block), GSI_NEW_STMT);
+  where = gsi_start_bb (new_bb2);
+  gsi_insert_seq_after (&where, bb_seq (join_copy), GSI_NEW_STMT);
+  update_stmt_bb (new_bb1);
+  update_stmt_bb (new_bb2);
+
+  replace_uses_phi (join_block, new_bb2);
+
+  /* The sequences now belong to the new blocks; detach them before the
+     duplicate is deleted.  */
+  set_bb_seq (join_block, NULL);
+  set_bb_seq (join_copy, NULL);
+  delete_basic_block (join_copy);
+}
+
+/* Main entry point of the pass.  Set up loop, original/copy table and
+   dominance information, mark the DFS back edges, and run
+   perform_path_splitting on every basic block of the current function
+   that ends in a GIMPLE_COND whose lhs is an SSA name of integral or
+   pointer type and whose rhs is an SSA name or an invariant.  Always
+   returns 0.  */
+
+static unsigned int
+execute_path_split (void)
+{
+  basic_block bb;
+
+  loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS);
+  initialize_original_copy_tables ();
+
+  calculate_dominance_info (CDI_DOMINATORS);
+  calculate_dominance_info (CDI_POST_DOMINATORS);
+
+  mark_dfs_back_edges ();
+
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      /* Only blocks ending in a condition are of interest.  */
+      gimple last = gsi_stmt (gsi_last_bb (bb));
+
+      if (!last || gimple_code (last) != GIMPLE_COND)
+        continue;
+
+      /* The rhs has the same type as the lhs, so only the type of the
+         lhs is checked.  */
+      tree lhs = gimple_cond_lhs (last);
+      tree rhs = gimple_cond_rhs (last);
+
+      if (TREE_CODE (lhs) != SSA_NAME)
+        continue;
+
+      if (!INTEGRAL_TYPE_P (TREE_TYPE (lhs))
+          && !POINTER_TYPE_P (TREE_TYPE (lhs)))
+        continue;
+
+      if (TREE_CODE (rhs) != SSA_NAME && !is_gimple_min_invariant (rhs))
+        continue;
+
+      perform_path_splitting (bb);
+    }
+
+  loop_optimizer_finalize ();
+  free_original_copy_tables ();
+  free_dominance_info (CDI_DOMINATORS);
+  free_dominance_info (CDI_POST_DOMINATORS);
+  return 0;
+}
+
+namespace {
+
+/* Static description of the path splitting pass.  */
+
+const pass_data pass_data_path_split =
+{
+  GIMPLE_PASS, /* type */
+  "path_split", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_TREE_PATH_SPLIT, /* tv_id */
+  PROP_ssa, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_cleanup_cfg | TODO_update_ssa, /* todo_flags_finish */
+};
+
+/* The path splitting pass.  It is gated on flag_tree_path_split and
+   runs execute_path_split.  */
+
+class pass_path_split : public gimple_opt_pass
+{
+public:
+  pass_path_split (gcc::context *ctxt)
+    : gimple_opt_pass (pass_data_path_split, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  opt_pass *
+  clone ()
+  {
+    return new pass_path_split (m_ctxt);
+  }
+
+  virtual bool
+  gate (function *)
+  {
+    return flag_tree_path_split != 0;
+  }
+
+  virtual unsigned int
+  execute (function *)
+  {
+    return execute_path_split ();
+  }
+
+}; // class pass_path_split
+
+} // anon namespace
+
+/* Create a new instance of the path splitting pass for context CTXT.  */
+
+gimple_opt_pass *
+make_pass_path_split (gcc::context *ctxt)
+{
+  gimple_opt_pass *pass = new pass_path_split (ctxt);
+  return pass;
+}
-- 
1.8.2.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-06-30  8:34 [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation Ajit Kumar Agarwal
@ 2015-06-30 10:38 ` Bernhard Reutner-Fischer
  2015-06-30 10:43   ` Ajit Kumar Agarwal
  2015-06-30 11:39 ` Richard Biener
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 72+ messages in thread
From: Bernhard Reutner-Fischer @ 2015-06-30 10:38 UTC (permalink / raw)
  To: Ajit Kumar Agarwal, law, GCC Patches
  Cc: Vinod Kathail, Shail Aditya Gupta, Vidhumouli Hunsigida, Nagaraju Mekala

On June 30, 2015 10:16:01 AM GMT+02:00, Ajit Kumar Agarwal <ajit.kumar.agarwal@xilinx.com> wrote:
>All:
>
>The below patch added a new path Splitting optimization pass on SSA
>representation. The Path Splitting optimization
>Pass moves the join block of if-then-else same as loop latch to its
>predecessors and get merged with the predecessors
>Preserving the SSA representation.
>
>The patch is tested for Microblaze and i386 target. The EEMBC/Mibench
>benchmarks is run with the Microblaze target
>And the performance gain of 9.15% and rgbcmy01_lite(EEMBC benchmarks).
>The Deja GNU tests is run for Mircroblaze 
>Target and no regression is seen for Microblaze target and the new
>testcase attached are passed.
>
>For i386 bootstrapping goes through fine and the Spec cpu2000
>benchmarks is run with this patch. Following observation
>were seen with spec cpu2000 benchmarks. 
>
>Ratio of path splitting change vs Ratio of not having path splitting
>change is 3653.353 vs 3652.14 for INT benchmarks.
>Ratio of path splitting change vs Ratio of not having path splitting
>change is  4353.812 vs 4345.351 for FP benchmarks.
>
>Based on comments from RFC patch following changes were done.
>
>1. Added a new pass for path splitting changes.
>2. Placed the new path  Splitting Optimization pass before the copy
>propagation pass.
>3. The join block same as the Loop latch is wired into its predecessors
>so that the CFG Cleanup pass will merge the blocks
>Wired together.
>4. Copy propagation routines added for path splitting changes is not
>needed as suggested by Jeff. They are removed in the patch as
>The copy propagation in the copied join blocks will be done by the
>existing copy propagation pass and the update ssa pass.
>5. Only the propagation of phi results of the join block with the phi
>argument is done which will not be done by the existing update_ssa
>Or copy propagation pass on tree ssa representation.
>6. Added 2 tests.
>    a) compilation check  tests.
>   b) execution tests.

The 2 tests seem to be identical, so why do you have both?
Also, please remove cleanup-tree-dump, this is now done automatically.

Thanks,

>7. Refactoring of the code for the feasibility check and finding the
>join block same as loop latch node.


^ permalink raw reply	[flat|nested] 72+ messages in thread

* RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-06-30 10:38 ` Bernhard Reutner-Fischer
@ 2015-06-30 10:43   ` Ajit Kumar Agarwal
  2015-06-30 10:51     ` Bernhard Reutner-Fischer
  0 siblings, 1 reply; 72+ messages in thread
From: Ajit Kumar Agarwal @ 2015-06-30 10:43 UTC (permalink / raw)
  To: Bernhard Reutner-Fischer, law, GCC Patches
  Cc: Vinod Kathail, Shail Aditya Gupta, Vidhumouli Hunsigida, Nagaraju Mekala



-----Original Message-----
From: Bernhard Reutner-Fischer [mailto:rep.dot.nop@gmail.com] 
Sent: Tuesday, June 30, 2015 3:57 PM
To: Ajit Kumar Agarwal; law@redhat.com; GCC Patches
Cc: Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation

On June 30, 2015 10:16:01 AM GMT+02:00, Ajit Kumar Agarwal <ajit.kumar.agarwal@xilinx.com> wrote:
>All:
>
>The below patch added a new path Splitting optimization pass on SSA 
>representation. The Path Splitting optimization Pass moves the join 
>block of if-then-else same as loop latch to its predecessors and get 
>merged with the predecessors Preserving the SSA representation.
>
>The patch is tested for Microblaze and i386 target. The EEMBC/Mibench 
>benchmarks is run with the Microblaze target And the performance gain 
>of 9.15% and rgbcmy01_lite(EEMBC benchmarks).
>The Deja GNU tests is run for Mircroblaze Target and no regression is 
>seen for Microblaze target and the new testcase attached are passed.
>
>For i386 bootstrapping goes through fine and the Spec cpu2000 
>benchmarks is run with this patch. Following observation were seen with 
>spec cpu2000 benchmarks.
>
>Ratio of path splitting change vs Ratio of not having path splitting 
>change is 3653.353 vs 3652.14 for INT benchmarks.
>Ratio of path splitting change vs Ratio of not having path splitting 
>change is  4353.812 vs 4345.351 for FP benchmarks.
>
>Based on comments from RFC patch following changes were done.
>
>1. Added a new pass for path splitting changes.
>2. Placed the new path  Splitting Optimization pass before the copy 
>propagation pass.
>3. The join block same as the Loop latch is wired into its predecessors 
>so that the CFG Cleanup pass will merge the blocks Wired together.
>4. Copy propagation routines added for path splitting changes is not 
>needed as suggested by Jeff. They are removed in the patch as The copy 
>propagation in the copied join blocks will be done by the existing copy 
>propagation pass and the update ssa pass.
>5. Only the propagation of phi results of the join block with the phi 
>argument is done which will not be done by the existing update_ssa Or 
>copy propagation pass on tree ssa representation.
>6. Added 2 tests.
>    a) compilation check  tests.
>   b) execution tests.

>>The 2 tests seem to be identical, so why do you have both?
>>Also, please remove cleanup-tree-dump, this is now done automatically.

The testcase path-split-1.c, which is in the top-level gcc.dg directory, is an execution test. The one
in gcc.dg/tree-ssa/path-split-2.c is a compile-only test, as its dg-do action is compile. For the
execution test path-split-1.c the action is compile and run.

Thanks & Regards
Ajit

Thanks,

>7. Refactoring of the code for the feasibility check and finding the 
>join block same as loop latch node.



^ permalink raw reply	[flat|nested] 72+ messages in thread

* RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-06-30 10:43   ` Ajit Kumar Agarwal
@ 2015-06-30 10:51     ` Bernhard Reutner-Fischer
  0 siblings, 0 replies; 72+ messages in thread
From: Bernhard Reutner-Fischer @ 2015-06-30 10:51 UTC (permalink / raw)
  To: Ajit Kumar Agarwal, law, GCC Patches
  Cc: Vinod Kathail, Shail Aditya Gupta, Vidhumouli Hunsigida, Nagaraju Mekala

On June 30, 2015 12:38:13 PM GMT+02:00, Ajit Kumar Agarwal <ajit.kumar.agarwal@xilinx.com> wrote:

>>6. Added 2 tests.
>>    a) compilation check  tests.
>>   b) execution tests.
>
>>>The 2 tests seem to be identical, so why do you have both?
>>>Also, please remove cleanup-tree-dump, this is now done
>automatically.
>
>The testcase path-split-1.c  is to check for execution which is present
>in gcc.dg top directory . The one
>present in the gcc.dg/tree-ssa/path-split-2.c is to check the
>compilation as the action item is compilation. For the
>execution tests path-split-1.c the action is compile and run.

One is a superset of the other, no?
Doesn't make sense to me, fwiw.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-06-30  8:34 [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation Ajit Kumar Agarwal
  2015-06-30 10:38 ` Bernhard Reutner-Fischer
@ 2015-06-30 11:39 ` Richard Biener
  2015-06-30 12:07   ` Ajit Kumar Agarwal
  2015-07-04 12:40   ` Ajit Kumar Agarwal
  2015-06-30 12:39 ` Ajit Kumar Agarwal
  2015-06-30 22:18 ` Joseph Myers
  3 siblings, 2 replies; 72+ messages in thread
From: Richard Biener @ 2015-06-30 11:39 UTC (permalink / raw)
  To: Ajit Kumar Agarwal
  Cc: law, GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

On Tue, Jun 30, 2015 at 10:16 AM, Ajit Kumar Agarwal
<ajit.kumar.agarwal@xilinx.com> wrote:
> All:
>
> The below patch added a new path Splitting optimization pass on SSA representation. The Path Splitting optimization
> Pass moves the join block of if-then-else same as loop latch to its predecessors and get merged with the predecessors
> Preserving the SSA representation.
>
> The patch is tested for Microblaze and i386 target. The EEMBC/Mibench benchmarks is run with the Microblaze target
> And the performance gain of 9.15% and rgbcmy01_lite(EEMBC benchmarks). The Deja GNU tests is run for Mircroblaze
> Target and no regression is seen for Microblaze target and the new testcase attached are passed.
>
> For i386 bootstrapping goes through fine and the Spec cpu2000 benchmarks is run with this patch. Following observation
> were seen with spec cpu2000 benchmarks.
>
> Ratio of path splitting change vs Ratio of not having path splitting change is 3653.353 vs 3652.14 for INT benchmarks.
> Ratio of path splitting change vs Ratio of not having path splitting change is  4353.812 vs 4345.351 for FP benchmarks.
>
> Based on comments from RFC patch following changes were done.
>
> 1. Added a new pass for path splitting changes.
> 2. Placed the new path  Splitting Optimization pass before the copy propagation pass.
> 3. The join block same as the Loop latch is wired into its predecessors so that the CFG Cleanup pass will merge the blocks
> Wired together.
> 4. Copy propagation routines added for path splitting changes is not needed as suggested by Jeff. They are removed in the patch as
> The copy propagation in the copied join blocks will be done by the existing copy propagation pass and the update ssa pass.
> 5. Only the propagation of phi results of the join block with the phi argument is done which will not be done by the existing update_ssa
> Or copy propagation pass on tree ssa representation.
> 6. Added 2 tests.
>     a) compilation check  tests.
>    b) execution tests.
> 7. Refactoring of the code for the feasibility check and finding the join block same as loop latch node.
>
>     [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation.
>
>     Added a new pass on path splitting on tree SSA representation. The path
>     splitting optimization does the CFG transformation of join block of the
>     if-then-else same as the loop latch node is moved and merged with the
>     predecessor blocks after preserving the SSA representation.
>
>     ChangeLog:
>     2015-06-30  Ajit Agarwal  <ajitkum@xilinx.com>
>
>         * gcc/Makefile.in: Add the build of the new file
>         tree-ssa-path-split.c
>         * gcc/common.opt: Add the new flag ftree-path-split.
>         * gcc/opts.c: Add an entry for Path splitting pass
>         with optimization flag greater and equal to O2.
>         * gcc/passes.def: Enable and add new pass path splitting.
>         * gcc/timevar.def: Add the new entry for TV_TREE_PATH_SPLIT.
>         * gcc/tree-pass.h: Extern Declaration of make_pass_path_split.
>         * gcc/tree-ssa-path-split.c: New file for path splitting pass.
>         * gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c: New testcase.
>         * gcc/testsuite/gcc.dg/path-split-1.c: New testcase.

I'm not 100% sure I understand the transform, but from what I see in the
testcases it tail-duplicates from a conditional up to
a loop latch block (not sure if it includes it and thus ends up
creating a loop nest or not).

An observation I have is that the pass should at least share the
transform stage to some extent with the existing
tracer pass (tracer.c) which essentially does the same but not
restricted to loops in any way.  So I wonder if
your pass could be simply another heuristic to compute paths to trace
in the existing tracer pass.

Thanks,
Richard.

>     Signed-off-by:Ajit Agarwal ajitkum@xilinx.com.
>
> gcc/Makefile.in                              |   1 +
>  gcc/common.opt                               |   4 +
>  gcc/opts.c                                   |   1 +
>  gcc/passes.def                               |   1 +
>  gcc/testsuite/gcc.dg/path-split-1.c          |  65 ++++
>  gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c |  62 ++++
>  gcc/timevar.def                              |   1 +
>  gcc/tree-pass.h                              |   1 +
>  gcc/tree-ssa-path-split.c                    | 462 +++++++++++++++++++++++++++
>  9 files changed, 598 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.dg/path-split-1.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
>  create mode 100644 gcc/tree-ssa-path-split.c
>
> diff --git a/gcc/Makefile.in b/gcc/Makefile.in
> index 5f9261f..35ac363 100644
> --- a/gcc/Makefile.in
> +++ b/gcc/Makefile.in
> @@ -1476,6 +1476,7 @@ OBJS = \
>         tree-vect-slp.o \
>         tree-vectorizer.o \
>         tree-vrp.o \
> +        tree-ssa-path-split.o \
>         tree.o \
>         valtrack.o \
>         value-prof.o \
> diff --git a/gcc/common.opt b/gcc/common.opt
> index e104269..c63b100 100644
> --- a/gcc/common.opt
> +++ b/gcc/common.opt
> @@ -2328,6 +2328,10 @@ ftree-vrp
>  Common Report Var(flag_tree_vrp) Init(0) Optimization
>  Perform Value Range Propagation on trees
>
> +ftree-path-split
> +Common Report Var(flag_tree_path_split) Init(0) Optimization
> +Perform Path Splitting
> +
>  funit-at-a-time
>  Common Report Var(flag_unit_at_a_time) Init(1) Optimization
>  Compile whole compilation unit at a time
> diff --git a/gcc/opts.c b/gcc/opts.c
> index 8a16116..31947ff 100644
> --- a/gcc/opts.c
> +++ b/gcc/opts.c
> @@ -508,6 +508,7 @@ static const struct default_options default_options_table[] =
>      { OPT_LEVELS_2_PLUS, OPT_fisolate_erroneous_paths_dereference, NULL, 1 },
>      { OPT_LEVELS_2_PLUS, OPT_fipa_ra, NULL, 1 },
>      { OPT_LEVELS_2_PLUS, OPT_flra_remat, NULL, 1 },
> +    { OPT_LEVELS_2_PLUS, OPT_ftree_path_split, NULL, 1 },
>
>      /* -O3 optimizations.  */
>      { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 },
> diff --git a/gcc/passes.def b/gcc/passes.def
> index c0ddee4..43618eb 100644
> --- a/gcc/passes.def
> +++ b/gcc/passes.def
> @@ -155,6 +155,7 @@ along with GCC; see the file COPYING3.  If not see
>        NEXT_PASS (pass_ccp);
>        /* After CCP we rewrite no longer addressed locals into SSA
>          form if possible.  */
> +      NEXT_PASS (pass_path_split);
>        NEXT_PASS (pass_copy_prop);
>        NEXT_PASS (pass_complete_unrolli);
>        NEXT_PASS (pass_phiprop);
> diff --git a/gcc/testsuite/gcc.dg/path-split-1.c b/gcc/testsuite/gcc.dg/path-split-1.c
> new file mode 100644
> index 0000000..075dc87
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/path-split-1.c
> @@ -0,0 +1,65 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 " } */
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +
> +#define RGBMAX 255
> +
> +int
> +test()
> +{
> +  int i, Pels;
> +  unsigned char sum = 0;
> +  unsigned char xr, xg, xb;
> +  unsigned char xc, xm, xy, xk;
> +  unsigned char *ReadPtr, *EritePtr;
> +
> +  ReadPtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
> +  EritePtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
> +
> +  for (i = 0; i < 100;i++)
> +     {
> +       ReadPtr[i] = 100 - i;
> +     }
> +
> +  for (i = 0; i < 100; i++)
> +     {
> +       xr = *ReadPtr++;
> +       xg = *ReadPtr++;
> +       xb = *ReadPtr++;
> +
> +       xc = (unsigned char) (RGBMAX - xr);
> +       xm = (unsigned char) (RGBMAX - xg);
> +       xy = (unsigned char) (RGBMAX - xb);
> +
> +       if (xc < xm)
> +         {
> +           xk = (unsigned char) (xc < xy ? xc : xy);
> +         }
> +       else
> +        {
> +          xk = (unsigned char) (xm < xy ? xm : xy);
> +        }
> +
> +       xc = (unsigned char) (xc - xk);
> +       xm = (unsigned char) (xm - xk);
> +       xy = (unsigned char) (xy - xk);
> +
> +       *EritePtr++ = xc;
> +       *EritePtr++ = xm;
> +       *EritePtr++ = xy;
> +       *EritePtr++ = xk;
> +       sum += *EritePtr;
> +    }
> +  return sum;
> +}
> +
> +int
> +main()
> +{
> +  if (test() != 33)
> +    abort();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c b/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
> new file mode 100644
> index 0000000..19f277c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
> @@ -0,0 +1,62 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-path_split" } */
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +
> +#define RGBMAX 255
> +
> +int
> +test()
> +{
> +  int i, Pels;
> +  unsigned char sum = 0;
> +  unsigned char xr, xg, xb;
> +  unsigned char xc, xm, xy, xk;
> +  unsigned char *ReadPtr, *EritePtr;
> +
> +  ReadPtr = (unsigned char *) malloc (sizeof (unsigned char) * 100);
> +  EritePtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
> +
> +  for (i = 0; i < 100;i++)
> +     {
> +       ReadPtr[i] = 100 - i;
> +     }
> +
> +  for (i = 0; i < 100; i++)
> +     {
> +       xr = *ReadPtr++;
> +       xg = *ReadPtr++;
> +       xb = *ReadPtr++;
> +
> +       xc = ( unsigned char) (RGBMAX - xr);
> +       xm = ( unsigned char) (RGBMAX - xg);
> +       xy = ( unsigned char) (RGBMAX - xb);
> +
> +       if (xc < xm)
> +         {
> +           xk = ( unsigned char) (xc < xy ? xc : xy);
> +         }
> +       else
> +         {
> +           xk = ( unsigned char) (xm < xy ? xm : xy);
> +         }
> +
> +       xc = (unsigned char) (xc - xk);
> +       xm = (unsigned char) (xm - xk);
> +       xy = (unsigned char) (xy - xk);
> +
> +       *EritePtr++ = xc;
> +       *EritePtr++ = xm;
> +       *EritePtr++ = xy;
> +       *EritePtr++ = xk;
> +       sum += *EritePtr;
> +    }
> +  return sum;
> +}
> +
> +/* { dg-final { scan-tree-dump "xc_[0-9][0-9]* -> { xc_[0-9][0-9]* }" "path_split"} } */
> +/* { dg-final { scan-tree-dump "xm_[0-9][0-9]* -> { xm_[0-9][0-9]* }" "path_split"} } */
> +/* { dg-final { scan-tree-dump "xy_[0-9][0-9]* -> { xy_[0-9][0-9]* }" "path_split"} } */
> +/* { dg-final { scan-tree-dump "Merging blocks" "path_split"} } */
> +/* { dg-final { cleanup-tree-dump "path_split" } } */
> diff --git a/gcc/timevar.def b/gcc/timevar.def
> index 711bbed..6217a8e 100644
> --- a/gcc/timevar.def
> +++ b/gcc/timevar.def
> @@ -288,3 +288,4 @@ DEFTIMEVAR (TV_JIT_REPLAY        , "replay of JIT client activity")
>  DEFTIMEVAR (TV_ASSEMBLE             , "assemble JIT code")
>  DEFTIMEVAR (TV_LINK                 , "link JIT code")
>  DEFTIMEVAR (TV_LOAD                 , "load JIT result")
> +DEFTIMEVAR (TV_TREE_PATH_SPLIT  , "tree path_split")
> diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
> index 398ab83..e00639e 100644
> --- a/gcc/tree-pass.h
> +++ b/gcc/tree-pass.h
> @@ -379,6 +379,7 @@ extern gimple_opt_pass *make_pass_iv_optimize (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_tree_loop_done (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_ch (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_ccp (gcc::context *ctxt);
> +extern gimple_opt_pass *make_pass_path_split (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_phi_only_cprop (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_build_ssa (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_build_alias (gcc::context *ctxt);
> diff --git a/gcc/tree-ssa-path-split.c b/gcc/tree-ssa-path-split.c
> new file mode 100644
> index 0000000..3da7791
> --- /dev/null
> +++ b/gcc/tree-ssa-path-split.c
> @@ -0,0 +1,462 @@
> +/* Support routines for Path Splitting.
> +   Copyright (C) 2015 Free Software Foundation, Inc.
> +   Contributed by Ajit Kumar Agarwal <ajitkum@xilinx.com>.
> +
> + This file is part of GCC.
> +
> + GCC is free software; you can redistribute it and/or modify
> + it under the terms of the GNU General Public License as published by
> + the Free Software Foundation; either version 3, or (at your option)
> + any later version.
> +
> +GCC is distributed in the hope that it will be useful,
> +but WITHOUT ANY WARRANTY; without even the implied warranty of
> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +GNU General Public License for more details.
> +
> +You should have received a copy of the GNU General Public License
> +along with GCC; see the file COPYING3.  If not see
> +<http://www.gnu.org/licenses/>.  */
> +
> +#include "config.h"
> +#include "system.h"
> +#include "coretypes.h"
> +#include "tm.h"
> +#include "flags.h"
> +#include "tree.h"
> +#include "stor-layout.h"
> +#include "calls.h"
> +#include "predict.h"
> +#include "vec.h"
> +#include "hashtab.h"
> +#include "hash-set.h"
> +#include "machmode.h"
> +#include "hard-reg-set.h"
> +#include "input.h"
> +#include "function.h"
> +#include "dominance.h"
> +#include "cfg.h"
> +#include "cfganal.h"
> +#include "basic-block.h"
> +#include "tree-ssa-alias.h"
> +#include "internal-fn.h"
> +#include "gimple-fold.h"
> +#include "tree-eh.h"
> +#include "gimple-expr.h"
> +#include "is-a.h"
> +#include "gimple.h"
> +#include "gimple-iterator.h"
> +#include "gimple-walk.h"
> +#include "gimple-ssa.h"
> +#include "tree-cfg.h"
> +#include "tree-phinodes.h"
> +#include "ssa-iterators.h"
> +#include "stringpool.h"
> +#include "tree-ssanames.h"
> +#include "tree-ssa-loop-manip.h"
> +#include "tree-ssa-loop-niter.h"
> +#include "tree-ssa-loop.h"
> +#include "tree-into-ssa.h"
> +#include "tree-ssa.h"
> +#include "tree-pass.h"
> +#include "tree-dump.h"
> +#include "gimple-pretty-print.h"
> +#include "diagnostic-core.h"
> +#include "intl.h"
> +#include "cfgloop.h"
> +#include "tree-scalar-evolution.h"
> +#include "tree-ssa-propagate.h"
> +#include "tree-chrec.h"
> +#include "tree-ssa-threadupdate.h"
> +#include "expr.h"
> +#include "insn-codes.h"
> +#include "optabs.h"
> +#include "tree-ssa-threadedge.h"
> +#include "wide-int.h"
> +
> +/* Replace_uses_phi function propagates the phi results with the
> +   first phi argument into each of the copied join blocks wired into
> +   its predecessors. This function is called from the replace_uses_phi
> +   to replace the uses of first phi arguments with the second
> +   phi arguments in the next copy of join block.  */
> +
> +static void
> +replace_use_phi_operand1_with_operand2 (basic_block b,
> +                                        tree use1,
> +                                        tree use2)
> +{
> +  use_operand_p use;
> +  ssa_op_iter iter;
> +  gimple_stmt_iterator gsi;
> +
> +  for (gsi = gsi_start_bb (b); !gsi_end_p (gsi);)
> +     {
> +       gimple stmt = gsi_stmt (gsi);
> +       FOR_EACH_SSA_USE_OPERAND (use, stmt, iter, SSA_OP_USE)
> +       {
> +         tree tuse = USE_FROM_PTR (use);
> +          if (use1 == tuse || use1 == NULL_TREE)
> +            {
> +              propagate_value (use, use2);
> +              update_stmt(stmt);
> +            }
> +        }
> +       gsi_next(&gsi);
> +     }
> +}
> +
> +/* This function propagates the phi result into the use points with
> +   the phi arguments. The join block is copied and wired into the
> +   predecessors. Since the use points of the phi results will be same
> +   in the each of the copy join blocks in the  predecessors, it
> +   propagates the phi arguments in the copy of the join blocks wired
> +   into its predecessor.  */
> +
> +static
> +void replace_uses_phi (basic_block b, basic_block temp_bb)
> +{
> +  gimple_seq phis = phi_nodes (b);
> +  gimple phi = gimple_seq_first_stmt (phis);
> +  tree def = gimple_phi_result (phi), use = gimple_phi_arg_def (phi,0);
> +  tree use2 = gimple_phi_arg_def (phi,1);
> +
> +  if (virtual_operand_p (def))
> +    {
> +      imm_use_iterator iter;
> +      use_operand_p use_p;
> +      gimple stmt;
> +
> +      FOR_EACH_IMM_USE_STMT (stmt, iter, def)
> +        FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
> +          SET_USE (use_p, use);
> +      if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (def))
> +        SSA_NAME_OCCURS_IN_ABNORMAL_PHI (use) = 1;
> +    }
> +   else
> +     replace_uses_by (def, use);
> +   replace_use_phi_operand1_with_operand2 (temp_bb, use, use2);
> +}
> +
> +/* Returns true if the block bb has label or call statements.
> +   Otherwise return false.  */
> +
> +static bool
> +is_block_has_label_call (basic_block bb)
> +{
> +  gimple_stmt_iterator gsi;
> +
> +  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> +     {
> +       gimple stmt = gsi_stmt(gsi);
> +       if (dyn_cast <glabel *> (stmt))
> +         {
> +           return true;
> +         }
> +       if (is_gimple_call (stmt))
> +         return true;
> +     }
> +  return false;
> +}
> +
> +/* This function performs the feasibility tests for path splitting
> +   to perform. Return false if the feasibility for path splitting
> +   is not done and returns true if the feasibility for path splitting
> +   is done. Following feasibility tests are performed.
> +
> +   1. Return false if the join block has call gimple statements.
> +   2. Return false if the join block has rhs casting for assign
> +      gimple statements.
> +   3. If the number of phis is greater than 1 or the phi node in
> +      the join block has virtual operand return false.
> +   4. Return false if the number of sequential statements is
> +      greater than 2.
> +   5. If the predecessors blocks has labels and call statements
> +      return false.
> +   6. If the phi result in the phi node of the join block is not
> +      used inside the same join block return false.
> +   7. Otherwise returns true.  */
> +
> +static bool
> +is_feasible_path_splitting (basic_block join_node, basic_block pred1,
> +                           basic_block pred2)
> +{
> +  int num_stmt = 0, num_phis = 0;
> +  gimple_stmt_iterator psi, gsi;
> +
> +  for (gsi = gsi_start_bb (join_node); !gsi_end_p (gsi); gsi_next (&gsi))
> +     {
> +       gimple stmt = gsi_stmt(gsi);
> +
> +       if (gimple_assign_cast_p (stmt))
> +         return false;
> +
> +       if (is_gimple_call (stmt))
> +         return false;
> +
> +       if (!is_gimple_debug(stmt))
> +         {
> +           num_stmt++;
> +         }
> +     }
> +
> +   if (pred1 && pred2 && (num_stmt > 2))
> +     {
> +       bool found_virtual_result = false;
> +
> +       for (psi = gsi_start_phis (join_node); !gsi_end_p (psi); )
> +          {
> +            use_operand_p use_p;
> +            imm_use_iterator iter;
> +            gimple stmt = gsi_stmt(psi);
> +
> +            if (!virtual_operand_p (gimple_phi_result (stmt)))
> +              num_phis++;
> +            else
> +              found_virtual_result = true;
> +
> +            FOR_EACH_IMM_USE_FAST (use_p, iter, gimple_phi_result (stmt))
> +            {
> +              gimple use_stmt = USE_STMT (use_p);
> +
> +              if (gimple_bb (use_stmt) != join_node)
> +                return false;
> +            }
> +
> +            gsi_next(&psi);
> +         }
> +
> +       if ((num_phis >1) || found_virtual_result)
> +          return false;
> +
> +       if(is_block_has_label_call(pred1) || is_block_has_label_call(pred2))
> +         return false;
> +
> +       return true;
> +    }
> +  return false;
> +}
> +
> +/* Update the statements in the basic block with the
> +   basic block.  */
> +
> +static void
> +update_stmt_bb(basic_block b)
> +{
> +  gimple_stmt_iterator gsi;
> +  for(gsi = gsi_start_bb(b); !gsi_end_p(gsi); gsi_next(&gsi))
> +   {
> +     gimple stmt = gsi_stmt(gsi);
> +     gimple_set_bb(stmt,b);
> +   }
> +}
> +
> +/* This function gets the join blocks same as the source
> +   node of the loop latch nodes and the predecessors of
> +   the join block is updated in the pred1 and pred2 passed
> +   as the reference arguments into the function. Return
> +   the join block.  */
> +
> +static basic_block
> +get_join_blk_same_as_loop_latch (basic_block bb,
> +                                 basic_block &pred1,
> +                                 basic_block &pred2)
> +{
> +  vec<basic_block> bbs;
> +  basic_block bb1;
> +  unsigned int i;
> +  edge_iterator ei;
> +  edge e1;
> +  bool found = false ,found1;
> +  bbs = get_all_dominated_blocks (CDI_DOMINATORS,
> +                                  bb );
> +  FOR_EACH_VEC_ELT (bbs, i, bb1)
> +  {
> +    found1 = false;
> +    FOR_EACH_EDGE (e1, ei, bb->succs)
> +    {
> +      if ( bb1 == e1->dest)
> +        {
> +          found = true;
> +          found1 = true;
> +        }
> +    }
> +    if (!found1 && found)
> +      {
> +        found = false;
> +        FOR_EACH_EDGE (e1, ei, bb1->succs)
> +        {
> +          if (e1->flags & (EDGE_DFS_BACK))
> +            found = true;
> +        }
> +
> +        if (found && EDGE_COUNT(bb1->preds) == 2)
> +          {
> +            unsigned int k = 0;
> +            FOR_EACH_EDGE (e1, ei, bb1->preds)
> +            {
> +              if ((e1->flags & (EDGE_DFS_BACK)))
> +                continue;
> +
> +              if ( k == 1)
> +                {
> +                  if (single_succ_p(e1->src) &&
> +                      single_succ_edge (e1->src)->flags & EDGE_FALLTHRU)
> +                    {
> +                      pred2 = e1->src;
> +                    }
> +                }
> +                else
> +                  {
> +                    if (single_succ_p(e1->src) &&
> +                        single_succ_edge (e1->src)->flags & EDGE_FALLTHRU)
> +                      {
> +                        pred1 = e1->src;
> +                      }
> +                  }
> +                k++;
> +            }
> +            bbs.release();
> +            return bb1;
> +          }
> +       }
> +   }
> +   bbs.release();
> +   return NULL;
> +}
> +
> +/* This is the core function to perform path splitting. The join
> +   same as the source of the loop latch node is identified along
> +   with their predecessors. Based on the feasibility tests for
> +   path splitting the path splitting is performed by wiring the
> +   copy of join blocks into the predecessors and propagating the phi
> +   result with the corresponding phi arguments into each of the copy
> +   of join blocks wired with the original predecessors of the join
> +   block.
> +
> +   The  tree-cfg-cleanup will merge the blocks in the predecessors
> +   path and the update-ssa will update the ssa representation after
> +   the path splitting is performed.  */
> +
> +static void
> +perform_path_splitting (basic_block bb)
> +{
> +  basic_block pred1 = NULL, pred2 = NULL, join_block = NULL;
> +
> +  join_block = get_join_blk_same_as_loop_latch (bb, pred1, pred2);
> +
> +  if (join_block  &&
> +      is_feasible_path_splitting (join_block, pred1, pred2))
> +    {
> +      basic_block new_bb1 = NULL, new_bb2 = NULL;
> +      gimple_stmt_iterator last;
> +      basic_block temp_bb = NULL;
> +      edge_iterator ei;
> +      edge e1;
> +
> +      temp_bb = duplicate_block (join_block, NULL, NULL);
> +
> +      FOR_EACH_EDGE (e1, ei, pred1->succs)
> +        new_bb1 = split_edge (e1);
> +
> +      FOR_EACH_EDGE (e1, ei, pred2->succs)
> +        new_bb2 = split_edge (e1);
> +
> +      last = gsi_start_bb (new_bb1);
> +      gsi_insert_seq_after (&last, bb_seq (join_block), GSI_NEW_STMT);
> +      last = gsi_start_bb (new_bb2);
> +      gsi_insert_seq_after (&last, bb_seq (temp_bb), GSI_NEW_STMT);
> +      update_stmt_bb (new_bb1);
> +      update_stmt_bb (new_bb2);
> +
> +      replace_uses_phi (join_block, new_bb2);
> +
> +      set_bb_seq (join_block, NULL);
> +      set_bb_seq(temp_bb,NULL);
> +      delete_basic_block (temp_bb);
> +      return;
> +    }
> +}
> +
> +static unsigned int
> +execute_path_split (void)
> +{
> +  basic_block bb;
> +
> +  loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS);
> +  initialize_original_copy_tables();
> +
> +  calculate_dominance_info (CDI_DOMINATORS);
> +  calculate_dominance_info (CDI_POST_DOMINATORS);
> +
> +  mark_dfs_back_edges ();
> +
> +  FOR_EACH_BB_FN (bb, cfun)
> +  {
> +    gimple last;
> +
> +    /* We only care about blocks ending in a COND_EXPR. */
> +
> +    last = gsi_stmt (gsi_last_bb (bb));
> +
> +    /* We're basically looking for a switch or any kind of conditional with
> +       integral or pointer type arguments.  Note the type of the second
> +       argument will be the same as the first argument, so no need to
> +       check it explicitly.  */
> +    if ((last && (gimple_code (last) == GIMPLE_COND
> +            && TREE_CODE (gimple_cond_lhs (last)) == SSA_NAME
> +            && (INTEGRAL_TYPE_P (TREE_TYPE (gimple_cond_lhs (last)))
> +            || POINTER_TYPE_P (TREE_TYPE (gimple_cond_lhs (last))))
> +            && (TREE_CODE (gimple_cond_rhs (last)) == SSA_NAME
> +            || is_gimple_min_invariant (gimple_cond_rhs (last))))))
> +      {
> +
> +         if (gimple_code(last) == GIMPLE_COND)
> +           {
> +              perform_path_splitting (bb);
> +           }
> +      }
> +   }
> +
> +   loop_optimizer_finalize ();
> +   free_original_copy_tables ();
> +   free_dominance_info (CDI_DOMINATORS);
> +   free_dominance_info (CDI_POST_DOMINATORS);
> +   return 0;
> +}
> +
> +namespace {
> +
> +const pass_data pass_data_path_split =
> +{
> +   GIMPLE_PASS, /* type */
> +   "path_split", /* name */
> +    OPTGROUP_NONE, /* optinfo_flags */
> +    TV_TREE_PATH_SPLIT, /* tv_id */
> +    PROP_ssa, /* properties_required */
> +    0, /* properties_provided */
> +    0, /* properties_destroyed */
> +    0, /* todo_flags_start */
> +    ( TODO_cleanup_cfg | TODO_update_ssa ), /* todo_flags_finish */
> +};
> +
> +class pass_path_split : public gimple_opt_pass
> +{
> +   public:
> +    pass_path_split (gcc::context *ctxt)
> +      : gimple_opt_pass (pass_data_path_split, ctxt)
> +    {}
> +
> +   /* opt_pass methods: */
> +   opt_pass * clone () { return new pass_path_split (m_ctxt); }
> +   virtual bool gate (function *) { return flag_tree_path_split != 0; }
> +   virtual unsigned int execute (function *) { return execute_path_split (); }
> +
> +}; // class pass_path_split
> +
> +} // anon namespace
> +
> +gimple_opt_pass *
> +make_pass_path_split (gcc::context *ctxt)
> +{
> +  return new pass_path_split (ctxt);
> +}
> --
> 1.8.2.1
>
> Thanks & Regards
> Ajit

^ permalink raw reply	[flat|nested] 72+ messages in thread

* RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-06-30 11:39 ` Richard Biener
@ 2015-06-30 12:07   ` Ajit Kumar Agarwal
  2015-07-04 12:40   ` Ajit Kumar Agarwal
  1 sibling, 0 replies; 72+ messages in thread
From: Ajit Kumar Agarwal @ 2015-06-30 12:07 UTC (permalink / raw)
  To: Richard Biener
  Cc: law, GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala



-----Original Message-----
From: Richard Biener [mailto:richard.guenther@gmail.com] 
Sent: Tuesday, June 30, 2015 4:42 PM
To: Ajit Kumar Agarwal
Cc: law@redhat.com; GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation

On Tue, Jun 30, 2015 at 10:16 AM, Ajit Kumar Agarwal <ajit.kumar.agarwal@xilinx.com> wrote:
> All:
>
> The below patch added a new path Splitting optimization pass on SSA 
> representation. The Path Splitting optimization Pass moves the join 
> block of if-then-else same as loop latch to its predecessors and get merged with the predecessors Preserving the SSA representation.
>
> The patch is tested for Microblaze and i386 target. The EEMBC/Mibench 
> benchmarks is run with the Microblaze target And the performance gain 
> of 9.15% and rgbcmy01_lite(EEMBC benchmarks). The Deja GNU tests is run for Mircroblaze Target and no regression is seen for Microblaze target and the new testcase attached are passed.
>
> For i386 bootstrapping goes through fine and the Spec cpu2000 
> benchmarks is run with this patch. Following observation were seen with spec cpu2000 benchmarks.
>
> Ratio of path splitting change vs Ratio of not having path splitting change is 3653.353 vs 3652.14 for INT benchmarks.
> Ratio of path splitting change vs Ratio of not having path splitting change is  4353.812 vs 4345.351 for FP benchmarks.
>
> Based on comments from RFC patch following changes were done.
>
> 1. Added a new pass for path splitting changes.
> 2. Placed the new path  Splitting Optimization pass before the copy propagation pass.
> 3. The join block same as the Loop latch is wired into its 
> predecessors so that the CFG Cleanup pass will merge the blocks Wired together.
> 4. Copy propagation routines added for path splitting changes is not 
> needed as suggested by Jeff. They are removed in the patch as The copy propagation in the copied join blocks will be done by the existing copy propagation pass and the update ssa pass.
> 5. Only the propagation of phi results of the join block with the phi 
> argument is done which will not be done by the existing update_ssa Or copy propagation pass on tree ssa representation.
> 6. Added 2 tests.
>     a) compilation check  tests.
>    b) execution tests.
> 7. Refactoring of the code for the feasibility check and finding the join block same as loop latch node.
>
>     [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation.
>
>     Added a new pass on path splitting on tree SSA representation. The path
>     splitting optimization does the CFG transformation of join block of the
>     if-then-else same as the loop latch node is moved and merged with the
>     predecessor blocks after preserving the SSA representation.
>
>     ChangeLog:
>     2015-06-30  Ajit Agarwal  <ajitkum@xilinx.com>
>
>         * gcc/Makefile.in: Add the build of the new file
>         tree-ssa-path-split.c
>         * gcc/common.opt: Add the new flag ftree-path-split.
>         * gcc/opts.c: Add an entry for Path splitting pass
>         with optimization flag greater and equal to O2.
>         * gcc/passes.def: Enable and add new pass path splitting.
>         * gcc/timevar.def: Add the new entry for TV_TREE_PATH_SPLIT.
>         * gcc/tree-pass.h: Extern Declaration of make_pass_path_split.
>         * gcc/tree-ssa-path-split.c: New file for path splitting pass.
>         * gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c: New testcase.
>         * gcc/testsuite/gcc.dg/path-split-1.c: New testcase.

>>I'm not 100% sure I understand the transform, but from what I see in the testcases it tail-duplicates from a conditional up to a loop latch block (not sure if it includes it and thus ends up creating a loop nest or not).

The path splitting pass wires a duplicate of the loop latch block into both of its predecessor paths, provided the loop latch block
is the same as the join block. The CFG cleanup phase of the path splitting transformation then merges the blocks that are wired to the original
predecessors. This leaves the loop latch block as just a forwarding block for the predecessors: the sequential statements of the
loop latch block are set to NULL, leaving only the phi nodes, and the same loop semantics with respect to the loop latch edge are preserved.
The SSA updates are also preserved.

Thanks & Regards
Ajit 

>>An observation I have is that the pass should at least share the transform stage to some extent with the existing tracer pass (tracer.c), which essentially does the same but is not restricted to loops in any way.  So I wonder if your pass could be simply another heuristic to compute paths to trace in the existing tracer pass.


Thanks,
Richard.

>     Signed-off-by:Ajit Agarwal ajitkum@xilinx.com.
>
> gcc/Makefile.in                              |   1 +
>  gcc/common.opt                               |   4 +
>  gcc/opts.c                                   |   1 +
>  gcc/passes.def                               |   1 +
>  gcc/testsuite/gcc.dg/path-split-1.c          |  65 ++++
>  gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c |  62 ++++
>  gcc/timevar.def                              |   1 +
>  gcc/tree-pass.h                              |   1 +
>  gcc/tree-ssa-path-split.c                    | 462 +++++++++++++++++++++++++++
>  9 files changed, 598 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.dg/path-split-1.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
>  create mode 100644 gcc/tree-ssa-path-split.c
>
> diff --git a/gcc/Makefile.in b/gcc/Makefile.in index 5f9261f..35ac363 
> 100644
> --- a/gcc/Makefile.in
> +++ b/gcc/Makefile.in
> @@ -1476,6 +1476,7 @@ OBJS = \
>         tree-vect-slp.o \
>         tree-vectorizer.o \
>         tree-vrp.o \
> +        tree-ssa-path-split.o \
>         tree.o \
>         valtrack.o \
>         value-prof.o \
> diff --git a/gcc/common.opt b/gcc/common.opt index e104269..c63b100 
> 100644
> --- a/gcc/common.opt
> +++ b/gcc/common.opt
> @@ -2328,6 +2328,10 @@ ftree-vrp
>  Common Report Var(flag_tree_vrp) Init(0) Optimization  Perform Value 
> Range Propagation on trees
>
> +ftree-path-split
> +Common Report Var(flag_tree_path_split) Init(0) Optimization Perform 
> +Path Splitting
> +
>  funit-at-a-time
>  Common Report Var(flag_unit_at_a_time) Init(1) Optimization  Compile 
> whole compilation unit at a time diff --git a/gcc/opts.c b/gcc/opts.c 
> index 8a16116..31947ff 100644
> --- a/gcc/opts.c
> +++ b/gcc/opts.c
> @@ -508,6 +508,7 @@ static const struct default_options default_options_table[] =
>      { OPT_LEVELS_2_PLUS, OPT_fisolate_erroneous_paths_dereference, NULL, 1 },
>      { OPT_LEVELS_2_PLUS, OPT_fipa_ra, NULL, 1 },
>      { OPT_LEVELS_2_PLUS, OPT_flra_remat, NULL, 1 },
> +    { OPT_LEVELS_2_PLUS, OPT_ftree_path_split, NULL, 1 },
>
>      /* -O3 optimizations.  */
>      { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 
> }, diff --git a/gcc/passes.def b/gcc/passes.def index c0ddee4..43618eb 
> 100644
> --- a/gcc/passes.def
> +++ b/gcc/passes.def
> @@ -155,6 +155,7 @@ along with GCC; see the file COPYING3.  If not see
>        NEXT_PASS (pass_ccp);
>        /* After CCP we rewrite no longer addressed locals into SSA
>          form if possible.  */
> +      NEXT_PASS (pass_path_split);
>        NEXT_PASS (pass_copy_prop);
>        NEXT_PASS (pass_complete_unrolli);
>        NEXT_PASS (pass_phiprop);
> diff --git a/gcc/testsuite/gcc.dg/path-split-1.c 
> b/gcc/testsuite/gcc.dg/path-split-1.c
> new file mode 100644
> index 0000000..075dc87
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/path-split-1.c
> @@ -0,0 +1,65 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 " } */
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +
> +#define RGBMAX 255
> +
> +int
> +test()
> +{
> +  int i, Pels;
> +  unsigned char sum = 0;
> +  unsigned char xr, xg, xb;
> +  unsigned char xc, xm, xy, xk;
> +  unsigned char *ReadPtr, *EritePtr;
> +
> +  ReadPtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);  
> + EritePtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
> +
> +  for (i = 0; i < 100;i++)
> +     {
> +       ReadPtr[i] = 100 - i;
> +     }
> +
> +  for (i = 0; i < 100; i++)
> +     {
> +       xr = *ReadPtr++;
> +       xg = *ReadPtr++;
> +       xb = *ReadPtr++;
> +
> +       xc = (unsigned char) (RGBMAX - xr);
> +       xm = (unsigned char) (RGBMAX - xg);
> +       xy = (unsigned char) (RGBMAX - xb);
> +
> +       if (xc < xm)
> +         {
> +           xk = (unsigned char) (xc < xy ? xc : xy);
> +         }
> +       else
> +        {
> +          xk = (unsigned char) (xm < xy ? xm : xy);
> +        }
> +
> +       xc = (unsigned char) (xc - xk);
> +       xm = (unsigned char) (xm - xk);
> +       xy = (unsigned char) (xy - xk);
> +
> +       *EritePtr++ = xc;
> +       *EritePtr++ = xm;
> +       *EritePtr++ = xy;
> +       *EritePtr++ = xk;
> +       sum += *EritePtr;
> +    }
> +  return sum;
> +}
> +
> +int
> +main()
> +{
> +  if (test() != 33)
> +    abort();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
> new file mode 100644
> index 0000000..19f277c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
> @@ -0,0 +1,62 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-path_split" } */
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +
> +#define RGBMAX 255
> +
> +int
> +test()
> +{
> +  int i, Pels;
> +  unsigned char sum = 0;
> +  unsigned char xr, xg, xb;
> +  unsigned char xc, xm, xy, xk;
> +  unsigned char *ReadPtr, *EritePtr;
> +
> +  ReadPtr = (unsigned char *) malloc (sizeof (unsigned char) * 100);  
> + EritePtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
> +
> +  for (i = 0; i < 100;i++)
> +     {
> +       ReadPtr[i] = 100 - i;
> +     }
> +
> +  for (i = 0; i < 100; i++)
> +     {
> +       xr = *ReadPtr++;
> +       xg = *ReadPtr++;
> +       xb = *ReadPtr++;
> +
> +       xc = ( unsigned char) (RGBMAX - xr);
> +       xm = ( unsigned char) (RGBMAX - xg);
> +       xy = ( unsigned char) (RGBMAX - xb);
> +
> +       if (xc < xm)
> +         {
> +           xk = ( unsigned char) (xc < xy ? xc : xy);
> +         }
> +       else
> +         {
> +           xk = ( unsigned char) (xm < xy ? xm : xy);
> +         }
> +
> +       xc = (unsigned char) (xc - xk);
> +       xm = (unsigned char) (xm - xk);
> +       xy = (unsigned char) (xy - xk);
> +
> +       *EritePtr++ = xc;
> +       *EritePtr++ = xm;
> +       *EritePtr++ = xy;
> +       *EritePtr++ = xk;
> +       sum += *EritePtr;
> +    }
> +  return sum;
> +}
> +
> +/* { dg-final { scan-tree-dump "xc_[0-9][0-9]* -> { xc_[0-9][0-9]* }" 
> +"path_split"} } */
> +/* { dg-final { scan-tree-dump "xm_[0-9][0-9]* -> { xm_[0-9][0-9]* }" 
> +"path_split"} } */
> +/* { dg-final { scan-tree-dump "xy_[0-9][0-9]* -> { xy_[0-9][0-9]* }" 
> +"path_split"} } */
> +/* { dg-final { scan-tree-dump "Merging blocks" "path_split"} } */
> +/* { dg-final { cleanup-tree-dump "path_split" } } */
> diff --git a/gcc/timevar.def b/gcc/timevar.def index 711bbed..6217a8e 
> 100644
> --- a/gcc/timevar.def
> +++ b/gcc/timevar.def
> @@ -288,3 +288,4 @@ DEFTIMEVAR (TV_JIT_REPLAY        , "replay of JIT client activity")
>  DEFTIMEVAR (TV_ASSEMBLE             , "assemble JIT code")
>  DEFTIMEVAR (TV_LINK                 , "link JIT code")
>  DEFTIMEVAR (TV_LOAD                 , "load JIT result")
> +DEFTIMEVAR (TV_TREE_PATH_SPLIT  , "tree path_split")
> diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index 398ab83..e00639e 
> 100644
> --- a/gcc/tree-pass.h
> +++ b/gcc/tree-pass.h
> @@ -379,6 +379,7 @@ extern gimple_opt_pass *make_pass_iv_optimize 
> (gcc::context *ctxt);  extern gimple_opt_pass 
> *make_pass_tree_loop_done (gcc::context *ctxt);  extern 
> gimple_opt_pass *make_pass_ch (gcc::context *ctxt);  extern 
> gimple_opt_pass *make_pass_ccp (gcc::context *ctxt);
> +extern gimple_opt_pass *make_pass_path_split (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_phi_only_cprop (gcc::context 
> *ctxt);  extern gimple_opt_pass *make_pass_build_ssa (gcc::context 
> *ctxt);  extern gimple_opt_pass *make_pass_build_alias (gcc::context 
> *ctxt); diff --git a/gcc/tree-ssa-path-split.c 
> b/gcc/tree-ssa-path-split.c new file mode 100644 index 
> 0000000..3da7791
> --- /dev/null
> +++ b/gcc/tree-ssa-path-split.c
> @@ -0,0 +1,462 @@
> +/* Support routines for Path Splitting.
> +   Copyright (C) 2015 Free Software Foundation, Inc.
> +   Contributed by Ajit Kumar Agarwal <ajitkum@xilinx.com>.
> +
> + This file is part of GCC.
> +
> + GCC is free software; you can redistribute it and/or modify it under 
> + the terms of the GNU General Public License as published by the Free 
> + Software Foundation; either version 3, or (at your option) any later 
> + version.
> +
> +GCC is distributed in the hope that it will be useful, but WITHOUT 
> +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
> +FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
> +for more details.
> +
> +You should have received a copy of the GNU General Public License 
> +along with GCC; see the file COPYING3.  If not see 
> +<http://www.gnu.org/licenses/>.  */
> +
> +#include "config.h"
> +#include "system.h"
> +#include "coretypes.h"
> +#include "tm.h"
> +#include "flags.h"
> +#include "tree.h"
> +#include "stor-layout.h"
> +#include "calls.h"
> +#include "predict.h"
> +#include "vec.h"
> +#include "hashtab.h"
> +#include "hash-set.h"
> +#include "machmode.h"
> +#include "hard-reg-set.h"
> +#include "input.h"
> +#include "function.h"
> +#include "dominance.h"
> +#include "cfg.h"
> +#include "cfganal.h"
> +#include "basic-block.h"
> +#include "tree-ssa-alias.h"
> +#include "internal-fn.h"
> +#include "gimple-fold.h"
> +#include "tree-eh.h"
> +#include "gimple-expr.h"
> +#include "is-a.h"
> +#include "gimple.h"
> +#include "gimple-iterator.h"
> +#include "gimple-walk.h"
> +#include "gimple-ssa.h"
> +#include "tree-cfg.h"
> +#include "tree-phinodes.h"
> +#include "ssa-iterators.h"
> +#include "stringpool.h"
> +#include "tree-ssanames.h"
> +#include "tree-ssa-loop-manip.h"
> +#include "tree-ssa-loop-niter.h"
> +#include "tree-ssa-loop.h"
> +#include "tree-into-ssa.h"
> +#include "tree-ssa.h"
> +#include "tree-pass.h"
> +#include "tree-dump.h"
> +#include "gimple-pretty-print.h"
> +#include "diagnostic-core.h"
> +#include "intl.h"
> +#include "cfgloop.h"
> +#include "tree-scalar-evolution.h"
> +#include "tree-ssa-propagate.h"
> +#include "tree-chrec.h"
> +#include "tree-ssa-threadupdate.h"
> +#include "expr.h"
> +#include "insn-codes.h"
> +#include "optabs.h"
> +#include "tree-ssa-threadedge.h"
> +#include "wide-int.h"
> +
> +/* Replace_uses_phi function propagates the phi results with the
> +   first phi argument into each of the copied join blocks wired into
> +   its predecessors. This function is called from the replace_uses_phi
> +   to replace the uses of first phi arguments with the second
> +   phi arguments in the next copy of join block.  */
> +
> +static void
> +replace_use_phi_operand1_with_operand2 (basic_block b,
> +                                        tree use1,
> +                                        tree use2) {
> +  use_operand_p use;
> +  ssa_op_iter iter;
> +  gimple_stmt_iterator gsi;
> +
> +  for (gsi = gsi_start_bb (b); !gsi_end_p (gsi);)
> +     {
> +       gimple stmt = gsi_stmt (gsi);
> +       FOR_EACH_SSA_USE_OPERAND (use, stmt, iter, SSA_OP_USE)
> +       {
> +         tree tuse = USE_FROM_PTR (use);
> +          if (use1 == tuse || use1 == NULL_TREE)
> +            {
> +              propagate_value (use, use2);
> +              update_stmt(stmt);
> +            }
> +        }
> +       gsi_next(&gsi);
> +     }
> +}
> +
> +/* This function propagates the phi result into the use points with
> +   the phi arguments. The join block is copied and wired into the
> +   predecessors. Since the use points of the phi results will be same
> +   in the each of the copy join blocks in the  predecessors, it
> +   propagates the phi arguments in the copy of the join blocks wired
> +   into its predecessor.  */
> +
> +static
> +void replace_uses_phi (basic_block b, basic_block temp_bb) {
> +  gimple_seq phis = phi_nodes (b);
> +  gimple phi = gimple_seq_first_stmt (phis);
> +  tree def = gimple_phi_result (phi), use = gimple_phi_arg_def 
> +(phi,0);
> +  tree use2 = gimple_phi_arg_def (phi,1);
> +
> +  if (virtual_operand_p (def))
> +    {
> +      imm_use_iterator iter;
> +      use_operand_p use_p;
> +      gimple stmt;
> +
> +      FOR_EACH_IMM_USE_STMT (stmt, iter, def)
> +        FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
> +          SET_USE (use_p, use);
> +      if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (def))
> +        SSA_NAME_OCCURS_IN_ABNORMAL_PHI (use) = 1;
> +    }
> +   else
> +     replace_uses_by (def, use);
> +   replace_use_phi_operand1_with_operand2 (temp_bb, use, use2); }
> +
> +/* Returns true if the block bb has label or call statements.
> +   Otherwise return false.  */
> +
> +static bool
> +is_block_has_label_call (basic_block bb) {
> +  gimple_stmt_iterator gsi;
> +
> +  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> +     {
> +       gimple stmt = gsi_stmt(gsi);
> +       if (dyn_cast <glabel *> (stmt))
> +         {
> +           return true;
> +         }
> +       if (is_gimple_call (stmt))
> +         return true;
> +     }
> +  return false;
> +}
> +
> +/* Perform the feasibility tests for path splitting.  Return true if
> +   path splitting is feasible for JOIN_NODE with predecessors PRED1
> +   and PRED2, and false otherwise.  The following tests are performed.
> +
> +   1. Return false if the join block has call gimple statements.
> +   2. Return false if the join block has rhs casting for assign
> +      gimple statements.
> +   3. Return false if PRED1 or PRED2 is NULL, or if the join block
> +      does not have more than 2 non-debug statements.
> +   4. Return false if the join block has more than one non-virtual
> +      phi node, or has a phi node with a virtual result.
> +   5. Return false if either predecessor block has label or call
> +      statements.
> +   6. Return false if the phi result of a phi node in the join block
> +      is used outside the join block.
> +   7. Otherwise return true.  */
> +
> +static bool
> +is_feasible_path_splitting (basic_block join_node, basic_block pred1,
> +                           basic_block pred2) {
> +  int num_stmt = 0, num_phis = 0;
> +  gimple_stmt_iterator psi, gsi;
> +
> +  for (gsi = gsi_start_bb (join_node); !gsi_end_p (gsi); gsi_next (&gsi))
> +     {
> +       gimple stmt = gsi_stmt(gsi);
> +
> +       if (gimple_assign_cast_p (stmt))
> +         return false;
> +
> +       if (is_gimple_call (stmt))
> +         return false;
> +
> +       if (!is_gimple_debug(stmt))
> +         {
> +           num_stmt++;
> +         }
> +     }
> +
> +   if (pred1 && pred2 && (num_stmt > 2))
> +     {
> +       bool found_virtual_result = false;
> +
> +       for (psi = gsi_start_phis (join_node); !gsi_end_p (psi); )
> +          {
> +            use_operand_p use_p;
> +            imm_use_iterator iter;
> +            gimple stmt = gsi_stmt(psi);
> +
> +            if (!virtual_operand_p (gimple_phi_result (stmt)))
> +              num_phis++;
> +            else
> +              found_virtual_result = true;
> +
> +            FOR_EACH_IMM_USE_FAST (use_p, iter, gimple_phi_result (stmt))
> +            {
> +              gimple use_stmt = USE_STMT (use_p);
> +
> +              if (gimple_bb (use_stmt) != join_node)
> +                return false;
> +            }
> +
> +            gsi_next(&psi);
> +         }
> +
> +       if ((num_phis >1) || found_virtual_result)
> +          return false;
> +
> +       if(is_block_has_label_call(pred1) || is_block_has_label_call(pred2))
> +         return false;
> +
> +       return true;
> +    }
> +  return false;
> +}
> +
> +/* Set the containing basic block of every statement in block B
> +   to B.  */
> +
> +static void
> +update_stmt_bb(basic_block b)
> +{
> +  gimple_stmt_iterator gsi;
> +  for(gsi = gsi_start_bb(b); !gsi_end_p(gsi); gsi_next(&gsi))
> +   {
> +     gimple stmt = gsi_stmt(gsi);
> +     gimple_set_bb(stmt,b);
> +   }
> +}
> +
> +/* This function gets the join blocks same as the source
> +   node of the loop latch nodes and the predecessors of
> +   the join block is updated in the pred1 and pred2 passed
> +   as the reference arguments into the function. Return
> +   the join block.  */
> +
> +static basic_block
> +get_join_blk_same_as_loop_latch (basic_block bb,
> +                                 basic_block &pred1,
> +                                 basic_block &pred2) {
> +  vec<basic_block> bbs;
> +  basic_block bb1;
> +  unsigned int i;
> +  edge_iterator ei;
> +  edge e1;
> +  bool found = false ,found1;
> +  bbs = get_all_dominated_blocks (CDI_DOMINATORS,
> +                                  bb );
> +  FOR_EACH_VEC_ELT (bbs, i, bb1)
> +  {
> +    found1 = false;
> +    FOR_EACH_EDGE (e1, ei, bb->succs)
> +    {
> +      if ( bb1 == e1->dest)
> +        {
> +          found = true;
> +          found1 = true;
> +        }
> +    }
> +    if (!found1 && found)
> +      {
> +        found = false;
> +        FOR_EACH_EDGE (e1, ei, bb1->succs)
> +        {
> +          if (e1->flags & (EDGE_DFS_BACK))
> +            found = true;
> +        }
> +
> +        if (found && EDGE_COUNT(bb1->preds) == 2)
> +          {
> +            unsigned int k = 0;
> +            FOR_EACH_EDGE (e1, ei, bb1->preds)
> +            {
> +              if ((e1->flags & (EDGE_DFS_BACK)))
> +                continue;
> +
> +              if ( k == 1)
> +                {
> +                  if (single_succ_p(e1->src) &&
> +                      single_succ_edge (e1->src)->flags & EDGE_FALLTHRU)
> +                    {
> +                      pred2 = e1->src;
> +                    }
> +                }
> +                else
> +                  {
> +                    if (single_succ_p(e1->src) &&
> +                        single_succ_edge (e1->src)->flags & EDGE_FALLTHRU)
> +                      {
> +                        pred1 = e1->src;
> +                      }
> +                  }
> +                k++;
> +            }
> +            bbs.release();
> +            return bb1;
> +          }
> +       }
> +   }
> +   bbs.release();
> +   return NULL;
> +}
> +
> +/* This is the core function to perform path splitting. The join
> +   same as the source of the loop latch node is identified along
> +   with their predecessors. Based on the feasibility tests for
> +   path splitting the path splitting is performed by wiring the
> +   copy of join blocks into the predecessors and propagating the phi
> +   result with the corresponding phi arguments into each of the copy
> +   of join blocks wired with the original predecessors of the join
> +   block.
> +
> +   The  tree-cfg-cleanup will merge the blocks in the predecessors
> +   path and the update-ssa will update the ssa representation after
> +   the path splitting is performed.  */
> +
> +static void
> +perform_path_splitting (basic_block bb) {
> +  basic_block pred1 = NULL, pred2 = NULL, join_block = NULL;
> +
> +  join_block = get_join_blk_same_as_loop_latch (bb, pred1, pred2);
> +
> +  if (join_block  &&
> +      is_feasible_path_splitting (join_block, pred1, pred2))
> +    {
> +      basic_block new_bb1 = NULL, new_bb2 = NULL;
> +      gimple_stmt_iterator last;
> +      basic_block temp_bb = NULL;
> +      edge_iterator ei;
> +      edge e1;
> +
> +      temp_bb = duplicate_block (join_block, NULL, NULL);
> +
> +      FOR_EACH_EDGE (e1, ei, pred1->succs)
> +        new_bb1 = split_edge (e1);
> +
> +      FOR_EACH_EDGE (e1, ei, pred2->succs)
> +        new_bb2 = split_edge (e1);
> +
> +      last = gsi_start_bb (new_bb1);
> +      gsi_insert_seq_after (&last, bb_seq (join_block), GSI_NEW_STMT);
> +      last = gsi_start_bb (new_bb2);
> +      gsi_insert_seq_after (&last, bb_seq (temp_bb), GSI_NEW_STMT);
> +      update_stmt_bb (new_bb1);
> +      update_stmt_bb (new_bb2);
> +
> +      replace_uses_phi (join_block, new_bb2);
> +
> +      set_bb_seq (join_block, NULL);
> +      set_bb_seq(temp_bb,NULL);
> +      delete_basic_block (temp_bb);
> +      return;
> +    }
> +}
> +
> +static unsigned int
> +execute_path_split (void)
> +{
> +  basic_block bb;
> +
> +  loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS);  
> + initialize_original_copy_tables();
> +
> +  calculate_dominance_info (CDI_DOMINATORS);  
> + calculate_dominance_info (CDI_POST_DOMINATORS);
> +
> +  mark_dfs_back_edges ();
> +
> +  FOR_EACH_BB_FN (bb, cfun)
> +  {
> +    gimple last;
> +
> +    /* We only care about blocks ending in a COND_EXPR. */
> +
> +    last = gsi_stmt (gsi_last_bb (bb));
> +
> +    /* We're looking for a GIMPLE_COND whose first operand is an SSA
> +       name of integral or pointer type and whose second operand is an
> +       SSA name or an invariant.  Note the type of the second argument
> +       will be the same as the first argument, so no need to check it
> +       explicitly.  */
> +    if ((last && (gimple_code (last) == GIMPLE_COND
> +            && TREE_CODE (gimple_cond_lhs (last)) == SSA_NAME
> +            && (INTEGRAL_TYPE_P (TREE_TYPE (gimple_cond_lhs (last)))
> +            || POINTER_TYPE_P (TREE_TYPE (gimple_cond_lhs (last))))
> +            && (TREE_CODE (gimple_cond_rhs (last)) == SSA_NAME
> +            || is_gimple_min_invariant (gimple_cond_rhs (last))))))
> +      {
> +
> +         if (gimple_code(last) == GIMPLE_COND)
> +           {
> +              perform_path_splitting (bb);
> +           }
> +      }
> +   }
> +
> +   loop_optimizer_finalize ();
> +   free_original_copy_tables ();
> +   free_dominance_info (CDI_DOMINATORS);
> +   free_dominance_info (CDI_POST_DOMINATORS);
> +   return 0;
> +}
> +
> +namespace {
> +
> +const pass_data pass_data_path_split = {
> +   GIMPLE_PASS, /* type */
> +   "path_split", /* name */
> +    OPTGROUP_NONE, /* optinfo_flags */
> +    TV_TREE_PATH_SPLIT, /* tv_id */
> +    PROP_ssa, /* properties_required */
> +    0, /* properties_provided */
> +    0, /* properties_destroyed */
> +    0, /* todo_flags_start */
> +    ( TODO_cleanup_cfg | TODO_update_ssa ), /* todo_flags_finish */ 
> +};
> +
> +class pass_path_split : public gimple_opt_pass {
> +   public:
> +    pass_path_split (gcc::context *ctxt)
> +      : gimple_opt_pass (pass_data_path_split, ctxt)
> +    {}
> +
> +   /* opt_pass methods: */
> +   opt_pass * clone () { return new pass_path_split (m_ctxt); }
> +   virtual bool gate (function *) { return flag_tree_path_split != 0; }
> +   virtual unsigned int execute (function *) { return 
> + execute_path_split (); }
> +
> +}; // class pass_path_split
> +
> +} // anon namespace
> +
> +gimple_opt_pass *
> +make_pass_path_split (gcc::context *ctxt) {
> +  return new pass_path_split (ctxt);
> +}
> --
> 1.8.2.1
>
> Thanks & Regards
> Ajit

^ permalink raw reply	[flat|nested] 72+ messages in thread

* RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-06-30  8:34 [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation Ajit Kumar Agarwal
  2015-06-30 10:38 ` Bernhard Reutner-Fischer
  2015-06-30 11:39 ` Richard Biener
@ 2015-06-30 12:39 ` Ajit Kumar Agarwal
  2015-06-30 22:18 ` Joseph Myers
  3 siblings, 0 replies; 72+ messages in thread
From: Ajit Kumar Agarwal @ 2015-06-30 12:39 UTC (permalink / raw)
  To: law, GCC Patches
  Cc: Vinod Kathail, Shail Aditya Gupta, Vidhumouli Hunsigida, Nagaraju Mekala

I forgot to include the link to Jeff's comments on the RFC, for reference:

https://gcc.gnu.org/ml/gcc/2015-05/msg00302.html

Thanks & Regards
Ajit

-----Original Message-----
From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-owner@gcc.gnu.org] On Behalf Of Ajit Kumar Agarwal
Sent: Tuesday, June 30, 2015 1:46 PM
To: law@redhat.com; GCC Patches
Cc: Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
Subject: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation

All:

The patch below adds a new path splitting optimization pass on the tree SSA representation. For an if-then-else whose join block is also the source of the loop latch edge, the pass moves the join block into each of its predecessors, where it gets merged with them, while preserving the SSA representation.

The patch was tested on the Microblaze and i386 targets. The EEMBC/Mibench benchmarks were run on the Microblaze target, and a performance gain of 9.15% was seen on rgbcmy01_lite (EEMBC benchmarks). The DejaGnu tests were run for the Microblaze target with no regressions, and the newly attached testcases pass.

For i386, bootstrap goes through fine and the SPEC CPU2000 benchmarks were run with this patch. The following observations were made with the SPEC CPU2000 benchmarks.

SPEC ratio with path splitting vs. without path splitting: 3653.353 vs. 3652.14 for the INT benchmarks.
SPEC ratio with path splitting vs. without path splitting: 4353.812 vs. 4345.351 for the FP benchmarks.

Based on the comments on the RFC patch, the following changes were made.

1. Added a new pass for path splitting.
2. Placed the new path splitting optimization pass before the copy propagation pass.
3. The join block that is also the source of the loop latch edge is wired into its predecessors, so that the CFG cleanup pass will merge the blocks that were wired together.
4. The copy propagation routines added for path splitting are not needed, as suggested by Jeff. They are removed in this patch, since copy propagation in the copied join blocks will be done by the existing copy propagation pass and the update-SSA pass.
5. Only the propagation of the join block's phi result with its phi arguments is done here, since that is not done by the existing update_ssa or copy propagation passes on the tree SSA representation.
6. Added 2 tests.
    a) compilation check test.
    b) execution test.
7. Refactored the code for the feasibility check and for finding the join block that is the same as the loop latch node.

    [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation.
    
    Add a new path splitting pass on the tree SSA representation. The path
    splitting optimization is a CFG transformation in which the join block
    of an if-then-else, when it is also the loop latch node, is moved into
    and merged with its predecessor blocks while preserving the SSA
    representation.
    
    ChangeLog:
    2015-06-30  Ajit Agarwal  <ajitkum@xilinx.com>
    
        * gcc/Makefile.in: Add the build of the new file
        tree-ssa-path-split.c
        * gcc/common.opt: Add the new flag ftree-path-split.
        * gcc/opts.c: Add an entry for Path splitting pass
        with optimization flag greater and equal to O2.
        * gcc/passes.def: Enable and add new pass path splitting.
        * gcc/timevar.def: Add the new entry for TV_TREE_PATH_SPLIT.
        * gcc/tree-pass.h: Extern Declaration of make_pass_path_split.
        * gcc/tree-ssa-path-split.c: New file for path splitting pass.
        * gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c: New testcase.
        * gcc/testsuite/gcc.dg/path-split-1.c: New testcase.
    
    Signed-off-by: Ajit Agarwal <ajitkum@xilinx.com>

gcc/Makefile.in                              |   1 +
 gcc/common.opt                               |   4 +
 gcc/opts.c                                   |   1 +
 gcc/passes.def                               |   1 +
 gcc/testsuite/gcc.dg/path-split-1.c          |  65 ++++
 gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c |  62 ++++
 gcc/timevar.def                              |   1 +
 gcc/tree-pass.h                              |   1 +
 gcc/tree-ssa-path-split.c                    | 462 +++++++++++++++++++++++++++
 9 files changed, 598 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/path-split-1.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
 create mode 100644 gcc/tree-ssa-path-split.c

diff --git a/gcc/Makefile.in b/gcc/Makefile.in index 5f9261f..35ac363 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1476,6 +1476,7 @@ OBJS = \
 	tree-vect-slp.o \
 	tree-vectorizer.o \
 	tree-vrp.o \
+        tree-ssa-path-split.o \
 	tree.o \
 	valtrack.o \
 	value-prof.o \
diff --git a/gcc/common.opt b/gcc/common.opt index e104269..c63b100 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -2328,6 +2328,10 @@ ftree-vrp
 Common Report Var(flag_tree_vrp) Init(0) Optimization  Perform Value Range Propagation on trees
 
+ftree-path-split
+Common Report Var(flag_tree_path_split) Init(0) Optimization
+Perform Path Splitting
+
 funit-at-a-time
 Common Report Var(flag_unit_at_a_time) Init(1) Optimization  Compile whole compilation unit at a time diff --git a/gcc/opts.c b/gcc/opts.c index 8a16116..31947ff 100644
--- a/gcc/opts.c
+++ b/gcc/opts.c
@@ -508,6 +508,7 @@ static const struct default_options default_options_table[] =
     { OPT_LEVELS_2_PLUS, OPT_fisolate_erroneous_paths_dereference, NULL, 1 },
     { OPT_LEVELS_2_PLUS, OPT_fipa_ra, NULL, 1 },
     { OPT_LEVELS_2_PLUS, OPT_flra_remat, NULL, 1 },
+    { OPT_LEVELS_2_PLUS, OPT_ftree_path_split, NULL, 1 },
 
     /* -O3 optimizations.  */
     { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 }, diff --git a/gcc/passes.def b/gcc/passes.def index c0ddee4..43618eb 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -155,6 +155,7 @@ along with GCC; see the file COPYING3.  If not see
       NEXT_PASS (pass_ccp);
       /* After CCP we rewrite no longer addressed locals into SSA
 	 form if possible.  */
+      NEXT_PASS (pass_path_split);
       NEXT_PASS (pass_copy_prop);
       NEXT_PASS (pass_complete_unrolli);
       NEXT_PASS (pass_phiprop);
diff --git a/gcc/testsuite/gcc.dg/path-split-1.c b/gcc/testsuite/gcc.dg/path-split-1.c
new file mode 100644
index 0000000..075dc87
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/path-split-1.c
@@ -0,0 +1,65 @@
+/* { dg-do run } */
+/* { dg-options "-O2 " } */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define RGBMAX 255
+
+int
+test()
+{
+  int i, Pels;
+  unsigned char sum = 0;
+  unsigned char xr, xg, xb;
+  unsigned char xc, xm, xy, xk;
+  unsigned char *ReadPtr, *EritePtr;
+
+  ReadPtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);  
+ EritePtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
+
+  for (i = 0; i < 100;i++)
+     {
+       ReadPtr[i] = 100 - i;
+     }
+
+  for (i = 0; i < 100; i++)
+     {
+       xr = *ReadPtr++;
+       xg = *ReadPtr++;
+       xb = *ReadPtr++;
+
+       xc = (unsigned char) (RGBMAX - xr);
+       xm = (unsigned char) (RGBMAX - xg);
+       xy = (unsigned char) (RGBMAX - xb);
+
+       if (xc < xm)
+         {
+           xk = (unsigned char) (xc < xy ? xc : xy);
+         }
+       else
+        {
+          xk = (unsigned char) (xm < xy ? xm : xy);
+        }
+
+       xc = (unsigned char) (xc - xk);
+       xm = (unsigned char) (xm - xk);
+       xy = (unsigned char) (xy - xk);
+
+       *EritePtr++ = xc;
+       *EritePtr++ = xm;
+       *EritePtr++ = xy;
+       *EritePtr++ = xk;
+       sum += *EritePtr;
+    }
+  return sum;
+}
+
+int
+main()
+{
+  if (test() != 33)
+    abort();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c b/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
new file mode 100644
index 0000000..19f277c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
@@ -0,0 +1,62 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-path_split" } */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define RGBMAX 255
+
+int
+test()
+{
+  int i, Pels;
+  unsigned char sum = 0;
+  unsigned char xr, xg, xb;
+  unsigned char xc, xm, xy, xk;
+  unsigned char *ReadPtr, *EritePtr;
+
+  ReadPtr = (unsigned char *) malloc (sizeof (unsigned char) * 100);  
+ EritePtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
+
+  for (i = 0; i < 100;i++)
+     {
+       ReadPtr[i] = 100 - i;
+     }
+
+  for (i = 0; i < 100; i++)
+     {
+       xr = *ReadPtr++;
+       xg = *ReadPtr++;
+       xb = *ReadPtr++;
+
+       xc = ( unsigned char) (RGBMAX - xr);
+       xm = ( unsigned char) (RGBMAX - xg);
+       xy = ( unsigned char) (RGBMAX - xb);
+
+       if (xc < xm)
+         {
+           xk = ( unsigned char) (xc < xy ? xc : xy);
+         }
+       else
+         {
+           xk = ( unsigned char) (xm < xy ? xm : xy);
+         }
+
+       xc = (unsigned char) (xc - xk);
+       xm = (unsigned char) (xm - xk);
+       xy = (unsigned char) (xy - xk);
+
+       *EritePtr++ = xc;
+       *EritePtr++ = xm;
+       *EritePtr++ = xy;
+       *EritePtr++ = xk;
+       sum += *EritePtr;
+    }
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump "xc_[0-9][0-9]* -> { xc_[0-9][0-9]* }" "path_split"} } */
+/* { dg-final { scan-tree-dump "xm_[0-9][0-9]* -> { xm_[0-9][0-9]* }" "path_split"} } */
+/* { dg-final { scan-tree-dump "xy_[0-9][0-9]* -> { xy_[0-9][0-9]* }" "path_split"} } */
+/* { dg-final { scan-tree-dump "Merging blocks" "path_split"} } */
+/* { dg-final { cleanup-tree-dump "path_split" } } */
diff --git a/gcc/timevar.def b/gcc/timevar.def index 711bbed..6217a8e 100644
--- a/gcc/timevar.def
+++ b/gcc/timevar.def
@@ -288,3 +288,4 @@ DEFTIMEVAR (TV_JIT_REPLAY	     , "replay of JIT client activity")
 DEFTIMEVAR (TV_ASSEMBLE	     , "assemble JIT code")
 DEFTIMEVAR (TV_LINK		     , "link JIT code")
 DEFTIMEVAR (TV_LOAD		     , "load JIT result")
+DEFTIMEVAR (TV_TREE_PATH_SPLIT  , "tree path_split")
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index 398ab83..e00639e 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -379,6 +379,7 @@ extern gimple_opt_pass *make_pass_iv_optimize (gcc::context *ctxt);  extern gimple_opt_pass *make_pass_tree_loop_done (gcc::context *ctxt);  extern gimple_opt_pass *make_pass_ch (gcc::context *ctxt);  extern gimple_opt_pass *make_pass_ccp (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_path_split (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_phi_only_cprop (gcc::context *ctxt);  extern gimple_opt_pass *make_pass_build_ssa (gcc::context *ctxt);  extern gimple_opt_pass *make_pass_build_alias (gcc::context *ctxt); diff --git a/gcc/tree-ssa-path-split.c b/gcc/tree-ssa-path-split.c new file mode 100644 index 0000000..3da7791
--- /dev/null
+++ b/gcc/tree-ssa-path-split.c
@@ -0,0 +1,462 @@
+/* Support routines for Path Splitting.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   Contributed by Ajit Kumar Agarwal <ajitkum@xilinx.com>.
+ 
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it under 
+ the terms of the GNU General Public License as published by the Free 
+ Software Foundation; either version 3, or (at your option) any later 
+ version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY 
+WARRANTY; without even the implied warranty of MERCHANTABILITY or 
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+for more details.
+
+You should have received a copy of the GNU General Public License along 
+with GCC; see the file COPYING3.  If not see 
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "flags.h"
+#include "tree.h"
+#include "stor-layout.h"
+#include "calls.h"
+#include "predict.h"
+#include "vec.h"
+#include "hashtab.h"
+#include "hash-set.h"
+#include "machmode.h"
+#include "hard-reg-set.h"
+#include "input.h"
+#include "function.h"
+#include "dominance.h"
+#include "cfg.h"
+#include "cfganal.h"
+#include "basic-block.h"
+#include "tree-ssa-alias.h"
+#include "internal-fn.h"
+#include "gimple-fold.h"
+#include "tree-eh.h"
+#include "gimple-expr.h"
+#include "is-a.h"
+#include "gimple.h"
+#include "gimple-iterator.h"
+#include "gimple-walk.h"
+#include "gimple-ssa.h"
+#include "tree-cfg.h"
+#include "tree-phinodes.h"
+#include "ssa-iterators.h"
+#include "stringpool.h"
+#include "tree-ssanames.h"
+#include "tree-ssa-loop-manip.h"
+#include "tree-ssa-loop-niter.h"
+#include "tree-ssa-loop.h"
+#include "tree-into-ssa.h"
+#include "tree-ssa.h"
+#include "tree-pass.h"
+#include "tree-dump.h"
+#include "gimple-pretty-print.h"
+#include "diagnostic-core.h"
+#include "intl.h"
+#include "cfgloop.h"
+#include "tree-scalar-evolution.h"
+#include "tree-ssa-propagate.h"
+#include "tree-chrec.h"
+#include "tree-ssa-threadupdate.h"
+#include "expr.h"
+#include "insn-codes.h"
+#include "optabs.h"
+#include "tree-ssa-threadedge.h"
+#include "wide-int.h"
+
+/* Replace every use of USE1 in the statements of block B with USE2.
+   If USE1 is NULL_TREE, every SSA use operand in B is replaced with
+   USE2.  This is called from replace_uses_phi on the other copy of
+   the join block, to replace uses of the first phi argument with the
+   second phi argument.  */
+
+static void
+replace_use_phi_operand1_with_operand2 (basic_block b,
+                                        tree use1,
+                                        tree use2) {
+  use_operand_p use;
+  ssa_op_iter iter;
+  gimple_stmt_iterator gsi;
+
+  for (gsi = gsi_start_bb (b); !gsi_end_p (gsi);)
+     {
+       gimple stmt = gsi_stmt (gsi);
+       FOR_EACH_SSA_USE_OPERAND (use, stmt, iter, SSA_OP_USE)
+       {
+         tree tuse = USE_FROM_PTR (use);
+          if (use1 == tuse || use1 == NULL_TREE)
+            {
+              propagate_value (use, use2);
+              update_stmt(stmt);
+            }
+        }
+       gsi_next(&gsi);
+     }
+}
+
+/* Propagate the arguments of the first phi node of join block B into
+   the uses of its result.  The join block is copied and wired into
+   both predecessors, so the phi result is used in both copies: all
+   uses of the phi result are first replaced with the first phi
+   argument, and then, in the copy TEMP_BB, uses of the first phi
+   argument are replaced with the second phi argument.  */
+ 
+static
+void replace_uses_phi (basic_block b, basic_block temp_bb) {
+  gimple_seq phis = phi_nodes (b);
+  gimple phi = gimple_seq_first_stmt (phis);
+  tree def = gimple_phi_result (phi), use = gimple_phi_arg_def (phi,0);
+  tree use2 = gimple_phi_arg_def (phi,1);
+
+  if (virtual_operand_p (def))
+    {
+      imm_use_iterator iter;
+      use_operand_p use_p;
+      gimple stmt;
+
+      FOR_EACH_IMM_USE_STMT (stmt, iter, def)
+        FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
+          SET_USE (use_p, use);
+      if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (def))
+        SSA_NAME_OCCURS_IN_ABNORMAL_PHI (use) = 1;
+    }
+   else
+     replace_uses_by (def, use);
+   replace_use_phi_operand1_with_operand2 (temp_bb, use, use2); }
+
+/* Returns true if the block bb has label or call statements.
+   Otherwise return false.  */
+
+static bool
+is_block_has_label_call (basic_block bb) {
+  gimple_stmt_iterator gsi;
+
+  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+     {
+       gimple stmt = gsi_stmt(gsi);
+       if (dyn_cast <glabel *> (stmt))
+         {
+           return true;
+         }
+       if (is_gimple_call (stmt))
+         return true;
+     }
+  return false;
+}
+
+/* Perform the feasibility tests for path splitting.  Return true if
+   path splitting is feasible for JOIN_NODE with predecessors PRED1
+   and PRED2, and false otherwise.  The following tests are performed.
+
+   1. Return false if the join block has call gimple statements.
+   2. Return false if the join block has rhs casting for assign
+      gimple statements.
+   3. Return false if PRED1 or PRED2 is NULL, or if the join block
+      does not have more than 2 non-debug statements.
+   4. Return false if the join block has more than one non-virtual
+      phi node, or has a phi node with a virtual result.
+   5. Return false if either predecessor block has label or call
+      statements.
+   6. Return false if the phi result of a phi node in the join block
+      is used outside the join block.
+   7. Otherwise return true.  */
+
+static bool
+is_feasible_path_splitting (basic_block join_node, basic_block pred1,
+                           basic_block pred2) {
+  int num_stmt = 0, num_phis = 0;
+  gimple_stmt_iterator psi, gsi;
+
+  for (gsi = gsi_start_bb (join_node); !gsi_end_p (gsi); gsi_next (&gsi))
+     {
+       gimple stmt = gsi_stmt(gsi);
+
+       if (gimple_assign_cast_p (stmt))
+         return false;
+
+       if (is_gimple_call (stmt))
+         return false;
+
+       if (!is_gimple_debug(stmt))
+         {
+           num_stmt++;
+         }
+     }
+
+   if (pred1 && pred2 && (num_stmt > 2))
+     {
+       bool found_virtual_result = false;
+
+       for (psi = gsi_start_phis (join_node); !gsi_end_p (psi); )
+          {
+            use_operand_p use_p;
+            imm_use_iterator iter;
+            gimple stmt = gsi_stmt(psi);
+
+            if (!virtual_operand_p (gimple_phi_result (stmt)))
+              num_phis++;
+            else
+              found_virtual_result = true;
+
+            FOR_EACH_IMM_USE_FAST (use_p, iter, gimple_phi_result (stmt))
+            {
+              gimple use_stmt = USE_STMT (use_p);
+
+              if (gimple_bb (use_stmt) != join_node)
+                return false;
+            }
+
+            gsi_next(&psi);
+         }
+
+       if ((num_phis >1) || found_virtual_result)
+          return false;
+
+       if(is_block_has_label_call(pred1) || is_block_has_label_call(pred2))
+         return false;
+
+       return true;
+    }
+  return false;
+}
+
+/* Set the containing basic block of every statement in block B
+   to B.  */
+
+static void
+update_stmt_bb(basic_block b)
+{
+  gimple_stmt_iterator gsi;
+  for(gsi = gsi_start_bb(b); !gsi_end_p(gsi); gsi_next(&gsi))
+   {
+     gimple stmt = gsi_stmt(gsi);
+     gimple_set_bb(stmt,b);
+   }
+}
+
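+/* Search the blocks dominated by BB for a join block that is not a
+   direct successor of BB, has exactly two predecessors and is the
+   source of a DFS back edge (i.e. the join block is also the source
+   of the loop latch edge).  The predecessors of the join block are
+   stored in PRED1 and PRED2, which are passed by reference; each is
+   set only if that predecessor has a single fallthru successor edge.
+   Return the join block, or NULL if none is found.  */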
+
+static basic_block
+get_join_blk_same_as_loop_latch (basic_block bb,
+                                 basic_block &pred1,
+                                 basic_block &pred2) {
+  vec<basic_block> bbs;
+  basic_block bb1;
+  unsigned int i;
+  edge_iterator ei;
+  edge e1;
+  bool found = false ,found1;
+  bbs = get_all_dominated_blocks (CDI_DOMINATORS,
+                                  bb );
+  FOR_EACH_VEC_ELT (bbs, i, bb1)
+  {
+    found1 = false;
+    FOR_EACH_EDGE (e1, ei, bb->succs)
+    {
+      if ( bb1 == e1->dest)
+        {
+          found = true;
+          found1 = true;
+        }
+    }
+    if (!found1 && found)
+      {
+        found = false;
+        FOR_EACH_EDGE (e1, ei, bb1->succs)
+        {
+          if (e1->flags & (EDGE_DFS_BACK))
+            found = true;
+        }
+
+        if (found && EDGE_COUNT(bb1->preds) == 2)
+          {
+            unsigned int k = 0;
+            FOR_EACH_EDGE (e1, ei, bb1->preds)
+            {
+              if ((e1->flags & (EDGE_DFS_BACK)))
+                continue;
+
+              if ( k == 1)
+                {
+                  if (single_succ_p(e1->src) &&
+                      single_succ_edge (e1->src)->flags & EDGE_FALLTHRU)
+                    {
+                      pred2 = e1->src;
+                    }
+                }
+                else
+                  {
+                    if (single_succ_p(e1->src) &&
+                        single_succ_edge (e1->src)->flags & EDGE_FALLTHRU)
+                      {
+                        pred1 = e1->src;
+                      }
+                  }
+                k++;
+            }
+            bbs.release();
+            return bb1;
+          }
+       }
+   }
+   bbs.release();
+   return NULL;
+}
+
+/* Core of the path splitting transformation for the conditional
+   block BB.
+
+   Look up the join block that is also the source of a loop back edge,
+   together with its two predecessors PRED1 and PRED2.  If the
+   feasibility test accepts them, give each predecessor its own copy
+   of the join block's statements: the outgoing edge of each
+   predecessor is split, the original statement sequence is moved into
+   the new block after PRED1, and the sequence of a temporary duplicate
+   is moved into the new block after PRED2.  The uses of the join
+   block's phi result are then rewritten by replace_uses_phi.
+
+   The pass requests TODO_cleanup_cfg and TODO_update_ssa, so merging
+   the new blocks into the predecessors and repairing the SSA form are
+   left to CFG cleanup and update-ssa after the pass.  */
+
+static void
+perform_path_splitting (basic_block bb)
+{
+  basic_block pred1 = NULL;
+  basic_block pred2 = NULL;
+  basic_block join_block
+    = get_join_blk_same_as_loop_latch (bb, pred1, pred2);
+
+  if (!join_block
+      || !is_feasible_path_splitting (join_block, pred1, pred2))
+    return;
+
+  /* Temporary duplicate of the join block; it only serves as the
+     source of the second statement sequence and is deleted below.  */
+  basic_block join_copy = duplicate_block (join_block, NULL, NULL);
+
+  /* Create a new block on the outgoing edge(s) of each predecessor.  */
+  basic_block landing1 = NULL;
+  basic_block landing2 = NULL;
+  edge_iterator ei;
+  edge e;
+
+  FOR_EACH_EDGE (e, ei, pred1->succs)
+    landing1 = split_edge (e);
+
+  FOR_EACH_EDGE (e, ei, pred2->succs)
+    landing2 = split_edge (e);
+
+  /* Original statements go after PRED1, the duplicate's after PRED2.  */
+  gimple_stmt_iterator pos = gsi_start_bb (landing1);
+  gsi_insert_seq_after (&pos, bb_seq (join_block), GSI_NEW_STMT);
+  pos = gsi_start_bb (landing2);
+  gsi_insert_seq_after (&pos, bb_seq (join_copy), GSI_NEW_STMT);
+  update_stmt_bb (landing1);
+  update_stmt_bb (landing2);
+
+  replace_uses_phi (join_block, landing2);
+
+  /* Both sequences now live in the new blocks; detach them from their
+     old owners before the temporary duplicate is removed.  */
+  set_bb_seq (join_block, NULL);
+  set_bb_seq (join_copy, NULL);
+  delete_basic_block (join_copy);
+}
+
+/* Main driver of the path splitting pass.  Set up loop structures, the
+   original/copy tables, dominance information and DFS back edge
+   flags, then call perform_path_splitting on every block of the
+   current function that ends in a GIMPLE_COND whose left operand is an
+   SSA name of integral or pointer type and whose right operand is an
+   SSA name or a minimal invariant.  Loop structures, copy tables and
+   dominance information are released again before returning.  Always
+   return 0.  */
+
+static unsigned int
+execute_path_split (void)
+{
+  basic_block bb;
+
+  loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS);
+  initialize_original_copy_tables ();
+
+  calculate_dominance_info (CDI_DOMINATORS);
+  /* NOTE(review): nothing in this function queries post-dominators;
+     confirm whether any helper still needs them.  */
+  calculate_dominance_info (CDI_POST_DOMINATORS);
+
+  mark_dfs_back_edges ();
+
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      gimple last = gsi_stmt (gsi_last_bb (bb));
+
+      /* Only blocks ending in a condition are of interest.  */
+      if (!last || gimple_code (last) != GIMPLE_COND)
+        continue;
+
+      /* The type of the second operand matches that of the first, so
+         only the first operand's type is checked.  */
+      tree lhs = gimple_cond_lhs (last);
+      if (TREE_CODE (lhs) != SSA_NAME
+          || !(INTEGRAL_TYPE_P (TREE_TYPE (lhs))
+               || POINTER_TYPE_P (TREE_TYPE (lhs))))
+        continue;
+
+      tree rhs = gimple_cond_rhs (last);
+      if (TREE_CODE (rhs) != SSA_NAME && !is_gimple_min_invariant (rhs))
+        continue;
+
+      perform_path_splitting (bb);
+    }
+
+  loop_optimizer_finalize ();
+  free_original_copy_tables ();
+  free_dominance_info (CDI_DOMINATORS);
+  free_dominance_info (CDI_POST_DOMINATORS);
+  return 0;
+}
+
+namespace {
+
+/* Static description of the path splitting pass: a GIMPLE pass that
+   requires SSA form and requests CFG cleanup and an SSA update when it
+   finishes.  */
+const pass_data pass_data_path_split =
+{
+  GIMPLE_PASS, /* type */
+  "path_split", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_TREE_PATH_SPLIT, /* tv_id */
+  PROP_ssa, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  ( TODO_cleanup_cfg | TODO_update_ssa ), /* todo_flags_finish */
+};
+
+/* Pass object for path splitting.  The pass is gated on
+   flag_tree_path_split and delegates its work to
+   execute_path_split.  */
+class pass_path_split : public gimple_opt_pass
+{
+public:
+  pass_path_split (gcc::context *ctxt)
+    : gimple_opt_pass (pass_data_path_split, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  opt_pass * clone () { return new pass_path_split (m_ctxt); }
+  virtual bool gate (function *) { return flag_tree_path_split != 0; }
+  virtual unsigned int execute (function *)
+    {
+      return execute_path_split ();
+    }
+
+}; // class pass_path_split
+
+} // anon namespace
+
+/* Create a new instance of the path splitting pass for context CTXT.  */
+gimple_opt_pass *
+make_pass_path_split (gcc::context *ctxt)
+{
+  gimple_opt_pass *pass = new pass_path_split (ctxt);
+  return pass;
+}
--
1.8.2.1

Thanks & Regards
Ajit

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-06-30  8:34 [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation Ajit Kumar Agarwal
                   ` (2 preceding siblings ...)
  2015-06-30 12:39 ` Ajit Kumar Agarwal
@ 2015-06-30 22:18 ` Joseph Myers
  2015-07-02  3:52   ` Ajit Kumar Agarwal
  3 siblings, 1 reply; 72+ messages in thread
From: Joseph Myers @ 2015-06-30 22:18 UTC (permalink / raw)
  To: Ajit Kumar Agarwal
  Cc: law, GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

On Tue, 30 Jun 2015, Ajit Kumar Agarwal wrote:

>         * gcc/common.opt: Add the new flag ftree-path-split.

All options need documenting in invoke.texi.

> +#include "tm.h"

Why?  Does some other header depend on this, or are you using a target 
macro?

-- 
Joseph S. Myers
joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 72+ messages in thread

* RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-06-30 22:18 ` Joseph Myers
@ 2015-07-02  3:52   ` Ajit Kumar Agarwal
  2015-07-06 20:08     ` Jeff Law
  0 siblings, 1 reply; 72+ messages in thread
From: Ajit Kumar Agarwal @ 2015-07-02  3:52 UTC (permalink / raw)
  To: Joseph Myers
  Cc: law, GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala



-----Original Message-----
From: Joseph Myers [mailto:joseph@codesourcery.com] 
Sent: Wednesday, July 01, 2015 3:48 AM
To: Ajit Kumar Agarwal
Cc: law@redhat.com; GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation

On Tue, 30 Jun 2015, Ajit Kumar Agarwal wrote:

>         * gcc/common.opt: Add the new flag ftree-path-split.

>>All options need documenting in invoke.texi.

Sure.
> +#include "tm.h"

>>Why?  Does some other header depend on this, or are you using a target macro?

I am not using any target macro. Many header files include tm.h, and many tree-ssa optimization
files also include "tm.h"; to list some of them: tree-ssa-threadupdate.c, tree-vrp.c, and tree-ssa-threadedge.c.

Thanks & Regards
Ajit
--
Joseph S. Myers
joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 72+ messages in thread

* RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-06-30 11:39 ` Richard Biener
  2015-06-30 12:07   ` Ajit Kumar Agarwal
@ 2015-07-04 12:40   ` Ajit Kumar Agarwal
  2015-07-07  8:50     ` Richard Biener
  1 sibling, 1 reply; 72+ messages in thread
From: Ajit Kumar Agarwal @ 2015-07-04 12:40 UTC (permalink / raw)
  To: Richard Biener
  Cc: law, GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala



-----Original Message-----
From: Richard Biener [mailto:richard.guenther@gmail.com] 
Sent: Tuesday, June 30, 2015 4:42 PM
To: Ajit Kumar Agarwal
Cc: law@redhat.com; GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation

On Tue, Jun 30, 2015 at 10:16 AM, Ajit Kumar Agarwal <ajit.kumar.agarwal@xilinx.com> wrote:
> All:
>
> The below patch added a new path Splitting optimization pass on SSA 
> representation. The Path Splitting optimization Pass moves the join 
> block of if-then-else same as loop latch to its predecessors and get merged with the predecessors Preserving the SSA representation.
>
> The patch is tested for Microblaze and i386 target. The EEMBC/Mibench 
> benchmarks is run with the Microblaze target And the performance gain 
> of 9.15% and rgbcmy01_lite(EEMBC benchmarks). The Deja GNU tests is run for Mircroblaze Target and no regression is seen for Microblaze target and the new testcase attached are passed.
>
> For i386 bootstrapping goes through fine and the Spec cpu2000 
> benchmarks is run with this patch. Following observation were seen with spec cpu2000 benchmarks.
>
> Ratio of path splitting change vs Ratio of not having path splitting change is 3653.353 vs 3652.14 for INT benchmarks.
> Ratio of path splitting change vs Ratio of not having path splitting change is  4353.812 vs 4345.351 for FP benchmarks.
>
> Based on comments from RFC patch following changes were done.
>
> 1. Added a new pass for path splitting changes.
> 2. Placed the new path  Splitting Optimization pass before the copy propagation pass.
> 3. The join block same as the Loop latch is wired into its 
> predecessors so that the CFG Cleanup pass will merge the blocks Wired together.
> 4. Copy propagation routines added for path splitting changes is not 
> needed as suggested by Jeff. They are removed in the patch as The copy propagation in the copied join blocks will be done by the existing copy propagation pass and the update ssa pass.
> 5. Only the propagation of phi results of the join block with the phi 
> argument is done which will not be done by the existing update_ssa Or copy propagation pass on tree ssa representation.
> 6. Added 2 tests.
>     a) compilation check  tests.
>    b) execution tests.
> 7. Refactoring of the code for the feasibility check and finding the join block same as loop latch node.
>
>     [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation.
>
>     Added a new pass on path splitting on tree SSA representation. The path
>     splitting optimization does the CFG transformation of join block of the
>     if-then-else same as the loop latch node is moved and merged with the
>     predecessor blocks after preserving the SSA representation.
>
>     ChangeLog:
>     2015-06-30  Ajit Agarwal  <ajitkum@xilinx.com>
>
>         * gcc/Makefile.in: Add the build of the new file
>         tree-ssa-path-split.c
>         * gcc/common.opt: Add the new flag ftree-path-split.
>         * gcc/opts.c: Add an entry for Path splitting pass
>         with optimization flag greater and equal to O2.
>         * gcc/passes.def: Enable and add new pass path splitting.
>         * gcc/timevar.def: Add the new entry for TV_TREE_PATH_SPLIT.
>         * gcc/tree-pass.h: Extern Declaration of make_pass_path_split.
>         * gcc/tree-ssa-path-split.c: New file for path splitting pass.
>         * gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c: New testcase.
>         * gcc/testsuite/gcc.dg/path-split-1.c: New testcase.

>>I'm not 100% sure I understand the transform but what I see from the testcases it tail-duplicates from a conditional up to a loop latch block (not sure if it >>includes it and thus ends up creating a loop nest or not).

>>An observation I have is that the pass should at least share the transform stage to some extent with the existing tracer pass (tracer.c) which essentially does >>the same but not restricted to loops in any way.  

The following piece of code from tracer.c can be shared with the existing path splitting pass.

{
             e = find_edge (bb, bb2);

              copy = duplicate_block (bb2, e, bb);
              flush_pending_stmts (e);

              add_phi_args_after_copy (&copy, 1, NULL);
}

Sharing the above code of the transform stage of tracer.c with the path splitting pass has the following limitation.

1. The duplicated loop latch node is wired to its predecessors, and the existing phi node in the loop latch node, together with the
phi arguments from its corresponding predecessors, is moved to the duplicated loop latch node that is wired into its predecessors. Because
of this, the duplicated loop latch nodes wired into the predecessors will not be merged with the original predecessors by the CFG cleanup phase.

>> So I wonder if your pass could be simply another heuristic to compute paths to trace in the existing tracer pass.

Sorry, it is not very clear to me what you mean by the above.  I am trying to figure out whether you expect the existing tracer pass (tracer.c) to be modified,
or the path splitting pass to coexist with it.

My understanding is that the existing tracer pass finds traces based on the frequencies from the profile data.
Based on those traces, tail duplication is done in order to enable superblock regions. So I wonder whether the path splitting pass could be incorporated into the
existing pass to compute the paths to trace.
 
Thanks & Regards
Ajit

Thanks,
Richard.

>     Signed-off-by:Ajit Agarwal ajitkum@xilinx.com.
>
> gcc/Makefile.in                              |   1 +
>  gcc/common.opt                               |   4 +
>  gcc/opts.c                                   |   1 +
>  gcc/passes.def                               |   1 +
>  gcc/testsuite/gcc.dg/path-split-1.c          |  65 ++++
>  gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c |  62 ++++
>  gcc/timevar.def                              |   1 +
>  gcc/tree-pass.h                              |   1 +
>  gcc/tree-ssa-path-split.c                    | 462 +++++++++++++++++++++++++++
>  9 files changed, 598 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.dg/path-split-1.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
>  create mode 100644 gcc/tree-ssa-path-split.c
>
> diff --git a/gcc/Makefile.in b/gcc/Makefile.in index 5f9261f..35ac363 
> 100644
> --- a/gcc/Makefile.in
> +++ b/gcc/Makefile.in
> @@ -1476,6 +1476,7 @@ OBJS = \
>         tree-vect-slp.o \
>         tree-vectorizer.o \
>         tree-vrp.o \
> +        tree-ssa-path-split.o \
>         tree.o \
>         valtrack.o \
>         value-prof.o \
> diff --git a/gcc/common.opt b/gcc/common.opt index e104269..c63b100 
> 100644
> --- a/gcc/common.opt
> +++ b/gcc/common.opt
> @@ -2328,6 +2328,10 @@ ftree-vrp
>  Common Report Var(flag_tree_vrp) Init(0) Optimization  Perform Value 
> Range Propagation on trees
>
> +ftree-path-split
> +Common Report Var(flag_tree_path_split) Init(0) Optimization Perform 
> +Path Splitting
> +
>  funit-at-a-time
>  Common Report Var(flag_unit_at_a_time) Init(1) Optimization  Compile 
> whole compilation unit at a time diff --git a/gcc/opts.c b/gcc/opts.c 
> index 8a16116..31947ff 100644
> --- a/gcc/opts.c
> +++ b/gcc/opts.c
> @@ -508,6 +508,7 @@ static const struct default_options default_options_table[] =
>      { OPT_LEVELS_2_PLUS, OPT_fisolate_erroneous_paths_dereference, NULL, 1 },
>      { OPT_LEVELS_2_PLUS, OPT_fipa_ra, NULL, 1 },
>      { OPT_LEVELS_2_PLUS, OPT_flra_remat, NULL, 1 },
> +    { OPT_LEVELS_2_PLUS, OPT_ftree_path_split, NULL, 1 },
>
>      /* -O3 optimizations.  */
>      { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 
> }, diff --git a/gcc/passes.def b/gcc/passes.def index c0ddee4..43618eb 
> 100644
> --- a/gcc/passes.def
> +++ b/gcc/passes.def
> @@ -155,6 +155,7 @@ along with GCC; see the file COPYING3.  If not see
>        NEXT_PASS (pass_ccp);
>        /* After CCP we rewrite no longer addressed locals into SSA
>          form if possible.  */
> +      NEXT_PASS (pass_path_split);
>        NEXT_PASS (pass_copy_prop);
>        NEXT_PASS (pass_complete_unrolli);
>        NEXT_PASS (pass_phiprop);
> diff --git a/gcc/testsuite/gcc.dg/path-split-1.c 
> b/gcc/testsuite/gcc.dg/path-split-1.c
> new file mode 100644
> index 0000000..075dc87
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/path-split-1.c
> @@ -0,0 +1,65 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 " } */
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +
> +#define RGBMAX 255
> +
> +int
> +test()
> +{
> +  int i, Pels;
> +  unsigned char sum = 0;
> +  unsigned char xr, xg, xb;
> +  unsigned char xc, xm, xy, xk;
> +  unsigned char *ReadPtr, *EritePtr;
> +
> +  ReadPtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);  
> + EritePtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
> +
> +  for (i = 0; i < 100;i++)
> +     {
> +       ReadPtr[i] = 100 - i;
> +     }
> +
> +  for (i = 0; i < 100; i++)
> +     {
> +       xr = *ReadPtr++;
> +       xg = *ReadPtr++;
> +       xb = *ReadPtr++;
> +
> +       xc = (unsigned char) (RGBMAX - xr);
> +       xm = (unsigned char) (RGBMAX - xg);
> +       xy = (unsigned char) (RGBMAX - xb);
> +
> +       if (xc < xm)
> +         {
> +           xk = (unsigned char) (xc < xy ? xc : xy);
> +         }
> +       else
> +        {
> +          xk = (unsigned char) (xm < xy ? xm : xy);
> +        }
> +
> +       xc = (unsigned char) (xc - xk);
> +       xm = (unsigned char) (xm - xk);
> +       xy = (unsigned char) (xy - xk);
> +
> +       *EritePtr++ = xc;
> +       *EritePtr++ = xm;
> +       *EritePtr++ = xy;
> +       *EritePtr++ = xk;
> +       sum += *EritePtr;
> +    }
> +  return sum;
> +}
> +
> +int
> +main()
> +{
> +  if (test() != 33)
> +    abort();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
> new file mode 100644
> index 0000000..19f277c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
> @@ -0,0 +1,62 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-path_split" } */
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +
> +#define RGBMAX 255
> +
> +int
> +test()
> +{
> +  int i, Pels;
> +  unsigned char sum = 0;
> +  unsigned char xr, xg, xb;
> +  unsigned char xc, xm, xy, xk;
> +  unsigned char *ReadPtr, *EritePtr;
> +
> +  ReadPtr = (unsigned char *) malloc (sizeof (unsigned char) * 100);  
> + EritePtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
> +
> +  for (i = 0; i < 100;i++)
> +     {
> +       ReadPtr[i] = 100 - i;
> +     }
> +
> +  for (i = 0; i < 100; i++)
> +     {
> +       xr = *ReadPtr++;
> +       xg = *ReadPtr++;
> +       xb = *ReadPtr++;
> +
> +       xc = ( unsigned char) (RGBMAX - xr);
> +       xm = ( unsigned char) (RGBMAX - xg);
> +       xy = ( unsigned char) (RGBMAX - xb);
> +
> +       if (xc < xm)
> +         {
> +           xk = ( unsigned char) (xc < xy ? xc : xy);
> +         }
> +       else
> +         {
> +           xk = ( unsigned char) (xm < xy ? xm : xy);
> +         }
> +
> +       xc = (unsigned char) (xc - xk);
> +       xm = (unsigned char) (xm - xk);
> +       xy = (unsigned char) (xy - xk);
> +
> +       *EritePtr++ = xc;
> +       *EritePtr++ = xm;
> +       *EritePtr++ = xy;
> +       *EritePtr++ = xk;
> +       sum += *EritePtr;
> +    }
> +  return sum;
> +}
> +
> +/* { dg-final { scan-tree-dump "xc_[0-9][0-9]* -> { xc_[0-9][0-9]* }" 
> +"path_split"} } */
> +/* { dg-final { scan-tree-dump "xm_[0-9][0-9]* -> { xm_[0-9][0-9]* }" 
> +"path_split"} } */
> +/* { dg-final { scan-tree-dump "xy_[0-9][0-9]* -> { xy_[0-9][0-9]* }" 
> +"path_split"} } */
> +/* { dg-final { scan-tree-dump "Merging blocks" "path_split"} } */
> +/* { dg-final { cleanup-tree-dump "path_split" } } */
> diff --git a/gcc/timevar.def b/gcc/timevar.def index 711bbed..6217a8e 
> 100644
> --- a/gcc/timevar.def
> +++ b/gcc/timevar.def
> @@ -288,3 +288,4 @@ DEFTIMEVAR (TV_JIT_REPLAY        , "replay of JIT client activity")
>  DEFTIMEVAR (TV_ASSEMBLE             , "assemble JIT code")
>  DEFTIMEVAR (TV_LINK                 , "link JIT code")
>  DEFTIMEVAR (TV_LOAD                 , "load JIT result")
> +DEFTIMEVAR (TV_TREE_PATH_SPLIT  , "tree path_split")
> diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index 398ab83..e00639e 
> 100644
> --- a/gcc/tree-pass.h
> +++ b/gcc/tree-pass.h
> @@ -379,6 +379,7 @@ extern gimple_opt_pass *make_pass_iv_optimize 
> (gcc::context *ctxt);  extern gimple_opt_pass 
> *make_pass_tree_loop_done (gcc::context *ctxt);  extern 
> gimple_opt_pass *make_pass_ch (gcc::context *ctxt);  extern 
> gimple_opt_pass *make_pass_ccp (gcc::context *ctxt);
> +extern gimple_opt_pass *make_pass_path_split (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_phi_only_cprop (gcc::context 
> *ctxt);  extern gimple_opt_pass *make_pass_build_ssa (gcc::context 
> *ctxt);  extern gimple_opt_pass *make_pass_build_alias (gcc::context 
> *ctxt); diff --git a/gcc/tree-ssa-path-split.c 
> b/gcc/tree-ssa-path-split.c new file mode 100644 index 
> 0000000..3da7791
> --- /dev/null
> +++ b/gcc/tree-ssa-path-split.c
> @@ -0,0 +1,462 @@
> +/* Support routines for Path Splitting.
> +   Copyright (C) 2015 Free Software Foundation, Inc.
> +   Contributed by Ajit Kumar Agarwal <ajitkum@xilinx.com>.
> +
> + This file is part of GCC.
> +
> + GCC is free software; you can redistribute it and/or modify it under 
> + the terms of the GNU General Public License as published by the Free 
> + Software Foundation; either version 3, or (at your option) any later 
> + version.
> +
> +GCC is distributed in the hope that it will be useful, but WITHOUT 
> +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
> +FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
> +for more details.
> +
> +You should have received a copy of the GNU General Public License 
> +along with GCC; see the file COPYING3.  If not see 
> +<http://www.gnu.org/licenses/>.  */
> +
> +#include "config.h"
> +#include "system.h"
> +#include "coretypes.h"
> +#include "tm.h"
> +#include "flags.h"
> +#include "tree.h"
> +#include "stor-layout.h"
> +#include "calls.h"
> +#include "predict.h"
> +#include "vec.h"
> +#include "hashtab.h"
> +#include "hash-set.h"
> +#include "machmode.h"
> +#include "hard-reg-set.h"
> +#include "input.h"
> +#include "function.h"
> +#include "dominance.h"
> +#include "cfg.h"
> +#include "cfganal.h"
> +#include "basic-block.h"
> +#include "tree-ssa-alias.h"
> +#include "internal-fn.h"
> +#include "gimple-fold.h"
> +#include "tree-eh.h"
> +#include "gimple-expr.h"
> +#include "is-a.h"
> +#include "gimple.h"
> +#include "gimple-iterator.h"
> +#include "gimple-walk.h"
> +#include "gimple-ssa.h"
> +#include "tree-cfg.h"
> +#include "tree-phinodes.h"
> +#include "ssa-iterators.h"
> +#include "stringpool.h"
> +#include "tree-ssanames.h"
> +#include "tree-ssa-loop-manip.h"
> +#include "tree-ssa-loop-niter.h"
> +#include "tree-ssa-loop.h"
> +#include "tree-into-ssa.h"
> +#include "tree-ssa.h"
> +#include "tree-pass.h"
> +#include "tree-dump.h"
> +#include "gimple-pretty-print.h"
> +#include "diagnostic-core.h"
> +#include "intl.h"
> +#include "cfgloop.h"
> +#include "tree-scalar-evolution.h"
> +#include "tree-ssa-propagate.h"
> +#include "tree-chrec.h"
> +#include "tree-ssa-threadupdate.h"
> +#include "expr.h"
> +#include "insn-codes.h"
> +#include "optabs.h"
> +#include "tree-ssa-threadedge.h"
> +#include "wide-int.h"
> +
> +/* Replace_uses_phi function propagates the phi results with the
> +   first phi argument into each of the copied join blocks wired into
> +   its predecessors. This function is called from the replace_uses_phi
> +   to replace the uses of first phi arguments with the second
> +   phi arguments in the next copy of join block.  */
> +
> +static void
> +replace_use_phi_operand1_with_operand2 (basic_block b,
> +                                        tree use1,
> +                                        tree use2) {
> +  use_operand_p use;
> +  ssa_op_iter iter;
> +  gimple_stmt_iterator gsi;
> +
> +  for (gsi = gsi_start_bb (b); !gsi_end_p (gsi);)
> +     {
> +       gimple stmt = gsi_stmt (gsi);
> +       FOR_EACH_SSA_USE_OPERAND (use, stmt, iter, SSA_OP_USE)
> +       {
> +         tree tuse = USE_FROM_PTR (use);
> +          if (use1 == tuse || use1 == NULL_TREE)
> +            {
> +              propagate_value (use, use2);
> +              update_stmt(stmt);
> +            }
> +        }
> +       gsi_next(&gsi);
> +     }
> +}
> +
> +/* This function propagates the phi result into the use points with
> +   the phi arguments. The join block is copied and wired into the
> +   predecessors. Since the use points of the phi results will be same
> +   in the each of the copy join blocks in the  predecessors, it
> +   propagates the phi arguments in the copy of the join blocks wired
> +   into its predecessor.  */
> +
> +static
> +void replace_uses_phi (basic_block b, basic_block temp_bb) {
> +  gimple_seq phis = phi_nodes (b);
> +  gimple phi = gimple_seq_first_stmt (phis);
> +  tree def = gimple_phi_result (phi), use = gimple_phi_arg_def 
> +(phi,0);
> +  tree use2 = gimple_phi_arg_def (phi,1);
> +
> +  if (virtual_operand_p (def))
> +    {
> +      imm_use_iterator iter;
> +      use_operand_p use_p;
> +      gimple stmt;
> +
> +      FOR_EACH_IMM_USE_STMT (stmt, iter, def)
> +        FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
> +          SET_USE (use_p, use);
> +      if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (def))
> +        SSA_NAME_OCCURS_IN_ABNORMAL_PHI (use) = 1;
> +    }
> +   else
> +     replace_uses_by (def, use);
> +   replace_use_phi_operand1_with_operand2 (temp_bb, use, use2); }
> +
> +/* Returns true if the block bb has label or call statements.
> +   Otherwise return false.  */
> +
> +static bool
> +is_block_has_label_call (basic_block bb) {
> +  gimple_stmt_iterator gsi;
> +
> +  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> +     {
> +       gimple stmt = gsi_stmt(gsi);
> +       if (dyn_cast <glabel *> (stmt))
> +         {
> +           return true;
> +         }
> +       if (is_gimple_call (stmt))
> +         return true;
> +     }
> +  return false;
> +}
> +
> +/* This function performs the feasibility tests for path splitting
> +   to perform. Return false if the feasibility for path splitting
> +   is not done and returns true if the feasbility for path splitting
> +   is done. Following feasibility tests are performed.
> +
> +   1. Return false if the join block has call gimple statements.
> +   2. Return false if the join block has rhs casting for assign
> +      gimple statements.
> +   3. If the number of phis is greater than 1 or the phi node in
> +      the join block has virtual operand return false.
> +   4. Return false if the number of sequential statements is
> +      greater than 2.
> +   5. If the predecessors blocks has labels and call statements
> +      return false.
> +   6. If the phi result in the phi node of the join block is not
> +      used inside the same join block return false.
> +   7. Otherwise returns true.  */
> +
> +static bool
> +is_feasible_path_splitting (basic_block join_node, basic_block pred1,
> +                           basic_block pred2) {
> +  int num_stmt = 0, num_phis = 0;
> +  gimple_stmt_iterator psi, gsi;
> +
> +  for (gsi = gsi_start_bb (join_node); !gsi_end_p (gsi); gsi_next (&gsi))
> +     {
> +       gimple stmt = gsi_stmt(gsi);
> +
> +       if (gimple_assign_cast_p (stmt))
> +         return false;
> +
> +       if (is_gimple_call (stmt))
> +         return false;
> +
> +       if (!is_gimple_debug(stmt))
> +         {
> +           num_stmt++;
> +         }
> +     }
> +
> +   if (pred1 && pred2 && (num_stmt > 2))
> +     {
> +       bool found_virtual_result = false;
> +
> +       for (psi = gsi_start_phis (join_node); !gsi_end_p (psi); )
> +          {
> +            use_operand_p use_p;
> +            imm_use_iterator iter;
> +            gimple stmt = gsi_stmt(psi);
> +
> +            if (!virtual_operand_p (gimple_phi_result (stmt)))
> +              num_phis++;
> +            else
> +              found_virtual_result = true;
> +
> +            FOR_EACH_IMM_USE_FAST (use_p, iter, gimple_phi_result (stmt))
> +            {
> +              gimple use_stmt = USE_STMT (use_p);
> +
> +              if (gimple_bb (use_stmt) != join_node)
> +                return false;
> +            }
> +
> +            gsi_next(&psi);
> +         }
> +
> +       if ((num_phis >1) || found_virtual_result)
> +          return false;
> +
> +       if(is_block_has_label_call(pred1) || is_block_has_label_call(pred2))
> +         return false;
> +
> +       return true;
> +    }
> +  return false;
> +}
> +
> +/* Update the statements in the basic block with the basic
> +   basic block.  */
> +
> +static void
> +update_stmt_bb(basic_block b)
> +{
> +  gimple_stmt_iterator gsi;
> +  for(gsi = gsi_start_bb(b); !gsi_end_p(gsi); gsi_next(&gsi))
> +   {
> +     gimple stmt = gsi_stmt(gsi);
> +     gimple_set_bb(stmt,b);
> +   }
> +}
> +
> +/* This function gets the join blocks same as the source
> +   node of the loop latch nodes and the predecessors of
> +   the join block is updated in the pred1 and pred2 passed
> +   as the reference arguments into the function. Return
> +   the join block.  */
> +
> +static basic_block
> +get_join_blk_same_as_loop_latch (basic_block bb,
> +                                 basic_block &pred1,
> +                                 basic_block &pred2) {
> +  vec<basic_block> bbs;
> +  basic_block bb1;
> +  unsigned int i;
> +  edge_iterator ei;
> +  edge e1;
> +  bool found = false ,found1;
> +  bbs = get_all_dominated_blocks (CDI_DOMINATORS,
> +                                  bb );
> +  FOR_EACH_VEC_ELT (bbs, i, bb1)
> +  {
> +    found1 = false;
> +    FOR_EACH_EDGE (e1, ei, bb->succs)
> +    {
> +      if ( bb1 == e1->dest)
> +        {
> +          found = true;
> +          found1 = true;
> +        }
> +    }
> +    if (!found1 && found)
> +      {
> +        found = false;
> +        FOR_EACH_EDGE (e1, ei, bb1->succs)
> +        {
> +          if (e1->flags & (EDGE_DFS_BACK))
> +            found = true;
> +        }
> +
> +        if (found && EDGE_COUNT(bb1->preds) == 2)
> +          {
> +            unsigned int k = 0;
> +            FOR_EACH_EDGE (e1, ei, bb1->preds)
> +            {
> +              if ((e1->flags & (EDGE_DFS_BACK)))
> +                continue;
> +
> +              if ( k == 1)
> +                {
> +                  if (single_succ_p(e1->src) &&
> +                      single_succ_edge (e1->src)->flags & EDGE_FALLTHRU)
> +                    {
> +                      pred2 = e1->src;
> +                    }
> +                }
> +                else
> +                  {
> +                    if (single_succ_p(e1->src) &&
> +                        single_succ_edge (e1->src)->flags & EDGE_FALLTHRU)
> +                      {
> +                        pred1 = e1->src;
> +                      }
> +                  }
> +                k++;
> +            }
> +            bbs.release();
> +            return bb1;
> +          }
> +       }
> +   }
> +   bbs.release();
> +   return NULL;
> +}
> +
> +/* This is the core function to perform path splitting. The join
> +   same as the source of the loop latch node is identified along
> +   with their predecessors. Based on the feasibility tests for
> +   path splitting the path splitting is performed by wiring the
> +   copy of join blocks into the predecessors and propagating the phi
> +   result with the corresponding phi arguments into each of the copy
> +   of join blocks wired with the original predecessors of the join
> +   block.
> +
> +   The  tree-cfg-cleanup will merge the blocks in the predecessors
> +   path and the update-ssa will update the ssa representation after
> +   the path splitting is performed.  */
> +
> +static void
> +perform_path_splitting (basic_block bb) {
> +  basic_block pred1 = NULL, pred2 = NULL, join_block = NULL;
> +
> +  join_block = get_join_blk_same_as_loop_latch (bb, pred1, pred2);
> +
> +  if (join_block  &&
> +      is_feasible_path_splitting (join_block, pred1, pred2))
> +    {
> +      basic_block new_bb1 = NULL, new_bb2 = NULL;
> +      gimple_stmt_iterator last;
> +      basic_block temp_bb = NULL;
> +      edge_iterator ei;
> +      edge e1;
> +
> +      temp_bb = duplicate_block (join_block, NULL, NULL);
> +
> +      FOR_EACH_EDGE (e1, ei, pred1->succs)
> +        new_bb1 = split_edge (e1);
> +
> +      FOR_EACH_EDGE (e1, ei, pred2->succs)
> +        new_bb2 = split_edge (e1);
> +
> +      last = gsi_start_bb (new_bb1);
> +      gsi_insert_seq_after (&last, bb_seq (join_block), GSI_NEW_STMT);
> +      last = gsi_start_bb (new_bb2);
> +      gsi_insert_seq_after (&last, bb_seq (temp_bb), GSI_NEW_STMT);
> +      update_stmt_bb (new_bb1);
> +      update_stmt_bb (new_bb2);
> +
> +      replace_uses_phi (join_block, new_bb2);
> +
> +      set_bb_seq (join_block, NULL);
> +      set_bb_seq(temp_bb,NULL);
> +      delete_basic_block (temp_bb);
> +      return;
> +    }
> +}
> +
> +static unsigned int
> +execute_path_split (void)
> +{
> +  basic_block bb;
> +
> +  loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS);  
> + initialize_original_copy_tables();
> +
> +  calculate_dominance_info (CDI_DOMINATORS);  
> + calculate_dominance_info (CDI_POST_DOMINATORS);
> +
> +  mark_dfs_back_edges ();
> +
> +  FOR_EACH_BB_FN (bb, cfun)
> +  {
> +    gimple last;
> +
> +    /* We only care about blocks ending in a COND_EXPR. */
> +
> +    last = gsi_stmt (gsi_last_bb (bb));
> +
> +    /* We're basically looking for a switch or any kind of conditional with
> +       integral or pointer type arguments.  Note the type of the second
> +       argument will be the same as the first argument, so no need to
> +       check it explicitly.  */
> +    if ((last && (gimple_code (last) == GIMPLE_COND
> +            && TREE_CODE (gimple_cond_lhs (last)) == SSA_NAME
> +            && (INTEGRAL_TYPE_P (TREE_TYPE (gimple_cond_lhs (last)))
> +            || POINTER_TYPE_P (TREE_TYPE (gimple_cond_lhs (last))))
> +            && (TREE_CODE (gimple_cond_rhs (last)) == SSA_NAME
> +            || is_gimple_min_invariant (gimple_cond_rhs (last))))))
> +      {
> +
> +         if (gimple_code(last) == GIMPLE_COND)
> +           {
> +              perform_path_splitting (bb);
> +           }
> +      }
> +   }
> +
> +   loop_optimizer_finalize ();
> +   free_original_copy_tables ();
> +   free_dominance_info (CDI_DOMINATORS);
> +   free_dominance_info (CDI_POST_DOMINATORS);
> +   return 0;
> +}
> +
> +namespace {
> +
> +const pass_data pass_data_path_split = {
> +   GIMPLE_PASS, /* type */
> +   "path_split", /* name */
> +    OPTGROUP_NONE, /* optinfo_flags */
> +    TV_TREE_PATH_SPLIT, /* tv_id */
> +    PROP_ssa, /* properties_required */
> +    0, /* properties_provided */
> +    0, /* properties_destroyed */
> +    0, /* todo_flags_start */
> +    ( TODO_cleanup_cfg | TODO_update_ssa ), /* todo_flags_finish */ 
> +};
> +
> +class pass_path_split : public gimple_opt_pass {
> +   public:
> +    pass_path_split (gcc::context *ctxt)
> +      : gimple_opt_pass (pass_data_path_split, ctxt)
> +    {}
> +
> +   /* opt_pass methods: */
> +   opt_pass * clone () { return new pass_path_split (m_ctxt); }
> +   virtual bool gate (function *) { return flag_tree_path_split != 0; }
> +   virtual unsigned int execute (function *) { return 
> + execute_path_split (); }
> +
> +}; // class pass_path_split
> +
> +} // anon namespace
> +
> +gimple_opt_pass *
> +make_pass_path_split (gcc::context *ctxt) {
> +  return new pass_path_split (ctxt);
> +}
> --
> 1.8.2.1
>
> Thanks & Regards
> Ajit

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-07-02  3:52   ` Ajit Kumar Agarwal
@ 2015-07-06 20:08     ` Jeff Law
  0 siblings, 0 replies; 72+ messages in thread
From: Jeff Law @ 2015-07-06 20:08 UTC (permalink / raw)
  To: Ajit Kumar Agarwal, Joseph Myers
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

On 07/01/2015 09:51 PM, Ajit Kumar Agarwal wrote:
>
>
> -----Original Message-----
> From: Joseph Myers [mailto:joseph@codesourcery.com]
> Sent: Wednesday, July 01, 2015 3:48 AM
> To: Ajit Kumar Agarwal
> Cc: law@redhat.com; GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
> Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
>
> On Tue, 30 Jun 2015, Ajit Kumar Agarwal wrote:
>
>>          * gcc/common.opt: Add the new flag ftree-path-split.
>
>>> All options need documenting in invoke.texi.
>
> Sure.
>> +#include "tm.h"
>
>>> Why?  Does some other header depend on this, or are you using a target macro?
>
> I am not using any target macro. Many header files include "tm.h", and many tree-ssa optimization
> files also include "tm.h"; to list some of them: tree-ssa-threadupdate.c, tree-vrp.c, tree-ssa-threadedge.c.
>
But the question is do you actually need "tm.h" -- we're in the middle 
of a project to better separate the front ends from the gimple 
optimizers from the RTL optimizers & backends.

Including "tm.h" in places where it's not really needed makes achieving 
and maintaining the separation harder than it should be.

In general we should only be including the header files that are 
actually needed.

Jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-07-04 12:40   ` Ajit Kumar Agarwal
@ 2015-07-07  8:50     ` Richard Biener
  2015-07-07 13:22       ` Ajit Kumar Agarwal
  0 siblings, 1 reply; 72+ messages in thread
From: Richard Biener @ 2015-07-07  8:50 UTC (permalink / raw)
  To: Ajit Kumar Agarwal
  Cc: law, GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

On Sat, Jul 4, 2015 at 2:39 PM, Ajit Kumar Agarwal
<ajit.kumar.agarwal@xilinx.com> wrote:
>
>
> -----Original Message-----
> From: Richard Biener [mailto:richard.guenther@gmail.com]
> Sent: Tuesday, June 30, 2015 4:42 PM
> To: Ajit Kumar Agarwal
> Cc: law@redhat.com; GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
> Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
>
> On Tue, Jun 30, 2015 at 10:16 AM, Ajit Kumar Agarwal <ajit.kumar.agarwal@xilinx.com> wrote:
>> All:
>>
>> The below patch added a new path Splitting optimization pass on SSA
>> representation. The Path Splitting optimization Pass moves the join
>> block of if-then-else same as loop latch to its predecessors and get merged with the predecessors Preserving the SSA representation.
>>
>> The patch is tested for the Microblaze and i386 targets. The EEMBC/Mibench
>> benchmarks are run with the Microblaze target, and a performance gain
>> of 9.15% is seen for rgbcmy01_lite (EEMBC benchmarks). The DejaGnu tests are run for the Microblaze target; no regressions are seen, and the new testcases attached pass.
>>
>> For i386 bootstrapping goes through fine and the Spec cpu2000
>> benchmarks is run with this patch. Following observation were seen with spec cpu2000 benchmarks.
>>
>> Ratio of path splitting change vs Ratio of not having path splitting change is 3653.353 vs 3652.14 for INT benchmarks.
>> Ratio of path splitting change vs Ratio of not having path splitting change is  4353.812 vs 4345.351 for FP benchmarks.
>>
>> Based on comments from RFC patch following changes were done.
>>
>> 1. Added a new pass for path splitting changes.
>> 2. Placed the new path  Splitting Optimization pass before the copy propagation pass.
>> 3. The join block same as the Loop latch is wired into its
>> predecessors so that the CFG Cleanup pass will merge the blocks Wired together.
>> 4. Copy propagation routines added for path splitting changes is not
>> needed as suggested by Jeff. They are removed in the patch as The copy propagation in the copied join blocks will be done by the existing copy propagation pass and the update ssa pass.
>> 5. Only the propagation of phi results of the join block with the phi
>> argument is done which will not be done by the existing update_ssa Or copy propagation pass on tree ssa representation.
>> 6. Added 2 tests.
>>     a) compilation check  tests.
>>    b) execution tests.
>> 7. Refactoring of the code for the feasibility check and finding the join block same as loop latch node.
>>
>>     [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation.
>>
>>     Added a new pass on path splitting on tree SSA representation. The path
>>     splitting optimization does the CFG transformation of join block of the
>>     if-then-else same as the loop latch node is moved and merged with the
>>     predecessor blocks after preserving the SSA representation.
>>
>>     ChangeLog:
>>     2015-06-30  Ajit Agarwal  <ajitkum@xilinx.com>
>>
>>         * gcc/Makefile.in: Add the build of the new file
>>         tree-ssa-path-split.c
>>         * gcc/common.opt: Add the new flag ftree-path-split.
>>         * gcc/opts.c: Add an entry for Path splitting pass
>>         with optimization flag greater and equal to O2.
>>         * gcc/passes.def: Enable and add new pass path splitting.
>>         * gcc/timevar.def: Add the new entry for TV_TREE_PATH_SPLIT.
>>         * gcc/tree-pass.h: Extern Declaration of make_pass_path_split.
>>         * gcc/tree-ssa-path-split.c: New file for path splitting pass.
>>         * gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c: New testcase.
>>         * gcc/testsuite/gcc.dg/path-split-1.c: New testcase.
>
>>>>I'm not 100% sure I understand the transform, but from what I see in the testcases it tail-duplicates from a conditional up to a loop latch block (not sure if it includes it and thus ends up creating a loop nest or not).
>
>>>>An observation I have is that the pass should at least share the transform stage to some extent with the existing tracer pass (tracer.c), which essentially does the same but is not restricted to loops in any way.
>
> The following piece of code from tracer.c can be shared with the existing path splitting pass.
>
> {
>              e = find_edge (bb, bb2);
>
>               copy = duplicate_block (bb2, e, bb);
>               flush_pending_stmts (e);
>
>               add_phi_args_after_copy (&copy, 1, NULL);
> }
>
> Sharing the above code of the transform stage of tracer.c with the path splitting pass has the following limitation.
>
> 1. The duplicated loop latch node is wired to its predecessors, and the existing phi node in the loop latch node, with the
> phi arguments from its corresponding predecessors, is moved to the duplicated loop latch node that is wired into its predecessors. Due
> to this, the duplicated loop latch nodes wired into their predecessors will not be merged with the original predecessors by the CFG cleanup phase.
>
>>> So I wonder if your pass could be simply another heuristic to compute paths to trace in the existing tracer pass.
>
> Sorry, I am not very clear on what you mean by the above.  I am trying to figure out whether you expect the existing tracer.c pass to be modified
> or the path splitting pass to coexist with it.

Yes, I was wondering whether tracer.c could be simply modified.  Both
transforms are doing something very similar.

> My understanding is that the existing tracer pass finds traces based on frequency, using the profile data.
> Based on the traces, tail duplication is done in order to enable superblock regions. So I wonder whether the path splitting pass could be incorporated into the
> existing pass to compute the paths of the traces.

Yes, your pass would simply compute extra traces based on the new heuristic.

Richard.

> Thanks & Regards
> Ajit
>
> Thanks,
> Richard.
>
>>     Signed-off-by:Ajit Agarwal ajitkum@xilinx.com.
>>
>> gcc/Makefile.in                              |   1 +
>>  gcc/common.opt                               |   4 +
>>  gcc/opts.c                                   |   1 +
>>  gcc/passes.def                               |   1 +
>>  gcc/testsuite/gcc.dg/path-split-1.c          |  65 ++++
>>  gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c |  62 ++++
>>  gcc/timevar.def                              |   1 +
>>  gcc/tree-pass.h                              |   1 +
>>  gcc/tree-ssa-path-split.c                    | 462 +++++++++++++++++++++++++++
>>  9 files changed, 598 insertions(+)
>>  create mode 100644 gcc/testsuite/gcc.dg/path-split-1.c
>>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
>>  create mode 100644 gcc/tree-ssa-path-split.c
>>
>> diff --git a/gcc/Makefile.in b/gcc/Makefile.in index 5f9261f..35ac363
>> 100644
>> --- a/gcc/Makefile.in
>> +++ b/gcc/Makefile.in
>> @@ -1476,6 +1476,7 @@ OBJS = \
>>         tree-vect-slp.o \
>>         tree-vectorizer.o \
>>         tree-vrp.o \
>> +        tree-ssa-path-split.o \
>>         tree.o \
>>         valtrack.o \
>>         value-prof.o \
>> diff --git a/gcc/common.opt b/gcc/common.opt index e104269..c63b100
>> 100644
>> --- a/gcc/common.opt
>> +++ b/gcc/common.opt
>> @@ -2328,6 +2328,10 @@ ftree-vrp
>>  Common Report Var(flag_tree_vrp) Init(0) Optimization  Perform Value
>> Range Propagation on trees
>>
>> +ftree-path-split
>> +Common Report Var(flag_tree_path_split) Init(0) Optimization Perform
>> +Path Splitting
>> +
>>  funit-at-a-time
>>  Common Report Var(flag_unit_at_a_time) Init(1) Optimization  Compile
>> whole compilation unit at a time diff --git a/gcc/opts.c b/gcc/opts.c
>> index 8a16116..31947ff 100644
>> --- a/gcc/opts.c
>> +++ b/gcc/opts.c
>> @@ -508,6 +508,7 @@ static const struct default_options default_options_table[] =
>>      { OPT_LEVELS_2_PLUS, OPT_fisolate_erroneous_paths_dereference, NULL, 1 },
>>      { OPT_LEVELS_2_PLUS, OPT_fipa_ra, NULL, 1 },
>>      { OPT_LEVELS_2_PLUS, OPT_flra_remat, NULL, 1 },
>> +    { OPT_LEVELS_2_PLUS, OPT_ftree_path_split, NULL, 1 },
>>
>>      /* -O3 optimizations.  */
>>      { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1
>> }, diff --git a/gcc/passes.def b/gcc/passes.def index c0ddee4..43618eb
>> 100644
>> --- a/gcc/passes.def
>> +++ b/gcc/passes.def
>> @@ -155,6 +155,7 @@ along with GCC; see the file COPYING3.  If not see
>>        NEXT_PASS (pass_ccp);
>>        /* After CCP we rewrite no longer addressed locals into SSA
>>          form if possible.  */
>> +      NEXT_PASS (pass_path_split);
>>        NEXT_PASS (pass_copy_prop);
>>        NEXT_PASS (pass_complete_unrolli);
>>        NEXT_PASS (pass_phiprop);
>> diff --git a/gcc/testsuite/gcc.dg/path-split-1.c
>> b/gcc/testsuite/gcc.dg/path-split-1.c
>> new file mode 100644
>> index 0000000..075dc87
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/path-split-1.c
>> @@ -0,0 +1,65 @@
>> +/* { dg-do run } */
>> +/* { dg-options "-O2 " } */
>> +
>> +#include <stdio.h>
>> +#include <stdlib.h>
>> +
>> +#define RGBMAX 255
>> +
>> +int
>> +test()
>> +{
>> +  int i, Pels;
>> +  unsigned char sum = 0;
>> +  unsigned char xr, xg, xb;
>> +  unsigned char xc, xm, xy, xk;
>> +  unsigned char *ReadPtr, *EritePtr;
>> +
>> +  ReadPtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
>> + EritePtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
>> +
>> +  for (i = 0; i < 100;i++)
>> +     {
>> +       ReadPtr[i] = 100 - i;
>> +     }
>> +
>> +  for (i = 0; i < 100; i++)
>> +     {
>> +       xr = *ReadPtr++;
>> +       xg = *ReadPtr++;
>> +       xb = *ReadPtr++;
>> +
>> +       xc = (unsigned char) (RGBMAX - xr);
>> +       xm = (unsigned char) (RGBMAX - xg);
>> +       xy = (unsigned char) (RGBMAX - xb);
>> +
>> +       if (xc < xm)
>> +         {
>> +           xk = (unsigned char) (xc < xy ? xc : xy);
>> +         }
>> +       else
>> +        {
>> +          xk = (unsigned char) (xm < xy ? xm : xy);
>> +        }
>> +
>> +       xc = (unsigned char) (xc - xk);
>> +       xm = (unsigned char) (xm - xk);
>> +       xy = (unsigned char) (xy - xk);
>> +
>> +       *EritePtr++ = xc;
>> +       *EritePtr++ = xm;
>> +       *EritePtr++ = xy;
>> +       *EritePtr++ = xk;
>> +       sum += *EritePtr;
>> +    }
>> +  return sum;
>> +}
>> +
>> +int
>> +main()
>> +{
>> +  if (test() != 33)
>> +    abort();
>> +
>> +  return 0;
>> +}
>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
>> b/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
>> new file mode 100644
>> index 0000000..19f277c
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
>> @@ -0,0 +1,62 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2 -fdump-tree-path_split" } */
>> +
>> +#include <stdio.h>
>> +#include <stdlib.h>
>> +
>> +#define RGBMAX 255
>> +
>> +int
>> +test()
>> +{
>> +  int i, Pels;
>> +  unsigned char sum = 0;
>> +  unsigned char xr, xg, xb;
>> +  unsigned char xc, xm, xy, xk;
>> +  unsigned char *ReadPtr, *EritePtr;
>> +
>> +  ReadPtr = (unsigned char *) malloc (sizeof (unsigned char) * 100);
>> + EritePtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
>> +
>> +  for (i = 0; i < 100;i++)
>> +     {
>> +       ReadPtr[i] = 100 - i;
>> +     }
>> +
>> +  for (i = 0; i < 100; i++)
>> +     {
>> +       xr = *ReadPtr++;
>> +       xg = *ReadPtr++;
>> +       xb = *ReadPtr++;
>> +
>> +       xc = ( unsigned char) (RGBMAX - xr);
>> +       xm = ( unsigned char) (RGBMAX - xg);
>> +       xy = ( unsigned char) (RGBMAX - xb);
>> +
>> +       if (xc < xm)
>> +         {
>> +           xk = ( unsigned char) (xc < xy ? xc : xy);
>> +         }
>> +       else
>> +         {
>> +           xk = ( unsigned char) (xm < xy ? xm : xy);
>> +         }
>> +
>> +       xc = (unsigned char) (xc - xk);
>> +       xm = (unsigned char) (xm - xk);
>> +       xy = (unsigned char) (xy - xk);
>> +
>> +       *EritePtr++ = xc;
>> +       *EritePtr++ = xm;
>> +       *EritePtr++ = xy;
>> +       *EritePtr++ = xk;
>> +       sum += *EritePtr;
>> +    }
>> +  return sum;
>> +}
>> +
>> +/* { dg-final { scan-tree-dump "xc_[0-9][0-9]* -> { xc_[0-9][0-9]* }"
>> +"path_split"} } */
>> +/* { dg-final { scan-tree-dump "xm_[0-9][0-9]* -> { xm_[0-9][0-9]* }"
>> +"path_split"} } */
>> +/* { dg-final { scan-tree-dump "xy_[0-9][0-9]* -> { xy_[0-9][0-9]* }"
>> +"path_split"} } */
>> +/* { dg-final { scan-tree-dump "Merging blocks" "path_split"} } */
>> +/* { dg-final { cleanup-tree-dump "path_split" } } */
>> diff --git a/gcc/timevar.def b/gcc/timevar.def index 711bbed..6217a8e
>> 100644
>> --- a/gcc/timevar.def
>> +++ b/gcc/timevar.def
>> @@ -288,3 +288,4 @@ DEFTIMEVAR (TV_JIT_REPLAY        , "replay of JIT client activity")
>>  DEFTIMEVAR (TV_ASSEMBLE             , "assemble JIT code")
>>  DEFTIMEVAR (TV_LINK                 , "link JIT code")
>>  DEFTIMEVAR (TV_LOAD                 , "load JIT result")
>> +DEFTIMEVAR (TV_TREE_PATH_SPLIT  , "tree path_split")
>> diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index 398ab83..e00639e
>> 100644
>> --- a/gcc/tree-pass.h
>> +++ b/gcc/tree-pass.h
>> @@ -379,6 +379,7 @@ extern gimple_opt_pass *make_pass_iv_optimize
>> (gcc::context *ctxt);  extern gimple_opt_pass
>> *make_pass_tree_loop_done (gcc::context *ctxt);  extern
>> gimple_opt_pass *make_pass_ch (gcc::context *ctxt);  extern
>> gimple_opt_pass *make_pass_ccp (gcc::context *ctxt);
>> +extern gimple_opt_pass *make_pass_path_split (gcc::context *ctxt);
>>  extern gimple_opt_pass *make_pass_phi_only_cprop (gcc::context
>> *ctxt);  extern gimple_opt_pass *make_pass_build_ssa (gcc::context
>> *ctxt);  extern gimple_opt_pass *make_pass_build_alias (gcc::context
>> *ctxt); diff --git a/gcc/tree-ssa-path-split.c
>> b/gcc/tree-ssa-path-split.c new file mode 100644 index
>> 0000000..3da7791
>> --- /dev/null
>> +++ b/gcc/tree-ssa-path-split.c
>> @@ -0,0 +1,462 @@
>> +/* Support routines for Path Splitting.
>> +   Copyright (C) 2015 Free Software Foundation, Inc.
>> +   Contributed by Ajit Kumar Agarwal <ajitkum@xilinx.com>.
>> +
>> + This file is part of GCC.
>> +
>> + GCC is free software; you can redistribute it and/or modify it under
>> + the terms of the GNU General Public License as published by the Free
>> + Software Foundation; either version 3, or (at your option) any later
>> + version.
>> +
>> +GCC is distributed in the hope that it will be useful, but WITHOUT
>> +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
>> +FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
>> +for more details.
>> +
>> +You should have received a copy of the GNU General Public License
>> +along with GCC; see the file COPYING3.  If not see
>> +<http://www.gnu.org/licenses/>.  */
>> +
>> +#include "config.h"
>> +#include "system.h"
>> +#include "coretypes.h"
>> +#include "tm.h"
>> +#include "flags.h"
>> +#include "tree.h"
>> +#include "stor-layout.h"
>> +#include "calls.h"
>> +#include "predict.h"
>> +#include "vec.h"
>> +#include "hashtab.h"
>> +#include "hash-set.h"
>> +#include "machmode.h"
>> +#include "hard-reg-set.h"
>> +#include "input.h"
>> +#include "function.h"
>> +#include "dominance.h"
>> +#include "cfg.h"
>> +#include "cfganal.h"
>> +#include "basic-block.h"
>> +#include "tree-ssa-alias.h"
>> +#include "internal-fn.h"
>> +#include "gimple-fold.h"
>> +#include "tree-eh.h"
>> +#include "gimple-expr.h"
>> +#include "is-a.h"
>> +#include "gimple.h"
>> +#include "gimple-iterator.h"
>> +#include "gimple-walk.h"
>> +#include "gimple-ssa.h"
>> +#include "tree-cfg.h"
>> +#include "tree-phinodes.h"
>> +#include "ssa-iterators.h"
>> +#include "stringpool.h"
>> +#include "tree-ssanames.h"
>> +#include "tree-ssa-loop-manip.h"
>> +#include "tree-ssa-loop-niter.h"
>> +#include "tree-ssa-loop.h"
>> +#include "tree-into-ssa.h"
>> +#include "tree-ssa.h"
>> +#include "tree-pass.h"
>> +#include "tree-dump.h"
>> +#include "gimple-pretty-print.h"
>> +#include "diagnostic-core.h"
>> +#include "intl.h"
>> +#include "cfgloop.h"
>> +#include "tree-scalar-evolution.h"
>> +#include "tree-ssa-propagate.h"
>> +#include "tree-chrec.h"
>> +#include "tree-ssa-threadupdate.h"
>> +#include "expr.h"
>> +#include "insn-codes.h"
>> +#include "optabs.h"
>> +#include "tree-ssa-threadedge.h"
>> +#include "wide-int.h"
>> +
>> +/* Replace_uses_phi function propagates the phi results with the
>> +   first phi argument into each of the copied join blocks wired into
>> +   its predecessors. This function is called from the replace_uses_phi
>> +   to replace the uses of first phi arguments with the second
>> +   phi arguments in the next copy of join block.  */
>> +
>> +static void
>> +replace_use_phi_operand1_with_operand2 (basic_block b,
>> +                                        tree use1,
>> +                                        tree use2) {
>> +  use_operand_p use;
>> +  ssa_op_iter iter;
>> +  gimple_stmt_iterator gsi;
>> +
>> +  for (gsi = gsi_start_bb (b); !gsi_end_p (gsi);)
>> +     {
>> +       gimple stmt = gsi_stmt (gsi);
>> +       FOR_EACH_SSA_USE_OPERAND (use, stmt, iter, SSA_OP_USE)
>> +       {
>> +         tree tuse = USE_FROM_PTR (use);
>> +          if (use1 == tuse || use1 == NULL_TREE)
>> +            {
>> +              propagate_value (use, use2);
>> +              update_stmt(stmt);
>> +            }
>> +        }
>> +       gsi_next(&gsi);
>> +     }
>> +}
>> +
>> +/* This function propagates the phi result into the use points with
>> +   the phi arguments. The join block is copied and wired into the
>> +   predecessors. Since the use points of the phi results will be same
>> +   in the each of the copy join blocks in the  predecessors, it
>> +   propagates the phi arguments in the copy of the join blocks wired
>> +   into its predecessor.  */
>> +
>> +static
>> +void replace_uses_phi (basic_block b, basic_block temp_bb) {
>> +  gimple_seq phis = phi_nodes (b);
>> +  gimple phi = gimple_seq_first_stmt (phis);
>> +  tree def = gimple_phi_result (phi), use = gimple_phi_arg_def
>> +(phi,0);
>> +  tree use2 = gimple_phi_arg_def (phi,1);
>> +
>> +  if (virtual_operand_p (def))
>> +    {
>> +      imm_use_iterator iter;
>> +      use_operand_p use_p;
>> +      gimple stmt;
>> +
>> +      FOR_EACH_IMM_USE_STMT (stmt, iter, def)
>> +        FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
>> +          SET_USE (use_p, use);
>> +      if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (def))
>> +        SSA_NAME_OCCURS_IN_ABNORMAL_PHI (use) = 1;
>> +    }
>> +   else
>> +     replace_uses_by (def, use);
>> +   replace_use_phi_operand1_with_operand2 (temp_bb, use, use2); }
>> +
>> +/* Returns true if the block bb has label or call statements.
>> +   Otherwise return false.  */
>> +
>> +static bool
>> +is_block_has_label_call (basic_block bb) {
>> +  gimple_stmt_iterator gsi;
>> +
>> +  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
>> +     {
>> +       gimple stmt = gsi_stmt(gsi);
>> +       if (dyn_cast <glabel *> (stmt))
>> +         {
>> +           return true;
>> +         }
>> +       if (is_gimple_call (stmt))
>> +         return true;
>> +     }
>> +  return false;
>> +}
>> +
>> +/* This function performs the feasibility tests for path splitting.
>> +   Return true if path splitting is feasible for the join block and
>> +   false otherwise.  The following feasibility tests are
>> +   performed.
>> +
>> +   1. Return false if the join block has call gimple statements.
>> +   2. Return false if the join block has rhs casting for assign
>> +      gimple statements.
>> +   3. If the number of phis is greater than 1 or the phi node in
>> +      the join block has virtual operand return false.
>> +   4. Return false if the number of sequential statements is
>> +      greater than 2.
>> +   5. If the predecessors blocks has labels and call statements
>> +      return false.
>> +   6. If the phi result in the phi node of the join block is not
>> +      used inside the same join block return false.
>> +   7. Otherwise returns true.  */
>> +
>> +static bool
>> +is_feasible_path_splitting (basic_block join_node, basic_block pred1,
>> +                           basic_block pred2) {
>> +  int num_stmt = 0, num_phis = 0;
>> +  gimple_stmt_iterator psi, gsi;
>> +
>> +  for (gsi = gsi_start_bb (join_node); !gsi_end_p (gsi); gsi_next (&gsi))
>> +     {
>> +       gimple stmt = gsi_stmt(gsi);
>> +
>> +       if (gimple_assign_cast_p (stmt))
>> +         return false;
>> +
>> +       if (is_gimple_call (stmt))
>> +         return false;
>> +
>> +       if (!is_gimple_debug(stmt))
>> +         {
>> +           num_stmt++;
>> +         }
>> +     }
>> +
>> +   if (pred1 && pred2 && (num_stmt > 2))
>> +     {
>> +       bool found_virtual_result = false;
>> +
>> +       for (psi = gsi_start_phis (join_node); !gsi_end_p (psi); )
>> +          {
>> +            use_operand_p use_p;
>> +            imm_use_iterator iter;
>> +            gimple stmt = gsi_stmt(psi);
>> +
>> +            if (!virtual_operand_p (gimple_phi_result (stmt)))
>> +              num_phis++;
>> +            else
>> +              found_virtual_result = true;
>> +
>> +            FOR_EACH_IMM_USE_FAST (use_p, iter, gimple_phi_result (stmt))
>> +            {
>> +              gimple use_stmt = USE_STMT (use_p);
>> +
>> +              if (gimple_bb (use_stmt) != join_node)
>> +                return false;
>> +            }
>> +
>> +            gsi_next(&psi);
>> +         }
>> +
>> +       if ((num_phis >1) || found_virtual_result)
>> +          return false;
>> +
>> +       if(is_block_has_label_call(pred1) || is_block_has_label_call(pred2))
>> +         return false;
>> +
>> +       return true;
>> +    }
>> +  return false;
>> +}
>> +
>> +/* Set the containing basic block of every statement in
>> +   basic block B to B itself.  */
>> +
>> +static void
>> +update_stmt_bb(basic_block b)
>> +{
>> +  gimple_stmt_iterator gsi;
>> +  for(gsi = gsi_start_bb(b); !gsi_end_p(gsi); gsi_next(&gsi))
>> +   {
>> +     gimple stmt = gsi_stmt(gsi);
>> +     gimple_set_bb(stmt,b);
>> +   }
>> +}
>> +
>> +/* This function gets the join blocks same as the source
>> +   node of the loop latch nodes and the predecessors of
>> +   the join block is updated in the pred1 and pred2 passed
>> +   as the reference arguments into the function. Return
>> +   the join block.  */
>> +
>> +static basic_block
>> +get_join_blk_same_as_loop_latch (basic_block bb,
>> +                                 basic_block &pred1,
>> +                                 basic_block &pred2) {
>> +  vec<basic_block> bbs;
>> +  basic_block bb1;
>> +  unsigned int i;
>> +  edge_iterator ei;
>> +  edge e1;
>> +  bool found = false ,found1;
>> +  bbs = get_all_dominated_blocks (CDI_DOMINATORS,
>> +                                  bb );
>> +  FOR_EACH_VEC_ELT (bbs, i, bb1)
>> +  {
>> +    found1 = false;
>> +    FOR_EACH_EDGE (e1, ei, bb->succs)
>> +    {
>> +      if ( bb1 == e1->dest)
>> +        {
>> +          found = true;
>> +          found1 = true;
>> +        }
>> +    }
>> +    if (!found1 && found)
>> +      {
>> +        found = false;
>> +        FOR_EACH_EDGE (e1, ei, bb1->succs)
>> +        {
>> +          if (e1->flags & (EDGE_DFS_BACK))
>> +            found = true;
>> +        }
>> +
>> +        if (found && EDGE_COUNT(bb1->preds) == 2)
>> +          {
>> +            unsigned int k = 0;
>> +            FOR_EACH_EDGE (e1, ei, bb1->preds)
>> +            {
>> +              if ((e1->flags & (EDGE_DFS_BACK)))
>> +                continue;
>> +
>> +              if ( k == 1)
>> +                {
>> +                  if (single_succ_p(e1->src) &&
>> +                      single_succ_edge (e1->src)->flags & EDGE_FALLTHRU)
>> +                    {
>> +                      pred2 = e1->src;
>> +                    }
>> +                }
>> +                else
>> +                  {
>> +                    if (single_succ_p(e1->src) &&
>> +                        single_succ_edge (e1->src)->flags & EDGE_FALLTHRU)
>> +                      {
>> +                        pred1 = e1->src;
>> +                      }
>> +                  }
>> +                k++;
>> +            }
>> +            bbs.release();
>> +            return bb1;
>> +          }
>> +       }
>> +   }
>> +   bbs.release();
>> +   return NULL;
>> +}
>> +
>> +/* This is the core function to perform path splitting. The join
>> +   same as the source of the loop latch node is identified along
>> +   with their predecessors. Based on the feasibility tests for
>> +   path splitting the path splitting is performed by wiring the
>> +   copy of join blocks into the predecessors and propagating the phi
>> +   result with the corresponding phi arguments into each of the copy
>> +   of join blocks wired with the original predecessors of the join
>> +   block.
>> +
>> +   The  tree-cfg-cleanup will merge the blocks in the predecessors
>> +   path and the update-ssa will update the ssa representation after
>> +   the path splitting is performed.  */
>> +
>> +static void
>> +perform_path_splitting (basic_block bb) {
>> +  basic_block pred1 = NULL, pred2 = NULL, join_block = NULL;
>> +
>> +  join_block = get_join_blk_same_as_loop_latch (bb, pred1, pred2);
>> +
>> +  if (join_block  &&
>> +      is_feasible_path_splitting (join_block, pred1, pred2))
>> +    {
>> +      basic_block new_bb1 = NULL, new_bb2 = NULL;
>> +      gimple_stmt_iterator last;
>> +      basic_block temp_bb = NULL;
>> +      edge_iterator ei;
>> +      edge e1;
>> +
>> +      temp_bb = duplicate_block (join_block, NULL, NULL);
>> +
>> +      FOR_EACH_EDGE (e1, ei, pred1->succs)
>> +        new_bb1 = split_edge (e1);
>> +
>> +      FOR_EACH_EDGE (e1, ei, pred2->succs)
>> +        new_bb2 = split_edge (e1);
>> +
>> +      last = gsi_start_bb (new_bb1);
>> +      gsi_insert_seq_after (&last, bb_seq (join_block), GSI_NEW_STMT);
>> +      last = gsi_start_bb (new_bb2);
>> +      gsi_insert_seq_after (&last, bb_seq (temp_bb), GSI_NEW_STMT);
>> +      update_stmt_bb (new_bb1);
>> +      update_stmt_bb (new_bb2);
>> +
>> +      replace_uses_phi (join_block, new_bb2);
>> +
>> +      set_bb_seq (join_block, NULL);
>> +      set_bb_seq(temp_bb,NULL);
>> +      delete_basic_block (temp_bb);
>> +      return;
>> +    }
>> +}
>> +
>> +static unsigned int
>> +execute_path_split (void)
>> +{
>> +  basic_block bb;
>> +
>> +  loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS);
>> + initialize_original_copy_tables();
>> +
>> +  calculate_dominance_info (CDI_DOMINATORS);
>> + calculate_dominance_info (CDI_POST_DOMINATORS);
>> +
>> +  mark_dfs_back_edges ();
>> +
>> +  FOR_EACH_BB_FN (bb, cfun)
>> +  {
>> +    gimple last;
>> +
>> +    /* We only care about blocks ending in a COND_EXPR. */
>> +
>> +    last = gsi_stmt (gsi_last_bb (bb));
>> +
>> +    /* We're basically looking for a switch or any kind of conditional with
>> +       integral or pointer type arguments.  Note the type of the second
>> +       argument will be the same as the first argument, so no need to
>> +       check it explicitly.  */
>> +    if ((last && (gimple_code (last) == GIMPLE_COND
>> +            && TREE_CODE (gimple_cond_lhs (last)) == SSA_NAME
>> +            && (INTEGRAL_TYPE_P (TREE_TYPE (gimple_cond_lhs (last)))
>> +            || POINTER_TYPE_P (TREE_TYPE (gimple_cond_lhs (last))))
>> +            && (TREE_CODE (gimple_cond_rhs (last)) == SSA_NAME
>> +            || is_gimple_min_invariant (gimple_cond_rhs (last))))))
>> +      {
>> +
>> +         if (gimple_code(last) == GIMPLE_COND)
>> +           {
>> +              perform_path_splitting (bb);
>> +           }
>> +      }
>> +   }
>> +
>> +   loop_optimizer_finalize ();
>> +   free_original_copy_tables ();
>> +   free_dominance_info (CDI_DOMINATORS);
>> +   free_dominance_info (CDI_POST_DOMINATORS);
>> +   return 0;
>> +}
>> +
>> +namespace {
>> +
>> +const pass_data pass_data_path_split = {
>> +   GIMPLE_PASS, /* type */
>> +   "path_split", /* name */
>> +    OPTGROUP_NONE, /* optinfo_flags */
>> +    TV_TREE_PATH_SPLIT, /* tv_id */
>> +    PROP_ssa, /* properties_required */
>> +    0, /* properties_provided */
>> +    0, /* properties_destroyed */
>> +    0, /* todo_flags_start */
>> +    ( TODO_cleanup_cfg | TODO_update_ssa ), /* todo_flags_finish */
>> +};
>> +
>> +class pass_path_split : public gimple_opt_pass {
>> +   public:
>> +    pass_path_split (gcc::context *ctxt)
>> +      : gimple_opt_pass (pass_data_path_split, ctxt)
>> +    {}
>> +
>> +   /* opt_pass methods: */
>> +   opt_pass * clone () { return new pass_path_split (m_ctxt); }
>> +   virtual bool gate (function *) { return flag_tree_path_split != 0; }
>> +   virtual unsigned int execute (function *) { return
>> + execute_path_split (); }
>> +
>> +}; // class pass_path_split
>> +
>> +} // anon namespace
>> +
>> +gimple_opt_pass *
>> +make_pass_path_split (gcc::context *ctxt) {
>> +  return new pass_path_split (ctxt);
>> +}
>> --
>> 1.8.2.1
>>
>> Thanks & Regards
>> Ajit

^ permalink raw reply	[flat|nested] 72+ messages in thread

* RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-07-07  8:50     ` Richard Biener
@ 2015-07-07 13:22       ` Ajit Kumar Agarwal
  2015-07-16 11:20         ` Richard Biener
  0 siblings, 1 reply; 72+ messages in thread
From: Ajit Kumar Agarwal @ 2015-07-07 13:22 UTC (permalink / raw)
  To: Richard Biener
  Cc: law, GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala



-----Original Message-----
From: Richard Biener [mailto:richard.guenther@gmail.com] 
Sent: Tuesday, July 07, 2015 2:21 PM
To: Ajit Kumar Agarwal
Cc: law@redhat.com; GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation

On Sat, Jul 4, 2015 at 2:39 PM, Ajit Kumar Agarwal <ajit.kumar.agarwal@xilinx.com> wrote:
>
>
> -----Original Message-----
> From: Richard Biener [mailto:richard.guenther@gmail.com]
> Sent: Tuesday, June 30, 2015 4:42 PM
> To: Ajit Kumar Agarwal
> Cc: law@redhat.com; GCC Patches; Vinod Kathail; Shail Aditya Gupta; 
> Vidhumouli Hunsigida; Nagaraju Mekala
> Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on 
> tree ssa representation
>
> On Tue, Jun 30, 2015 at 10:16 AM, Ajit Kumar Agarwal <ajit.kumar.agarwal@xilinx.com> wrote:
>> All:
>>
>> The below patch added a new path Splitting optimization pass on SSA 
>> representation. The Path Splitting optimization Pass moves the join 
>> block of if-then-else same as loop latch to its predecessors and get merged with the predecessors Preserving the SSA representation.
>>
>> The patch is tested for Microblaze and i386 target. The EEMBC/Mibench 
>> benchmarks is run with the Microblaze target And the performance gain 
>> of 9.15% and rgbcmy01_lite(EEMBC benchmarks). The Deja GNU tests is run for Mircroblaze Target and no regression is seen for Microblaze target and the new testcase attached are passed.
>>
>> For i386 bootstrapping goes through fine and the Spec cpu2000 
>> benchmarks is run with this patch. Following observation were seen with spec cpu2000 benchmarks.
>>
>> Ratio of path splitting change vs Ratio of not having path splitting change is 3653.353 vs 3652.14 for INT benchmarks.
>> Ratio of path splitting change vs Ratio of not having path splitting change is  4353.812 vs 4345.351 for FP benchmarks.
>>
>> Based on comments from RFC patch following changes were done.
>>
>> 1. Added a new pass for path splitting changes.
>> 2. Placed the new path  Splitting Optimization pass before the copy propagation pass.
>> 3. The join block same as the Loop latch is wired into its 
>> predecessors so that the CFG Cleanup pass will merge the blocks Wired together.
>> 4. Copy propagation routines added for path splitting changes is not 
>> needed as suggested by Jeff. They are removed in the patch as The copy propagation in the copied join blocks will be done by the existing copy propagation pass and the update ssa pass.
>> 5. Only the propagation of phi results of the join block with the phi 
>> argument is done which will not be done by the existing update_ssa Or copy propagation pass on tree ssa representation.
>> 6. Added 2 tests.
>>     a) compilation check  tests.
>>    b) execution tests.
>> 7. Refactoring of the code for the feasibility check and finding the join block same as loop latch node.
>>
>>     [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation.
>>
>>     Added a new pass on path splitting on tree SSA representation. The path
>>     splitting optimization does the CFG transformation of join block of the
>>     if-then-else same as the loop latch node is moved and merged with the
>>     predecessor blocks after preserving the SSA representation.
>>
>>     ChangeLog:
>>     2015-06-30  Ajit Agarwal  <ajitkum@xilinx.com>
>>
>>         * gcc/Makefile.in: Add the build of the new file
>>         tree-ssa-path-split.c
>>         * gcc/common.opt: Add the new flag ftree-path-split.
>>         * gcc/opts.c: Add an entry for Path splitting pass
>>         with optimization flag greater and equal to O2.
>>         * gcc/passes.def: Enable and add new pass path splitting.
>>         * gcc/timevar.def: Add the new entry for TV_TREE_PATH_SPLIT.
>>         * gcc/tree-pass.h: Extern Declaration of make_pass_path_split.
>>         * gcc/tree-ssa-path-split.c: New file for path splitting pass.
>>         * gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c: New testcase.
>>         * gcc/testsuite/gcc.dg/path-split-1.c: New testcase.
>
>>>I'm not 100% sure I understand the transform but what I see from the testcases it tail-duplicates from a conditional up to a loop latch block (not sure if it >>includes it and thus ends up creating a loop nest or not).
>
>>>An observation I have is that the pass should at least share the transform stage to some extent with the existing tracer pass (tracer.c) which essentially does >>the same but not restricted to loops in any way.
>
> The following piece of code from tracer.c can be shared with the existing path splitting pass.
>
> {
>              e = find_edge (bb, bb2);
>
>               copy = duplicate_block (bb2, e, bb);
>               flush_pending_stmts (e);
>
>               add_phi_args_after_copy (&copy, 1, NULL); }
>
> Sharing the above code of the transform stage of tracer.c with the path splitting pass has the following limitation.
>
> 1. The duplicated loop latch node is wired to its predecessors and the 
> existing phi node in the loop latch node with the Phi arguments from 
> its corresponding predecessors is moved to the duplicated loop latch node that is wired into its predecessors. Due To this, the duplicated loop latch nodes wired into its predecessors will not be merged with the original predecessors by CFG cleanup phase .
>
>>> So I wonder if your pass could be simply another heuristic to compute paths to trace in the existing tracer pass.
>
> Sorry, I am not very clear when you say the above.  I am trying to 
> figure out whether you expect the existing pass of tracer.c should be modified Or the path splitting pass should coexist.

>>Yes, I was wondering whether tracer.c could be simply modified.  Both transforms are doing something very similar.
>>Yes, your pass would simply compute extra traces based on the new heuristic.

I have observed the following with the tracer pass optimization.

Tracer Pass:

1. The tracer pass is an FDO optimization and is not enabled by default at O2. It is enabled with -fprofile-use.
2. The -ftracer flag is used to enable the optimization explicitly in the absence of FDO optimization.
3. The tracer pass is placed in only one place, before the dominator pass. Moving the tracer pass before the copy_prop
pass gives the following error: "Error : pass tracer does not support cloning".
4. The code for tracer pass is totally based on the FDO related information and the logic is based on the profile data.

Path Splitting pass:

1. Path splitting, implemented as a separate pass, is enabled by default at >= O2.
2. No FDO information is required in the path splitting pass.
3. The path splitting pass can be placed anywhere, well before any other optimization pass. I have placed the path splitting pass before
copy_prop and it works. Placing it before the dominator pass also works fine.
4. The code for path splitting as a separate pass is based purely on non-profile, non-FDO data.
5. As a separate pass, path splitting can be placed anywhere in the optimization pipeline. The optimizations that benefit from the
path splitting pass are PRE, CCP and DCE, and the pass can be placed well before them.
6. In the first phase of the path splitting pass I am duplicating the loop latch node into its predecessors while keeping the SSA form and the loop structure
preserved. With path splitting as a separate pass, I would like to extend it to handle multiple latch nodes by adding a forwarder block:
the multiple latch nodes get edges to the forwarder block, so that there is a single loop latch edge.

With the above observations, do you think the existing tracer pass should be modified and the path splitting pass incorporated
on top of the existing tracer pass?

Kindly give your feedback.

Thanks & Regards
Ajit

Richard.

> Thanks & Regards
> Ajit
>
> Thanks,
> Richard.
>
>>     Signed-off-by:Ajit Agarwal ajitkum@xilinx.com.
>>
>> gcc/Makefile.in                              |   1 +
>>  gcc/common.opt                               |   4 +
>>  gcc/opts.c                                   |   1 +
>>  gcc/passes.def                               |   1 +
>>  gcc/testsuite/gcc.dg/path-split-1.c          |  65 ++++
>>  gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c |  62 ++++
>>  gcc/timevar.def                              |   1 +
>>  gcc/tree-pass.h                              |   1 +
>>  gcc/tree-ssa-path-split.c                    | 462 +++++++++++++++++++++++++++
>>  9 files changed, 598 insertions(+)
>>  create mode 100644 gcc/testsuite/gcc.dg/path-split-1.c
>>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
>>  create mode 100644 gcc/tree-ssa-path-split.c
>>
>> diff --git a/gcc/Makefile.in b/gcc/Makefile.in index 5f9261f..35ac363
>> 100644
>> --- a/gcc/Makefile.in
>> +++ b/gcc/Makefile.in
>> @@ -1476,6 +1476,7 @@ OBJS = \
>>         tree-vect-slp.o \
>>         tree-vectorizer.o \
>>         tree-vrp.o \
>> +        tree-ssa-path-split.o \
>>         tree.o \
>>         valtrack.o \
>>         value-prof.o \
>> diff --git a/gcc/common.opt b/gcc/common.opt index e104269..c63b100
>> 100644
>> --- a/gcc/common.opt
>> +++ b/gcc/common.opt
>> @@ -2328,6 +2328,10 @@ ftree-vrp
>>  Common Report Var(flag_tree_vrp) Init(0) Optimization  Perform Value 
>> Range Propagation on trees
>>
>> +ftree-path-split
>> +Common Report Var(flag_tree_path_split) Init(0) Optimization Perform 
>> +Path Splitting
>> +
>>  funit-at-a-time
>>  Common Report Var(flag_unit_at_a_time) Init(1) Optimization  Compile 
>> whole compilation unit at a time diff --git a/gcc/opts.c b/gcc/opts.c 
>> index 8a16116..31947ff 100644
>> --- a/gcc/opts.c
>> +++ b/gcc/opts.c
>> @@ -508,6 +508,7 @@ static const struct default_options default_options_table[] =
>>      { OPT_LEVELS_2_PLUS, OPT_fisolate_erroneous_paths_dereference, NULL, 1 },
>>      { OPT_LEVELS_2_PLUS, OPT_fipa_ra, NULL, 1 },
>>      { OPT_LEVELS_2_PLUS, OPT_flra_remat, NULL, 1 },
>> +    { OPT_LEVELS_2_PLUS, OPT_ftree_path_split, NULL, 1 },
>>
>>      /* -O3 optimizations.  */
>>      { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 
>> }, diff --git a/gcc/passes.def b/gcc/passes.def index 
>> c0ddee4..43618eb
>> 100644
>> --- a/gcc/passes.def
>> +++ b/gcc/passes.def
>> @@ -155,6 +155,7 @@ along with GCC; see the file COPYING3.  If not see
>>        NEXT_PASS (pass_ccp);
>>        /* After CCP we rewrite no longer addressed locals into SSA
>>          form if possible.  */
>> +      NEXT_PASS (pass_path_split);
>>        NEXT_PASS (pass_copy_prop);
>>        NEXT_PASS (pass_complete_unrolli);
>>        NEXT_PASS (pass_phiprop);
>> diff --git a/gcc/testsuite/gcc.dg/path-split-1.c
>> b/gcc/testsuite/gcc.dg/path-split-1.c
>> new file mode 100644
>> index 0000000..075dc87
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/path-split-1.c
>> @@ -0,0 +1,65 @@
>> +/* { dg-do run } */
>> +/* { dg-options "-O2 " } */
>> +
>> +#include <stdio.h>
>> +#include <stdlib.h>
>> +
>> +#define RGBMAX 255
>> +
>> +int
>> +test()
>> +{
>> +  int i, Pels;
>> +  unsigned char sum = 0;
>> +  unsigned char xr, xg, xb;
>> +  unsigned char xc, xm, xy, xk;
>> +  unsigned char *ReadPtr, *EritePtr;
>> +
>> +  ReadPtr = ( unsigned char *) malloc (sizeof (unsigned char) * 
>> + 100); EritePtr = ( unsigned char *) malloc (sizeof (unsigned char) 
>> + * 100);
>> +
>> +  for (i = 0; i < 100;i++)
>> +     {
>> +       ReadPtr[i] = 100 - i;
>> +     }
>> +
>> +  for (i = 0; i < 100; i++)
>> +     {
>> +       xr = *ReadPtr++;
>> +       xg = *ReadPtr++;
>> +       xb = *ReadPtr++;
>> +
>> +       xc = (unsigned char) (RGBMAX - xr);
>> +       xm = (unsigned char) (RGBMAX - xg);
>> +       xy = (unsigned char) (RGBMAX - xb);
>> +
>> +       if (xc < xm)
>> +         {
>> +           xk = (unsigned char) (xc < xy ? xc : xy);
>> +         }
>> +       else
>> +        {
>> +          xk = (unsigned char) (xm < xy ? xm : xy);
>> +        }
>> +
>> +       xc = (unsigned char) (xc - xk);
>> +       xm = (unsigned char) (xm - xk);
>> +       xy = (unsigned char) (xy - xk);
>> +
>> +       *EritePtr++ = xc;
>> +       *EritePtr++ = xm;
>> +       *EritePtr++ = xy;
>> +       *EritePtr++ = xk;
>> +       sum += *EritePtr;
>> +    }
>> +  return sum;
>> +}
>> +
>> +int
>> +main()
>> +{
>> +  if (test() != 33)
>> +    abort();
>> +
>> +  return 0;
>> +}
>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
>> b/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
>> new file mode 100644
>> index 0000000..19f277c
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
>> @@ -0,0 +1,62 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2 -fdump-tree-path_split" } */
>> +
>> +#include <stdio.h>
>> +#include <stdlib.h>
>> +
>> +#define RGBMAX 255
>> +
>> +int
>> +test()
>> +{
>> +  int i, Pels;
>> +  unsigned char sum = 0;
>> +  unsigned char xr, xg, xb;
>> +  unsigned char xc, xm, xy, xk;
>> +  unsigned char *ReadPtr, *EritePtr;
>> +
>> +  ReadPtr = (unsigned char *) malloc (sizeof (unsigned char) * 100); 
>> + EritePtr = ( unsigned char *) malloc (sizeof (unsigned char) * 
>> + 100);
>> +
>> +  for (i = 0; i < 100;i++)
>> +     {
>> +       ReadPtr[i] = 100 - i;
>> +     }
>> +
>> +  for (i = 0; i < 100; i++)
>> +     {
>> +       xr = *ReadPtr++;
>> +       xg = *ReadPtr++;
>> +       xb = *ReadPtr++;
>> +
>> +       xc = ( unsigned char) (RGBMAX - xr);
>> +       xm = ( unsigned char) (RGBMAX - xg);
>> +       xy = ( unsigned char) (RGBMAX - xb);
>> +
>> +       if (xc < xm)
>> +         {
>> +           xk = ( unsigned char) (xc < xy ? xc : xy);
>> +         }
>> +       else
>> +         {
>> +           xk = ( unsigned char) (xm < xy ? xm : xy);
>> +         }
>> +
>> +       xc = (unsigned char) (xc - xk);
>> +       xm = (unsigned char) (xm - xk);
>> +       xy = (unsigned char) (xy - xk);
>> +
>> +       *EritePtr++ = xc;
>> +       *EritePtr++ = xm;
>> +       *EritePtr++ = xy;
>> +       *EritePtr++ = xk;
>> +       sum += *EritePtr;
>> +    }
>> +  return sum;
>> +}
>> +
>> +/* { dg-final { scan-tree-dump "xc_[0-9][0-9]* -> { xc_[0-9][0-9]* }"
>> +"path_split"} } */
>> +/* { dg-final { scan-tree-dump "xm_[0-9][0-9]* -> { xm_[0-9][0-9]* }"
>> +"path_split"} } */
>> +/* { dg-final { scan-tree-dump "xy_[0-9][0-9]* -> { xy_[0-9][0-9]* }"
>> +"path_split"} } */
>> +/* { dg-final { scan-tree-dump "Merging blocks" "path_split"} } */
>> +/* { dg-final { cleanup-tree-dump "path_split" } } */
>> diff --git a/gcc/timevar.def b/gcc/timevar.def index 711bbed..6217a8e
>> 100644
>> --- a/gcc/timevar.def
>> +++ b/gcc/timevar.def
>> @@ -288,3 +288,4 @@ DEFTIMEVAR (TV_JIT_REPLAY        , "replay of JIT client activity")
>>  DEFTIMEVAR (TV_ASSEMBLE             , "assemble JIT code")
>>  DEFTIMEVAR (TV_LINK                 , "link JIT code")
>>  DEFTIMEVAR (TV_LOAD                 , "load JIT result")
>> +DEFTIMEVAR (TV_TREE_PATH_SPLIT  , "tree path_split")
>> diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index 398ab83..e00639e
>> 100644
>> --- a/gcc/tree-pass.h
>> +++ b/gcc/tree-pass.h
>> @@ -379,6 +379,7 @@ extern gimple_opt_pass *make_pass_iv_optimize 
>> (gcc::context *ctxt);  extern gimple_opt_pass 
>> *make_pass_tree_loop_done (gcc::context *ctxt);  extern 
>> gimple_opt_pass *make_pass_ch (gcc::context *ctxt);  extern 
>> gimple_opt_pass *make_pass_ccp (gcc::context *ctxt);
>> +extern gimple_opt_pass *make_pass_path_split (gcc::context *ctxt);
>>  extern gimple_opt_pass *make_pass_phi_only_cprop (gcc::context 
>> *ctxt);  extern gimple_opt_pass *make_pass_build_ssa (gcc::context 
>> *ctxt);  extern gimple_opt_pass *make_pass_build_alias (gcc::context 
>> *ctxt); diff --git a/gcc/tree-ssa-path-split.c 
>> b/gcc/tree-ssa-path-split.c new file mode 100644 index
>> 0000000..3da7791
>> --- /dev/null
>> +++ b/gcc/tree-ssa-path-split.c
>> @@ -0,0 +1,462 @@
>> +/* Support routines for Path Splitting.
>> +   Copyright (C) 2015 Free Software Foundation, Inc.
>> +   Contributed by Ajit Kumar Agarwal <ajitkum@xilinx.com>.
>> +
>> + This file is part of GCC.
>> +
>> + GCC is free software; you can redistribute it and/or modify it 
>> + under the terms of the GNU General Public License as published by 
>> + the Free Software Foundation; either version 3, or (at your option) 
>> + any later version.
>> +
>> +GCC is distributed in the hope that it will be useful, but WITHOUT 
>> +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 
>> +or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public 
>> +License for more details.
>> +
>> +You should have received a copy of the GNU General Public License 
>> +along with GCC; see the file COPYING3.  If not see 
>> +<http://www.gnu.org/licenses/>.  */
>> +
>> +#include "config.h"
>> +#include "system.h"
>> +#include "coretypes.h"
>> +#include "tm.h"
>> +#include "flags.h"
>> +#include "tree.h"
>> +#include "stor-layout.h"
>> +#include "calls.h"
>> +#include "predict.h"
>> +#include "vec.h"
>> +#include "hashtab.h"
>> +#include "hash-set.h"
>> +#include "machmode.h"
>> +#include "hard-reg-set.h"
>> +#include "input.h"
>> +#include "function.h"
>> +#include "dominance.h"
>> +#include "cfg.h"
>> +#include "cfganal.h"
>> +#include "basic-block.h"
>> +#include "tree-ssa-alias.h"
>> +#include "internal-fn.h"
>> +#include "gimple-fold.h"
>> +#include "tree-eh.h"
>> +#include "gimple-expr.h"
>> +#include "is-a.h"
>> +#include "gimple.h"
>> +#include "gimple-iterator.h"
>> +#include "gimple-walk.h"
>> +#include "gimple-ssa.h"
>> +#include "tree-cfg.h"
>> +#include "tree-phinodes.h"
>> +#include "ssa-iterators.h"
>> +#include "stringpool.h"
>> +#include "tree-ssanames.h"
>> +#include "tree-ssa-loop-manip.h"
>> +#include "tree-ssa-loop-niter.h"
>> +#include "tree-ssa-loop.h"
>> +#include "tree-into-ssa.h"
>> +#include "tree-ssa.h"
>> +#include "tree-pass.h"
>> +#include "tree-dump.h"
>> +#include "gimple-pretty-print.h"
>> +#include "diagnostic-core.h"
>> +#include "intl.h"
>> +#include "cfgloop.h"
>> +#include "tree-scalar-evolution.h"
>> +#include "tree-ssa-propagate.h"
>> +#include "tree-chrec.h"
>> +#include "tree-ssa-threadupdate.h"
>> +#include "expr.h"
>> +#include "insn-codes.h"
>> +#include "optabs.h"
>> +#include "tree-ssa-threadedge.h"
>> +#include "wide-int.h"
>> +
>> +/* Replace_uses_phi function propagates the phi results with the
>> +   first phi argument into each of the copied join blocks wired into
>> +   its predecessors. This function is called from the replace_uses_phi
>> +   to replace the uses of first phi arguments with the second
>> +   phi arguments in the next copy of join block.  */
>> +
>> +static void
>> +replace_use_phi_operand1_with_operand2 (basic_block b,
>> +                                        tree use1,
>> +                                        tree use2) {
>> +  use_operand_p use;
>> +  ssa_op_iter iter;
>> +  gimple_stmt_iterator gsi;
>> +
>> +  for (gsi = gsi_start_bb (b); !gsi_end_p (gsi);)
>> +     {
>> +       gimple stmt = gsi_stmt (gsi);
>> +       FOR_EACH_SSA_USE_OPERAND (use, stmt, iter, SSA_OP_USE)
>> +       {
>> +         tree tuse = USE_FROM_PTR (use);
>> +          if (use1 == tuse || use1 == NULL_TREE)
>> +            {
>> +              propagate_value (use, use2);
>> +              update_stmt(stmt);
>> +            }
>> +        }
>> +       gsi_next(&gsi);
>> +     }
>> +}
>> +
>> +/* This function propagates the phi result into the use points with
>> +   the phi arguments. The join block is copied and wired into the
>> +   predecessors. Since the use points of the phi results will be same
>> +   in the each of the copy join blocks in the  predecessors, it
>> +   propagates the phi arguments in the copy of the join blocks wired
>> +   into its predecessor.  */
>> +
>> +static
>> +void replace_uses_phi (basic_block b, basic_block temp_bb) {
>> +  gimple_seq phis = phi_nodes (b);
>> +  gimple phi = gimple_seq_first_stmt (phis);
>> +  tree def = gimple_phi_result (phi), use = gimple_phi_arg_def 
>> +(phi,0);
>> +  tree use2 = gimple_phi_arg_def (phi,1);
>> +
>> +  if (virtual_operand_p (def))
>> +    {
>> +      imm_use_iterator iter;
>> +      use_operand_p use_p;
>> +      gimple stmt;
>> +
>> +      FOR_EACH_IMM_USE_STMT (stmt, iter, def)
>> +        FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
>> +          SET_USE (use_p, use);
>> +      if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (def))
>> +        SSA_NAME_OCCURS_IN_ABNORMAL_PHI (use) = 1;
>> +    }
>> +   else
>> +     replace_uses_by (def, use);
>> +   replace_use_phi_operand1_with_operand2 (temp_bb, use, use2); }
>> +
>> +/* Returns true if the block bb has label or call statements.
>> +   Otherwise return false.  */
>> +
>> +static bool
>> +is_block_has_label_call (basic_block bb) {
>> +  gimple_stmt_iterator gsi;
>> +
>> +  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
>> +     {
>> +       gimple stmt = gsi_stmt(gsi);
>> +       if (dyn_cast <glabel *> (stmt))
>> +         {
>> +           return true;
>> +         }
>> +       if (is_gimple_call (stmt))
>> +         return true;
>> +     }
>> +  return false;
>> +}
>> +
>> +/* This function performs the feasibility tests for path splitting
>> +   to perform. Return false if the feasibility for path splitting
>> +   is not done and returns true if the feasbility for path splitting
>> +   is done. Following feasibility tests are performed.
>> +
>> +   1. Return false if the join block has call gimple statements.
>> +   2. Return false if the join block has rhs casting for assign
>> +      gimple statements.
>> +   3. If the number of phis is greater than 1 or the phi node in
>> +      the join block has virtual operand return false.
>> +   4. Return false if the number of sequential statements is
>> +      greater than 2.
>> +   5. If the predecessors blocks has labels and call statements
>> +      return false.
>> +   6. If the phi result in the phi node of the join block is not
>> +      used inside the same join block return false.
>> +   7. Otherwise returns true.  */
>> +
>> +static bool
>> +is_feasible_path_splitting (basic_block join_node, basic_block pred1,
>> +                           basic_block pred2) {
>> +  int num_stmt = 0, num_phis = 0;
>> +  gimple_stmt_iterator psi, gsi;
>> +
>> +  for (gsi = gsi_start_bb (join_node); !gsi_end_p (gsi); gsi_next (&gsi))
>> +     {
>> +       gimple stmt = gsi_stmt(gsi);
>> +
>> +       if (gimple_assign_cast_p (stmt))
>> +         return false;
>> +
>> +       if (is_gimple_call (stmt))
>> +         return false;
>> +
>> +       if (!is_gimple_debug(stmt))
>> +         {
>> +           num_stmt++;
>> +         }
>> +     }
>> +
>> +   if (pred1 && pred2 && (num_stmt > 2))
>> +     {
>> +       bool found_virtual_result = false;
>> +
>> +       for (psi = gsi_start_phis (join_node); !gsi_end_p (psi); )
>> +          {
>> +            use_operand_p use_p;
>> +            imm_use_iterator iter;
>> +            gimple stmt = gsi_stmt(psi);
>> +
>> +            if (!virtual_operand_p (gimple_phi_result (stmt)))
>> +              num_phis++;
>> +            else
>> +              found_virtual_result = true;
>> +
>> +            FOR_EACH_IMM_USE_FAST (use_p, iter, gimple_phi_result (stmt))
>> +            {
>> +              gimple use_stmt = USE_STMT (use_p);
>> +
>> +              if (gimple_bb (use_stmt) != join_node)
>> +                return false;
>> +            }
>> +
>> +            gsi_next(&psi);
>> +         }
>> +
>> +       if ((num_phis >1) || found_virtual_result)
>> +          return false;
>> +
>> +       if(is_block_has_label_call(pred1) || is_block_has_label_call(pred2))
>> +         return false;
>> +
>> +       return true;
>> +    }
>> +  return false;
>> +}
>> +
>> +/* Update the statements in the basic block with the basic
>> +   basic block.  */
>> +
>> +static void
>> +update_stmt_bb(basic_block b)
>> +{
>> +  gimple_stmt_iterator gsi;
>> +  for(gsi = gsi_start_bb(b); !gsi_end_p(gsi); gsi_next(&gsi))
>> +   {
>> +     gimple stmt = gsi_stmt(gsi);
>> +     gimple_set_bb(stmt,b);
>> +   }
>> +}
>> +
>> +/* This function gets the join blocks same as the source
>> +   node of the loop latch nodes and the predecessors of
>> +   the join block is updated in the pred1 and pred2 passed
>> +   as the reference arguments into the function. Return
>> +   the join block.  */
>> +
>> +static basic_block
>> +get_join_blk_same_as_loop_latch (basic_block bb,
>> +                                 basic_block &pred1,
>> +                                 basic_block &pred2) {
>> +  vec<basic_block> bbs;
>> +  basic_block bb1;
>> +  unsigned int i;
>> +  edge_iterator ei;
>> +  edge e1;
>> +  bool found = false ,found1;
>> +  bbs = get_all_dominated_blocks (CDI_DOMINATORS,
>> +                                  bb );
>> +  FOR_EACH_VEC_ELT (bbs, i, bb1)
>> +  {
>> +    found1 = false;
>> +    FOR_EACH_EDGE (e1, ei, bb->succs)
>> +    {
>> +      if ( bb1 == e1->dest)
>> +        {
>> +          found = true;
>> +          found1 = true;
>> +        }
>> +    }
>> +    if (!found1 && found)
>> +      {
>> +        found = false;
>> +        FOR_EACH_EDGE (e1, ei, bb1->succs)
>> +        {
>> +          if (e1->flags & (EDGE_DFS_BACK))
>> +            found = true;
>> +        }
>> +
>> +        if (found && EDGE_COUNT(bb1->preds) == 2)
>> +          {
>> +            unsigned int k = 0;
>> +            FOR_EACH_EDGE (e1, ei, bb1->preds)
>> +            {
>> +              if ((e1->flags & (EDGE_DFS_BACK)))
>> +                continue;
>> +
>> +              if ( k == 1)
>> +                {
>> +                  if (single_succ_p(e1->src) &&
>> +                      single_succ_edge (e1->src)->flags & EDGE_FALLTHRU)
>> +                    {
>> +                      pred2 = e1->src;
>> +                    }
>> +                }
>> +                else
>> +                  {
>> +                    if (single_succ_p(e1->src) &&
>> +                        single_succ_edge (e1->src)->flags & EDGE_FALLTHRU)
>> +                      {
>> +                        pred1 = e1->src;
>> +                      }
>> +                  }
>> +                k++;
>> +            }
>> +            bbs.release();
>> +            return bb1;
>> +          }
>> +       }
>> +   }
>> +   bbs.release();
>> +   return NULL;
>> +}
>> +
>> +/* This is the core function to perform path splitting. The join
>> +   same as the source of the loop latch node is identified along
>> +   with their predecessors. Based on the feasibility tests for
>> +   path splitting the path splitting is performed by wiring the
>> +   copy of join blocks into the predecessors and propagating the phi
>> +   result with the corresponding phi arguments into each of the copy
>> +   of join blocks wired with the original predecessors of the join
>> +   block.
>> +
>> +   The  tree-cfg-cleanup will merge the blocks in the predecessors
>> +   path and the update-ssa will update the ssa representation after
>> +   the path splitting is performed.  */
>> +
>> +static void
>> +perform_path_splitting (basic_block bb) {
>> +  basic_block pred1 = NULL, pred2 = NULL, join_block = NULL;
>> +
>> +  join_block = get_join_blk_same_as_loop_latch (bb, pred1, pred2);
>> +
>> +  if (join_block  &&
>> +      is_feasible_path_splitting (join_block, pred1, pred2))
>> +    {
>> +      basic_block new_bb1 = NULL, new_bb2 = NULL;
>> +      gimple_stmt_iterator last;
>> +      basic_block temp_bb = NULL;
>> +      edge_iterator ei;
>> +      edge e1;
>> +
>> +      temp_bb = duplicate_block (join_block, NULL, NULL);
>> +
>> +      FOR_EACH_EDGE (e1, ei, pred1->succs)
>> +        new_bb1 = split_edge (e1);
>> +
>> +      FOR_EACH_EDGE (e1, ei, pred2->succs)
>> +        new_bb2 = split_edge (e1);
>> +
>> +      last = gsi_start_bb (new_bb1);
>> +      gsi_insert_seq_after (&last, bb_seq (join_block), GSI_NEW_STMT);
>> +      last = gsi_start_bb (new_bb2);
>> +      gsi_insert_seq_after (&last, bb_seq (temp_bb), GSI_NEW_STMT);
>> +      update_stmt_bb (new_bb1);
>> +      update_stmt_bb (new_bb2);
>> +
>> +      replace_uses_phi (join_block, new_bb2);
>> +
>> +      set_bb_seq (join_block, NULL);
>> +      set_bb_seq(temp_bb,NULL);
>> +      delete_basic_block (temp_bb);
>> +      return;
>> +    }
>> +}
>> +
>> +static unsigned int
>> +execute_path_split (void)
>> +{
>> +  basic_block bb;
>> +
>> +  loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS); 
>> + initialize_original_copy_tables();
>> +
>> +  calculate_dominance_info (CDI_DOMINATORS); 
>> + calculate_dominance_info (CDI_POST_DOMINATORS);
>> +
>> +  mark_dfs_back_edges ();
>> +
>> +  FOR_EACH_BB_FN (bb, cfun)
>> +  {
>> +    gimple last;
>> +
>> +    /* We only care about blocks ending in a COND_EXPR. */
>> +
>> +    last = gsi_stmt (gsi_last_bb (bb));
>> +
>> +    /* We're basically looking for a switch or any kind of conditional with
>> +       integral or pointer type arguments.  Note the type of the second
>> +       argument will be the same as the first argument, so no need to
>> +       check it explicitly.  */
>> +    if ((last && (gimple_code (last) == GIMPLE_COND
>> +            && TREE_CODE (gimple_cond_lhs (last)) == SSA_NAME
>> +            && (INTEGRAL_TYPE_P (TREE_TYPE (gimple_cond_lhs (last)))
>> +            || POINTER_TYPE_P (TREE_TYPE (gimple_cond_lhs (last))))
>> +            && (TREE_CODE (gimple_cond_rhs (last)) == SSA_NAME
>> +            || is_gimple_min_invariant (gimple_cond_rhs (last))))))
>> +      {
>> +
>> +         if (gimple_code(last) == GIMPLE_COND)
>> +           {
>> +              perform_path_splitting (bb);
>> +           }
>> +      }
>> +   }
>> +
>> +   loop_optimizer_finalize ();
>> +   free_original_copy_tables ();
>> +   free_dominance_info (CDI_DOMINATORS);
>> +   free_dominance_info (CDI_POST_DOMINATORS);
>> +   return 0;
>> +}
>> +
>> +namespace {
>> +
>> +const pass_data pass_data_path_split = {
>> +   GIMPLE_PASS, /* type */
>> +   "path_split", /* name */
>> +    OPTGROUP_NONE, /* optinfo_flags */
>> +    TV_TREE_PATH_SPLIT, /* tv_id */
>> +    PROP_ssa, /* properties_required */
>> +    0, /* properties_provided */
>> +    0, /* properties_destroyed */
>> +    0, /* todo_flags_start */
>> +    ( TODO_cleanup_cfg | TODO_update_ssa ), /* todo_flags_finish */ 
>> +};
>> +
>> +class pass_path_split : public gimple_opt_pass {
>> +   public:
>> +    pass_path_split (gcc::context *ctxt)
>> +      : gimple_opt_pass (pass_data_path_split, ctxt)
>> +    {}
>> +
>> +   /* opt_pass methods: */
>> +   opt_pass * clone () { return new pass_path_split (m_ctxt); }
>> +   virtual bool gate (function *) { return flag_tree_path_split != 0; }
>> +   virtual unsigned int execute (function *) { return 
>> + execute_path_split (); }
>> +
>> +}; // class pass_path_split
>> +
>> +} // anon namespace
>> +
>> +gimple_opt_pass *
>> +make_pass_path_split (gcc::context *ctxt) {
>> +  return new pass_path_split (ctxt); }
>> --
>> 1.8.2.1
>>
>> Thanks & Regards
>> Ajit

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-07-07 13:22       ` Ajit Kumar Agarwal
@ 2015-07-16 11:20         ` Richard Biener
  2015-07-29  7:44           ` Ajit Kumar Agarwal
  0 siblings, 1 reply; 72+ messages in thread
From: Richard Biener @ 2015-07-16 11:20 UTC (permalink / raw)
  To: Ajit Kumar Agarwal
  Cc: law, GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

On Tue, Jul 7, 2015 at 3:22 PM, Ajit Kumar Agarwal
<ajit.kumar.agarwal@xilinx.com> wrote:
>
>
> -----Original Message-----
> From: Richard Biener [mailto:richard.guenther@gmail.com]
> Sent: Tuesday, July 07, 2015 2:21 PM
> To: Ajit Kumar Agarwal
> Cc: law@redhat.com; GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
> Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
>
> On Sat, Jul 4, 2015 at 2:39 PM, Ajit Kumar Agarwal <ajit.kumar.agarwal@xilinx.com> wrote:
>>
>>
>> -----Original Message-----
>> From: Richard Biener [mailto:richard.guenther@gmail.com]
>> Sent: Tuesday, June 30, 2015 4:42 PM
>> To: Ajit Kumar Agarwal
>> Cc: law@redhat.com; GCC Patches; Vinod Kathail; Shail Aditya Gupta;
>> Vidhumouli Hunsigida; Nagaraju Mekala
>> Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on
>> tree ssa representation
>>
>> On Tue, Jun 30, 2015 at 10:16 AM, Ajit Kumar Agarwal <ajit.kumar.agarwal@xilinx.com> wrote:
>>> All:
>>>
>>> The below patch added a new path Splitting optimization pass on SSA
>>> representation. The Path Splitting optimization Pass moves the join
>>> block of if-then-else same as loop latch to its predecessors and get merged with the predecessors Preserving the SSA representation.
>>>
>>> The patch is tested for Microblaze and i386 target. The EEMBC/Mibench
>>> benchmarks is run with the Microblaze target And the performance gain
>>> of 9.15% and rgbcmy01_lite(EEMBC benchmarks). The Deja GNU tests is run for Mircroblaze Target and no regression is seen for Microblaze target and the new testcase attached are passed.
>>>
>>> For i386 bootstrapping goes through fine and the Spec cpu2000
>>> benchmarks is run with this patch. Following observation were seen with spec cpu2000 benchmarks.
>>>
>>> Ratio of path splitting change vs Ratio of not having path splitting change is 3653.353 vs 3652.14 for INT benchmarks.
>>> Ratio of path splitting change vs Ratio of not having path splitting change is  4353.812 vs 4345.351 for FP benchmarks.
>>>
>>> Based on comments from RFC patch following changes were done.
>>>
>>> 1. Added a new pass for path splitting changes.
>>> 2. Placed the new path  Splitting Optimization pass before the copy propagation pass.
>>> 3. The join block same as the Loop latch is wired into its
>>> predecessors so that the CFG Cleanup pass will merge the blocks Wired together.
>>> 4. Copy propagation routines added for path splitting changes is not
>>> needed as suggested by Jeff. They are removed in the patch as The copy propagation in the copied join blocks will be done by the existing copy propagation pass and the update ssa pass.
>>> 5. Only the propagation of phi results of the join block with the phi
>>> argument is done which will not be done by the existing update_ssa Or copy propagation pass on tree ssa representation.
>>> 6. Added 2 tests.
>>>     a) compilation check  tests.
>>>    b) execution tests.
>>> 7. Refactoring of the code for the feasibility check and finding the join block same as loop latch node.
>>>
>>>     [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation.
>>>
>>>     Added a new pass on path splitting on tree SSA representation. The path
>>>     splitting optimization does the CFG transformation of join block of the
>>>     if-then-else same as the loop latch node is moved and merged with the
>>>     predecessor blocks after preserving the SSA representation.
>>>
>>>     ChangeLog:
>>>     2015-06-30  Ajit Agarwal  <ajitkum@xilinx.com>
>>>
>>>         * gcc/Makefile.in: Add the build of the new file
>>>         tree-ssa-path-split.c
>>>         * gcc/common.opt: Add the new flag ftree-path-split.
>>>         * gcc/opts.c: Add an entry for Path splitting pass
>>>         with optimization flag greater and equal to O2.
>>>         * gcc/passes.def: Enable and add new pass path splitting.
>>>         * gcc/timevar.def: Add the new entry for TV_TREE_PATH_SPLIT.
>>>         * gcc/tree-pass.h: Extern Declaration of make_pass_path_split.
>>>         * gcc/tree-ssa-path-split.c: New file for path splitting pass.
>>>         * gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c: New testcase.
>>>         * gcc/testsuite/gcc.dg/path-split-1.c: New testcase.
>>
>>>>I'm not 100% sure I understand the transform but what I see from the testcases it tail-duplicates from a conditional up to a loop latch block (not sure if it >>includes it and thus ends up creating a loop nest or not).
>>
>>>>An observation I have is that the pass should at least share the transform stage to some extent with the existing tracer pass (tracer.c) which essentially does >>the same but not restricted to loops in any way.
>>
>> The following piece of code from tracer.c can be shared with the existing path splitting pass.
>>
>> {
>>              e = find_edge (bb, bb2);
>>
>>               copy = duplicate_block (bb2, e, bb);
>>               flush_pending_stmts (e);
>>
>>               add_phi_args_after_copy (&copy, 1, NULL); }
>>
>> Sharing the above code of the transform stage of tracer.c with the path splitting pass has the following limitation.
>>
>> 1. The duplicated loop latch node is wired to its predecessors and the
>> existing phi node in the loop latch node with the Phi arguments from
>> its corresponding predecessors is moved to the duplicated loop latch node that is wired into its predecessors. Due To this, the duplicated loop latch nodes wired into its predecessors will not be merged with the original predecessors by CFG cleanup phase .
>>
>>>> So I wonder if your pass could be simply another heuristic to compute paths to trace in the existing tracer pass.
>>
>> Sorry, I am not very clear when you say the above.  I am trying to
>> figure out whether you expect the existing pass of tracer.c should be modified Or the path splitting pass should coexist.
>
>>>Yes, I was wondering whether tracer.c could be simply modified.  Both transforms are doing something very similar.
>>>Yes, your pass would simply compute extra traces based on the new heuristic.
>
> I have observed the following with the tracer pass optimization.
>
> Tracer Pass:
>
> 1. The tracer pass is an FDO optimization and is not enabled by default at -O2. It is enabled with -fprofile-use.
> 2. The -ftracer flag is used to enable the optimization explicitly in the absence of FDO optimization.
> 3. The tracer pass is placed in only one place, before the dominator pass. Moving the tracer pass before the copy_prop
> pass gives the following error: "Error : pass tracer does not support cloning".

This is an error from the pass manager: you added a second tracer pass instead
of moving it.

> 4. The code for tracer pass is totally based on the FDO related information and the logic is based on the profile data.

Yes.

> Path Splitting pass:
>
> 1. Having the path splitting as a separate pass is enabled by default at  >= O2.

Well, it's debatable whether you want to enable it at -O2.  I seriously
doubt that.

> 2. No FDO information is required in the path splitting pass.
> 3. The path splitting pass can be placed well before any optimization pass. I have placed the path splitting pass before
> copy_prop and it works. Placing it before the dominator pass also works fine.

Same for tracer - see above.

> 4. The code for path splitting as a separate pass is purely based on non profile and Non FDO data.
> 5. As a separate pass, path splitting can be placed anywhere in the optimization pipeline. The optimizations that benefit from the
> path splitting pass are PRE, CCP and DCE, so it can be placed well before those optimizations.
> 6. In the first phase of the path splitting pass I am duplicating the loop latch node into its predecessors while keeping the SSA form and loop structure
> preserved. With path splitting as a separate pass, I would like to extend it to handle multiple latch nodes by adding a forwarder block,
> with the edges from the multiple latch nodes redirected to the forwarder block so that there is a single loop latch edge.
>
> With the above observations, do you think the existing tracer pass should be modified and the path splitting pass incorporated
> on top of the existing tracer pass?

I still think both passes do a fundamentally similar transform and thus should
be able to share data structures ('path' representation), parts of
analysis (can this path be duplicated?) and the transform stage.  Yes,
it might be that two instances
of the pass will run in the end, one doing path splitting and one doing tracing,
at different times in the optimization pipeline.

We have another similar transform with similar needs on data structures
and analysis / transform.  Jump threading.

It would be nice to have a common machinery here, for example to assess
cost of doing a trace.

Richard.

> Kindly give your feedback.
>
> Thanks & Regards
> Ajit
>
> Richard.
>
>> Thanks & Regards
>> Ajit
>>
>> Thanks,
>> Richard.
>>
>>>     Signed-off-by:Ajit Agarwal ajitkum@xilinx.com.
>>>
>>> gcc/Makefile.in                              |   1 +
>>>  gcc/common.opt                               |   4 +
>>>  gcc/opts.c                                   |   1 +
>>>  gcc/passes.def                               |   1 +
>>>  gcc/testsuite/gcc.dg/path-split-1.c          |  65 ++++
>>>  gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c |  62 ++++
>>>  gcc/timevar.def                              |   1 +
>>>  gcc/tree-pass.h                              |   1 +
>>>  gcc/tree-ssa-path-split.c                    | 462 +++++++++++++++++++++++++++
>>>  9 files changed, 598 insertions(+)
>>>  create mode 100644 gcc/testsuite/gcc.dg/path-split-1.c
>>>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
>>>  create mode 100644 gcc/tree-ssa-path-split.c
>>>
>>> diff --git a/gcc/Makefile.in b/gcc/Makefile.in index 5f9261f..35ac363
>>> 100644
>>> --- a/gcc/Makefile.in
>>> +++ b/gcc/Makefile.in
>>> @@ -1476,6 +1476,7 @@ OBJS = \
>>>         tree-vect-slp.o \
>>>         tree-vectorizer.o \
>>>         tree-vrp.o \
>>> +        tree-ssa-path-split.o \
>>>         tree.o \
>>>         valtrack.o \
>>>         value-prof.o \
>>> diff --git a/gcc/common.opt b/gcc/common.opt index e104269..c63b100
>>> 100644
>>> --- a/gcc/common.opt
>>> +++ b/gcc/common.opt
>>> @@ -2328,6 +2328,10 @@ ftree-vrp
>>>  Common Report Var(flag_tree_vrp) Init(0) Optimization  Perform Value
>>> Range Propagation on trees
>>>
>>> +ftree-path-split
>>> +Common Report Var(flag_tree_path_split) Init(0) Optimization Perform
>>> +Path Splitting
>>> +
>>>  funit-at-a-time
>>>  Common Report Var(flag_unit_at_a_time) Init(1) Optimization  Compile
>>> whole compilation unit at a time diff --git a/gcc/opts.c b/gcc/opts.c
>>> index 8a16116..31947ff 100644
>>> --- a/gcc/opts.c
>>> +++ b/gcc/opts.c
>>> @@ -508,6 +508,7 @@ static const struct default_options default_options_table[] =
>>>      { OPT_LEVELS_2_PLUS, OPT_fisolate_erroneous_paths_dereference, NULL, 1 },
>>>      { OPT_LEVELS_2_PLUS, OPT_fipa_ra, NULL, 1 },
>>>      { OPT_LEVELS_2_PLUS, OPT_flra_remat, NULL, 1 },
>>> +    { OPT_LEVELS_2_PLUS, OPT_ftree_path_split, NULL, 1 },
>>>
>>>      /* -O3 optimizations.  */
>>>      { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1
>>> }, diff --git a/gcc/passes.def b/gcc/passes.def index
>>> c0ddee4..43618eb
>>> 100644
>>> --- a/gcc/passes.def
>>> +++ b/gcc/passes.def
>>> @@ -155,6 +155,7 @@ along with GCC; see the file COPYING3.  If not see
>>>        NEXT_PASS (pass_ccp);
>>>        /* After CCP we rewrite no longer addressed locals into SSA
>>>          form if possible.  */
>>> +      NEXT_PASS (pass_path_split);
>>>        NEXT_PASS (pass_copy_prop);
>>>        NEXT_PASS (pass_complete_unrolli);
>>>        NEXT_PASS (pass_phiprop);
>>> diff --git a/gcc/testsuite/gcc.dg/path-split-1.c
>>> b/gcc/testsuite/gcc.dg/path-split-1.c
>>> new file mode 100644
>>> index 0000000..075dc87
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.dg/path-split-1.c
>>> @@ -0,0 +1,65 @@
>>> +/* { dg-do run } */
>>> +/* { dg-options "-O2 " } */
>>> +
>>> +#include <stdio.h>
>>> +#include <stdlib.h>
>>> +
>>> +#define RGBMAX 255
>>> +
>>> +int
>>> +test()
>>> +{
>>> +  int i, Pels;
>>> +  unsigned char sum = 0;
>>> +  unsigned char xr, xg, xb;
>>> +  unsigned char xc, xm, xy, xk;
>>> +  unsigned char *ReadPtr, *EritePtr;
>>> +
>>> +  ReadPtr = ( unsigned char *) malloc (sizeof (unsigned char) *
>>> + 100); EritePtr = ( unsigned char *) malloc (sizeof (unsigned char)
>>> + * 100);
>>> +
>>> +  for (i = 0; i < 100;i++)
>>> +     {
>>> +       ReadPtr[i] = 100 - i;
>>> +     }
>>> +
>>> +  for (i = 0; i < 100; i++)
>>> +     {
>>> +       xr = *ReadPtr++;
>>> +       xg = *ReadPtr++;
>>> +       xb = *ReadPtr++;
>>> +
>>> +       xc = (unsigned char) (RGBMAX - xr);
>>> +       xm = (unsigned char) (RGBMAX - xg);
>>> +       xy = (unsigned char) (RGBMAX - xb);
>>> +
>>> +       if (xc < xm)
>>> +         {
>>> +           xk = (unsigned char) (xc < xy ? xc : xy);
>>> +         }
>>> +       else
>>> +        {
>>> +          xk = (unsigned char) (xm < xy ? xm : xy);
>>> +        }
>>> +
>>> +       xc = (unsigned char) (xc - xk);
>>> +       xm = (unsigned char) (xm - xk);
>>> +       xy = (unsigned char) (xy - xk);
>>> +
>>> +       *EritePtr++ = xc;
>>> +       *EritePtr++ = xm;
>>> +       *EritePtr++ = xy;
>>> +       *EritePtr++ = xk;
>>> +       sum += *EritePtr;
>>> +    }
>>> +  return sum;
>>> +}
>>> +
>>> +int
>>> +main()
>>> +{
>>> +  if (test() != 33)
>>> +    abort();
>>> +
>>> +  return 0;
>>> +}
>>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
>>> b/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
>>> new file mode 100644
>>> index 0000000..19f277c
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
>>> @@ -0,0 +1,62 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-O2 -fdump-tree-path_split" } */
>>> +
>>> +#include <stdio.h>
>>> +#include <stdlib.h>
>>> +
>>> +#define RGBMAX 255
>>> +
>>> +int
>>> +test()
>>> +{
>>> +  int i, Pels;
>>> +  unsigned char sum = 0;
>>> +  unsigned char xr, xg, xb;
>>> +  unsigned char xc, xm, xy, xk;
>>> +  unsigned char *ReadPtr, *EritePtr;
>>> +
>>> +  ReadPtr = (unsigned char *) malloc (sizeof (unsigned char) * 100);
>>> + EritePtr = ( unsigned char *) malloc (sizeof (unsigned char) *
>>> + 100);
>>> +
>>> +  for (i = 0; i < 100;i++)
>>> +     {
>>> +       ReadPtr[i] = 100 - i;
>>> +     }
>>> +
>>> +  for (i = 0; i < 100; i++)
>>> +     {
>>> +       xr = *ReadPtr++;
>>> +       xg = *ReadPtr++;
>>> +       xb = *ReadPtr++;
>>> +
>>> +       xc = ( unsigned char) (RGBMAX - xr);
>>> +       xm = ( unsigned char) (RGBMAX - xg);
>>> +       xy = ( unsigned char) (RGBMAX - xb);
>>> +
>>> +       if (xc < xm)
>>> +         {
>>> +           xk = ( unsigned char) (xc < xy ? xc : xy);
>>> +         }
>>> +       else
>>> +         {
>>> +           xk = ( unsigned char) (xm < xy ? xm : xy);
>>> +         }
>>> +
>>> +       xc = (unsigned char) (xc - xk);
>>> +       xm = (unsigned char) (xm - xk);
>>> +       xy = (unsigned char) (xy - xk);
>>> +
>>> +       *EritePtr++ = xc;
>>> +       *EritePtr++ = xm;
>>> +       *EritePtr++ = xy;
>>> +       *EritePtr++ = xk;
>>> +       sum += *EritePtr;
>>> +    }
>>> +  return sum;
>>> +}
>>> +
>>> +/* { dg-final { scan-tree-dump "xc_[0-9][0-9]* -> { xc_[0-9][0-9]* }"
>>> +"path_split"} } */
>>> +/* { dg-final { scan-tree-dump "xm_[0-9][0-9]* -> { xm_[0-9][0-9]* }"
>>> +"path_split"} } */
>>> +/* { dg-final { scan-tree-dump "xy_[0-9][0-9]* -> { xy_[0-9][0-9]* }"
>>> +"path_split"} } */
>>> +/* { dg-final { scan-tree-dump "Merging blocks" "path_split"} } */
>>> +/* { dg-final { cleanup-tree-dump "path_split" } } */
>>> diff --git a/gcc/timevar.def b/gcc/timevar.def index 711bbed..6217a8e
>>> 100644
>>> --- a/gcc/timevar.def
>>> +++ b/gcc/timevar.def
>>> @@ -288,3 +288,4 @@ DEFTIMEVAR (TV_JIT_REPLAY        , "replay of JIT client activity")
>>>  DEFTIMEVAR (TV_ASSEMBLE             , "assemble JIT code")
>>>  DEFTIMEVAR (TV_LINK                 , "link JIT code")
>>>  DEFTIMEVAR (TV_LOAD                 , "load JIT result")
>>> +DEFTIMEVAR (TV_TREE_PATH_SPLIT  , "tree path_split")
>>> diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index 398ab83..e00639e
>>> 100644
>>> --- a/gcc/tree-pass.h
>>> +++ b/gcc/tree-pass.h
>>> @@ -379,6 +379,7 @@ extern gimple_opt_pass *make_pass_iv_optimize
>>> (gcc::context *ctxt);  extern gimple_opt_pass
>>> *make_pass_tree_loop_done (gcc::context *ctxt);  extern
>>> gimple_opt_pass *make_pass_ch (gcc::context *ctxt);  extern
>>> gimple_opt_pass *make_pass_ccp (gcc::context *ctxt);
>>> +extern gimple_opt_pass *make_pass_path_split (gcc::context *ctxt);
>>>  extern gimple_opt_pass *make_pass_phi_only_cprop (gcc::context
>>> *ctxt);  extern gimple_opt_pass *make_pass_build_ssa (gcc::context
>>> *ctxt);  extern gimple_opt_pass *make_pass_build_alias (gcc::context
>>> *ctxt); diff --git a/gcc/tree-ssa-path-split.c
>>> b/gcc/tree-ssa-path-split.c new file mode 100644 index
>>> 0000000..3da7791
>>> --- /dev/null
>>> +++ b/gcc/tree-ssa-path-split.c
>>> @@ -0,0 +1,462 @@
>>> +/* Support routines for Path Splitting.
>>> +   Copyright (C) 2015 Free Software Foundation, Inc.
>>> +   Contributed by Ajit Kumar Agarwal <ajitkum@xilinx.com>.
>>> +
>>> + This file is part of GCC.
>>> +
>>> + GCC is free software; you can redistribute it and/or modify it
>>> + under the terms of the GNU General Public License as published by
>>> + the Free Software Foundation; either version 3, or (at your option)
>>> + any later version.
>>> +
>>> +GCC is distributed in the hope that it will be useful, but WITHOUT
>>> +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
>>> +or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
>>> +License for more details.
>>> +
>>> +You should have received a copy of the GNU General Public License
>>> +along with GCC; see the file COPYING3.  If not see
>>> +<http://www.gnu.org/licenses/>.  */
>>> +
>>> +#include "config.h"
>>> +#include "system.h"
>>> +#include "coretypes.h"
>>> +#include "tm.h"
>>> +#include "flags.h"
>>> +#include "tree.h"
>>> +#include "stor-layout.h"
>>> +#include "calls.h"
>>> +#include "predict.h"
>>> +#include "vec.h"
>>> +#include "hashtab.h"
>>> +#include "hash-set.h"
>>> +#include "machmode.h"
>>> +#include "hard-reg-set.h"
>>> +#include "input.h"
>>> +#include "function.h"
>>> +#include "dominance.h"
>>> +#include "cfg.h"
>>> +#include "cfganal.h"
>>> +#include "basic-block.h"
>>> +#include "tree-ssa-alias.h"
>>> +#include "internal-fn.h"
>>> +#include "gimple-fold.h"
>>> +#include "tree-eh.h"
>>> +#include "gimple-expr.h"
>>> +#include "is-a.h"
>>> +#include "gimple.h"
>>> +#include "gimple-iterator.h"
>>> +#include "gimple-walk.h"
>>> +#include "gimple-ssa.h"
>>> +#include "tree-cfg.h"
>>> +#include "tree-phinodes.h"
>>> +#include "ssa-iterators.h"
>>> +#include "stringpool.h"
>>> +#include "tree-ssanames.h"
>>> +#include "tree-ssa-loop-manip.h"
>>> +#include "tree-ssa-loop-niter.h"
>>> +#include "tree-ssa-loop.h"
>>> +#include "tree-into-ssa.h"
>>> +#include "tree-ssa.h"
>>> +#include "tree-pass.h"
>>> +#include "tree-dump.h"
>>> +#include "gimple-pretty-print.h"
>>> +#include "diagnostic-core.h"
>>> +#include "intl.h"
>>> +#include "cfgloop.h"
>>> +#include "tree-scalar-evolution.h"
>>> +#include "tree-ssa-propagate.h"
>>> +#include "tree-chrec.h"
>>> +#include "tree-ssa-threadupdate.h"
>>> +#include "expr.h"
>>> +#include "insn-codes.h"
>>> +#include "optabs.h"
>>> +#include "tree-ssa-threadedge.h"
>>> +#include "wide-int.h"
>>> +
>>> +/* Replace uses of the first phi argument USE1 with the second phi
>>> +   argument USE2 in the statements of block B.  If USE1 is NULL_TREE,
>>> +   every use operand in B is replaced.  This function is called from
>>> +   replace_uses_phi to propagate the second phi argument into the
>>> +   next copy of the join block.  */
>>> +
>>> +static void
>>> +replace_use_phi_operand1_with_operand2 (basic_block b,
>>> +                                        tree use1,
>>> +                                        tree use2) {
>>> +  use_operand_p use;
>>> +  ssa_op_iter iter;
>>> +  gimple_stmt_iterator gsi;
>>> +
>>> +  for (gsi = gsi_start_bb (b); !gsi_end_p (gsi);)
>>> +     {
>>> +       gimple stmt = gsi_stmt (gsi);
>>> +       FOR_EACH_SSA_USE_OPERAND (use, stmt, iter, SSA_OP_USE)
>>> +       {
>>> +         tree tuse = USE_FROM_PTR (use);
>>> +          if (use1 == tuse || use1 == NULL_TREE)
>>> +            {
>>> +              propagate_value (use, use2);
>>> +              update_stmt(stmt);
>>> +            }
>>> +        }
>>> +       gsi_next(&gsi);
>>> +     }
>>> +}
>>> +
>>> +/* This function propagates the phi result into the use points with
>>> +   the phi arguments. The join block is copied and wired into the
>>> +   predecessors. Since the use points of the phi results will be same
>>> +   in the each of the copy join blocks in the  predecessors, it
>>> +   propagates the phi arguments in the copy of the join blocks wired
>>> +   into its predecessor.  */
>>> +
>>> +static
>>> +void replace_uses_phi (basic_block b, basic_block temp_bb) {
>>> +  gimple_seq phis = phi_nodes (b);
>>> +  gimple phi = gimple_seq_first_stmt (phis);
>>> +  tree def = gimple_phi_result (phi), use = gimple_phi_arg_def
>>> +(phi,0);
>>> +  tree use2 = gimple_phi_arg_def (phi,1);
>>> +
>>> +  if (virtual_operand_p (def))
>>> +    {
>>> +      imm_use_iterator iter;
>>> +      use_operand_p use_p;
>>> +      gimple stmt;
>>> +
>>> +      FOR_EACH_IMM_USE_STMT (stmt, iter, def)
>>> +        FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
>>> +          SET_USE (use_p, use);
>>> +      if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (def))
>>> +        SSA_NAME_OCCURS_IN_ABNORMAL_PHI (use) = 1;
>>> +    }
>>> +   else
>>> +     replace_uses_by (def, use);
>>> +   replace_use_phi_operand1_with_operand2 (temp_bb, use, use2); }
>>> +
>>> +/* Returns true if the block bb has label or call statements.
>>> +   Otherwise return false.  */
>>> +
>>> +static bool
>>> +is_block_has_label_call (basic_block bb) {
>>> +  gimple_stmt_iterator gsi;
>>> +
>>> +  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
>>> +     {
>>> +       gimple stmt = gsi_stmt(gsi);
>>> +       if (dyn_cast <glabel *> (stmt))
>>> +         {
>>> +           return true;
>>> +         }
>>> +       if (is_gimple_call (stmt))
>>> +         return true;
>>> +     }
>>> +  return false;
>>> +}
>>> +
>>> +/* This function performs the feasibility tests for path
>>> +   splitting.  Return true if path splitting is feasible and
>>> +   false otherwise.  The following feasibility tests are
>>> +   performed.
>>> +
>>> +   1. Return false if the join block has call gimple statements.
>>> +   2. Return false if the join block has rhs casting for assign
>>> +      gimple statements.
>>> +   3. If the number of phis is greater than 1 or the phi node in
>>> +      the join block has virtual operand return false.
>>> +   4. Return false if the number of non-debug statements in the
>>> +      join block is not greater than 2.
>>> +   5. If the predecessors blocks has labels and call statements
>>> +      return false.
>>> +   6. If the phi result in the phi node of the join block is not
>>> +      used inside the same join block return false.
>>> +   7. Otherwise returns true.  */
>>> +
>>> +static bool
>>> +is_feasible_path_splitting (basic_block join_node, basic_block pred1,
>>> +                           basic_block pred2) {
>>> +  int num_stmt = 0, num_phis = 0;
>>> +  gimple_stmt_iterator psi, gsi;
>>> +
>>> +  for (gsi = gsi_start_bb (join_node); !gsi_end_p (gsi); gsi_next (&gsi))
>>> +     {
>>> +       gimple stmt = gsi_stmt(gsi);
>>> +
>>> +       if (gimple_assign_cast_p (stmt))
>>> +         return false;
>>> +
>>> +       if (is_gimple_call (stmt))
>>> +         return false;
>>> +
>>> +       if (!is_gimple_debug(stmt))
>>> +         {
>>> +           num_stmt++;
>>> +         }
>>> +     }
>>> +
>>> +   if (pred1 && pred2 && (num_stmt > 2))
>>> +     {
>>> +       bool found_virtual_result = false;
>>> +
>>> +       for (psi = gsi_start_phis (join_node); !gsi_end_p (psi); )
>>> +          {
>>> +            use_operand_p use_p;
>>> +            imm_use_iterator iter;
>>> +            gimple stmt = gsi_stmt(psi);
>>> +
>>> +            if (!virtual_operand_p (gimple_phi_result (stmt)))
>>> +              num_phis++;
>>> +            else
>>> +              found_virtual_result = true;
>>> +
>>> +            FOR_EACH_IMM_USE_FAST (use_p, iter, gimple_phi_result (stmt))
>>> +            {
>>> +              gimple use_stmt = USE_STMT (use_p);
>>> +
>>> +              if (gimple_bb (use_stmt) != join_node)
>>> +                return false;
>>> +            }
>>> +
>>> +            gsi_next(&psi);
>>> +         }
>>> +
>>> +       if ((num_phis >1) || found_virtual_result)
>>> +          return false;
>>> +
>>> +       if(is_block_has_label_call(pred1) || is_block_has_label_call(pred2))
>>> +         return false;
>>> +
>>> +       return true;
>>> +    }
>>> +  return false;
>>> +}
>>> +
>>> +/* Set the containing basic block of every statement in
>>> +   block B to B.  */
>>> +
>>> +static void
>>> +update_stmt_bb(basic_block b)
>>> +{
>>> +  gimple_stmt_iterator gsi;
>>> +  for(gsi = gsi_start_bb(b); !gsi_end_p(gsi); gsi_next(&gsi))
>>> +   {
>>> +     gimple stmt = gsi_stmt(gsi);
>>> +     gimple_set_bb(stmt,b);
>>> +   }
>>> +}
>>> +
>>> +/* This function gets the join blocks same as the source
>>> +   node of the loop latch nodes and the predecessors of
>>> +   the join block is updated in the pred1 and pred2 passed
>>> +   as the reference arguments into the function. Return
>>> +   the join block.  */
>>> +
>>> +static basic_block
>>> +get_join_blk_same_as_loop_latch (basic_block bb,
>>> +                                 basic_block &pred1,
>>> +                                 basic_block &pred2) {
>>> +  vec<basic_block> bbs;
>>> +  basic_block bb1;
>>> +  unsigned int i;
>>> +  edge_iterator ei;
>>> +  edge e1;
>>> +  bool found = false ,found1;
>>> +  bbs = get_all_dominated_blocks (CDI_DOMINATORS,
>>> +                                  bb );
>>> +  FOR_EACH_VEC_ELT (bbs, i, bb1)
>>> +  {
>>> +    found1 = false;
>>> +    FOR_EACH_EDGE (e1, ei, bb->succs)
>>> +    {
>>> +      if ( bb1 == e1->dest)
>>> +        {
>>> +          found = true;
>>> +          found1 = true;
>>> +        }
>>> +    }
>>> +    if (!found1 && found)
>>> +      {
>>> +        found = false;
>>> +        FOR_EACH_EDGE (e1, ei, bb1->succs)
>>> +        {
>>> +          if (e1->flags & (EDGE_DFS_BACK))
>>> +            found = true;
>>> +        }
>>> +
>>> +        if (found && EDGE_COUNT(bb1->preds) == 2)
>>> +          {
>>> +            unsigned int k = 0;
>>> +            FOR_EACH_EDGE (e1, ei, bb1->preds)
>>> +            {
>>> +              if ((e1->flags & (EDGE_DFS_BACK)))
>>> +                continue;
>>> +
>>> +              if ( k == 1)
>>> +                {
>>> +                  if (single_succ_p(e1->src) &&
>>> +                      single_succ_edge (e1->src)->flags & EDGE_FALLTHRU)
>>> +                    {
>>> +                      pred2 = e1->src;
>>> +                    }
>>> +                }
>>> +                else
>>> +                  {
>>> +                    if (single_succ_p(e1->src) &&
>>> +                        single_succ_edge (e1->src)->flags & EDGE_FALLTHRU)
>>> +                      {
>>> +                        pred1 = e1->src;
>>> +                      }
>>> +                  }
>>> +                k++;
>>> +            }
>>> +            bbs.release();
>>> +            return bb1;
>>> +          }
>>> +       }
>>> +   }
>>> +   bbs.release();
>>> +   return NULL;
>>> +}
>>> +
>>> +/* This is the core function to perform path splitting. The join
>>> +   same as the source of the loop latch node is identified along
>>> +   with their predecessors. Based on the feasibility tests for
>>> +   path splitting the path splitting is performed by wiring the
>>> +   copy of join blocks into the predecessors and propagating the phi
>>> +   result with the corresponding phi arguments into each of the copy
>>> +   of join blocks wired with the original predecessors of the join
>>> +   block.
>>> +
>>> +   The  tree-cfg-cleanup will merge the blocks in the predecessors
>>> +   path and the update-ssa will update the ssa representation after
>>> +   the path splitting is performed.  */
>>> +
>>> +static void
>>> +perform_path_splitting (basic_block bb) {
>>> +  basic_block pred1 = NULL, pred2 = NULL, join_block = NULL;
>>> +
>>> +  join_block = get_join_blk_same_as_loop_latch (bb, pred1, pred2);
>>> +
>>> +  if (join_block  &&
>>> +      is_feasible_path_splitting (join_block, pred1, pred2))
>>> +    {
>>> +      basic_block new_bb1 = NULL, new_bb2 = NULL;
>>> +      gimple_stmt_iterator last;
>>> +      basic_block temp_bb = NULL;
>>> +      edge_iterator ei;
>>> +      edge e1;
>>> +
>>> +      temp_bb = duplicate_block (join_block, NULL, NULL);
>>> +
>>> +      FOR_EACH_EDGE (e1, ei, pred1->succs)
>>> +        new_bb1 = split_edge (e1);
>>> +
>>> +      FOR_EACH_EDGE (e1, ei, pred2->succs)
>>> +        new_bb2 = split_edge (e1);
>>> +
>>> +      last = gsi_start_bb (new_bb1);
>>> +      gsi_insert_seq_after (&last, bb_seq (join_block), GSI_NEW_STMT);
>>> +      last = gsi_start_bb (new_bb2);
>>> +      gsi_insert_seq_after (&last, bb_seq (temp_bb), GSI_NEW_STMT);
>>> +      update_stmt_bb (new_bb1);
>>> +      update_stmt_bb (new_bb2);
>>> +
>>> +      replace_uses_phi (join_block, new_bb2);
>>> +
>>> +      set_bb_seq (join_block, NULL);
>>> +      set_bb_seq(temp_bb,NULL);
>>> +      delete_basic_block (temp_bb);
>>> +      return;
>>> +    }
>>> +}
>>> +
>>> +static unsigned int
>>> +execute_path_split (void)
>>> +{
>>> +  basic_block bb;
>>> +
>>> +  loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS);
>>> + initialize_original_copy_tables();
>>> +
>>> +  calculate_dominance_info (CDI_DOMINATORS);
>>> + calculate_dominance_info (CDI_POST_DOMINATORS);
>>> +
>>> +  mark_dfs_back_edges ();
>>> +
>>> +  FOR_EACH_BB_FN (bb, cfun)
>>> +  {
>>> +    gimple last;
>>> +
>>> +    /* We only care about blocks ending in a COND_EXPR. */
>>> +
>>> +    last = gsi_stmt (gsi_last_bb (bb));
>>> +
>>> +    /* We're basically looking for a switch or any kind of conditional with
>>> +       integral or pointer type arguments.  Note the type of the second
>>> +       argument will be the same as the first argument, so no need to
>>> +       check it explicitly.  */
>>> +    if ((last && (gimple_code (last) == GIMPLE_COND
>>> +            && TREE_CODE (gimple_cond_lhs (last)) == SSA_NAME
>>> +            && (INTEGRAL_TYPE_P (TREE_TYPE (gimple_cond_lhs (last)))
>>> +            || POINTER_TYPE_P (TREE_TYPE (gimple_cond_lhs (last))))
>>> +            && (TREE_CODE (gimple_cond_rhs (last)) == SSA_NAME
>>> +            || is_gimple_min_invariant (gimple_cond_rhs (last))))))
>>> +      {
>>> +
>>> +         if (gimple_code(last) == GIMPLE_COND)
>>> +           {
>>> +              perform_path_splitting (bb);
>>> +           }
>>> +      }
>>> +   }
>>> +
>>> +   loop_optimizer_finalize ();
>>> +   free_original_copy_tables ();
>>> +   free_dominance_info (CDI_DOMINATORS);
>>> +   free_dominance_info (CDI_POST_DOMINATORS);
>>> +   return 0;
>>> +}
>>> +
>>> +namespace {
>>> +
>>> +const pass_data pass_data_path_split = {
>>> +   GIMPLE_PASS, /* type */
>>> +   "path_split", /* name */
>>> +    OPTGROUP_NONE, /* optinfo_flags */
>>> +    TV_TREE_PATH_SPLIT, /* tv_id */
>>> +    PROP_ssa, /* properties_required */
>>> +    0, /* properties_provided */
>>> +    0, /* properties_destroyed */
>>> +    0, /* todo_flags_start */
>>> +    ( TODO_cleanup_cfg | TODO_update_ssa ), /* todo_flags_finish */
>>> +};
>>> +
>>> +class pass_path_split : public gimple_opt_pass {
>>> +   public:
>>> +    pass_path_split (gcc::context *ctxt)
>>> +      : gimple_opt_pass (pass_data_path_split, ctxt)
>>> +    {}
>>> +
>>> +   /* opt_pass methods: */
>>> +   opt_pass * clone () { return new pass_path_split (m_ctxt); }
>>> +   virtual bool gate (function *) { return flag_tree_path_split != 0; }
>>> +   virtual unsigned int execute (function *) { return
>>> + execute_path_split (); }
>>> +
>>> +}; // class pass_path_split
>>> +
>>> +} // anon namespace
>>> +
>>> +gimple_opt_pass *
>>> +make_pass_path_split (gcc::context *ctxt) {
>>> +  return new pass_path_split (ctxt); }
>>> --
>>> 1.8.2.1
>>>
>>> Thanks & Regards
>>> Ajit

^ permalink raw reply	[flat|nested] 72+ messages in thread

* RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-07-16 11:20         ` Richard Biener
@ 2015-07-29  7:44           ` Ajit Kumar Agarwal
  2015-08-15 23:13             ` Ajit Kumar Agarwal
  0 siblings, 1 reply; 72+ messages in thread
From: Ajit Kumar Agarwal @ 2015-07-29  7:44 UTC (permalink / raw)
  To: Richard Biener, Jeff Law
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala



-----Original Message-----
From: Richard Biener [mailto:richard.guenther@gmail.com] 
Sent: Thursday, July 16, 2015 4:30 PM
To: Ajit Kumar Agarwal
Cc: law@redhat.com; GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation

On Tue, Jul 7, 2015 at 3:22 PM, Ajit Kumar Agarwal <ajit.kumar.agarwal@xilinx.com> wrote:
>
>
> -----Original Message-----
> From: Richard Biener [mailto:richard.guenther@gmail.com]
> Sent: Tuesday, July 07, 2015 2:21 PM
> To: Ajit Kumar Agarwal
> Cc: law@redhat.com; GCC Patches; Vinod Kathail; Shail Aditya Gupta; 
> Vidhumouli Hunsigida; Nagaraju Mekala
> Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on 
> tree ssa representation
>
> On Sat, Jul 4, 2015 at 2:39 PM, Ajit Kumar Agarwal <ajit.kumar.agarwal@xilinx.com> wrote:
>>
>>
>> -----Original Message-----
>> From: Richard Biener [mailto:richard.guenther@gmail.com]
>> Sent: Tuesday, June 30, 2015 4:42 PM
>> To: Ajit Kumar Agarwal
>> Cc: law@redhat.com; GCC Patches; Vinod Kathail; Shail Aditya Gupta; 
>> Vidhumouli Hunsigida; Nagaraju Mekala
>> Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass 
>> on tree ssa representation
>>
>> On Tue, Jun 30, 2015 at 10:16 AM, Ajit Kumar Agarwal <ajit.kumar.agarwal@xilinx.com> wrote:
>>> All:
>>>
>>> The below patch added a new path Splitting optimization pass on SSA 
>>> representation. The Path Splitting optimization Pass moves the join 
>>> block of if-then-else same as loop latch to its predecessors and get merged with the predecessors Preserving the SSA representation.
>>>
>>> The patch is tested for Microblaze and i386 target. The 
>>> EEMBC/Mibench benchmarks is run with the Microblaze target And the 
>>> performance gain of 9.15% and rgbcmy01_lite(EEMBC benchmarks). The Deja GNU tests is run for Mircroblaze Target and no regression is seen for Microblaze target and the new testcase attached are passed.
>>>
>>> For i386 bootstrapping goes through fine and the Spec cpu2000 
>>> benchmarks is run with this patch. Following observation were seen with spec cpu2000 benchmarks.
>>>
>>> Ratio of path splitting change vs Ratio of not having path splitting change is 3653.353 vs 3652.14 for INT benchmarks.
>>> Ratio of path splitting change vs Ratio of not having path splitting change is  4353.812 vs 4345.351 for FP benchmarks.
>>>
>>> Based on comments from RFC patch following changes were done.
>>>
>>> 1. Added a new pass for path splitting changes.
>>> 2. Placed the new path  Splitting Optimization pass before the copy propagation pass.
>>> 3. The join block same as the Loop latch is wired into its 
>>> predecessors so that the CFG Cleanup pass will merge the blocks Wired together.
>>> 4. Copy propagation routines added for path splitting changes is not 
>>> needed as suggested by Jeff. They are removed in the patch as The copy propagation in the copied join blocks will be done by the existing copy propagation pass and the update ssa pass.
>>> 5. Only the propagation of phi results of the join block with the 
>>> phi argument is done which will not be done by the existing update_ssa Or copy propagation pass on tree ssa representation.
>>> 6. Added 2 tests.
>>>     a) compilation check  tests.
>>>    b) execution tests.
>>> 7. Refactoring of the code for the feasibility check and finding the join block same as loop latch node.
>>>
>>>     [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation.
>>>
>>>     Added a new pass on path splitting on tree SSA representation. The path
>>>     splitting optimization does the CFG transformation of join block of the
>>>     if-then-else same as the loop latch node is moved and merged with the
>>>     predecessor blocks after preserving the SSA representation.
>>>
>>>     ChangeLog:
>>>     2015-06-30  Ajit Agarwal  <ajitkum@xilinx.com>
>>>
>>>         * gcc/Makefile.in: Add the build of the new file
>>>         tree-ssa-path-split.c
>>>         * gcc/common.opt: Add the new flag ftree-path-split.
>>>         * gcc/opts.c: Add an entry for Path splitting pass
>>>         with optimization flag greater and equal to O2.
>>>         * gcc/passes.def: Enable and add new pass path splitting.
>>>         * gcc/timevar.def: Add the new entry for TV_TREE_PATH_SPLIT.
>>>         * gcc/tree-pass.h: Extern Declaration of make_pass_path_split.
>>>         * gcc/tree-ssa-path-split.c: New file for path splitting pass.
>>>         * gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c: New testcase.
>>>         * gcc/testsuite/gcc.dg/path-split-1.c: New testcase.
>>
>>>>I'm not 100% sure I understand the transform but what I see from the testcases it tail-duplicates from a conditional up to a loop latch block (not sure if it >>includes it and thus ends up creating a loop nest or not).
>>
>>>>An observation I have is that the pass should at least share the transform stage to some extent with the existing tracer pass (tracer.c) which essentially does >>the same but not restricted to loops in any way.
>>
>> The following piece of code from tracer.c can be shared with the existing path splitting pass.
>>
>> {
>>              e = find_edge (bb, bb2);
>>
>>               copy = duplicate_block (bb2, e, bb);
>>               flush_pending_stmts (e);
>>
>>               add_phi_args_after_copy (&copy, 1, NULL); }
>>
>> Sharing the above code of the transform stage of tracer.c with the path splitting pass has the following limitation.
>>
>> 1. The duplicated loop latch node is wired to its predecessors and 
>> the existing phi node in the loop latch node with the Phi arguments 
>> from its corresponding predecessors is moved to the duplicated loop latch node that is wired into its predecessors. Due To this, the duplicated loop latch nodes wired into its predecessors will not be merged with the original predecessors by CFG cleanup phase .
>>
>>>> So I wonder if your pass could be simply another heuristic to compute paths to trace in the existing tracer pass.
>>
>> Sorry, I am not very clear when you say the above.  I am trying to 
>> figure out whether you expect the existing pass of tracer.c should be modified Or the path splitting pass should coexist.
>
>>>Yes, I was wondering whether tracer.c could be simply modified.  Both transforms are doing something very similar.
>>>Yes, your pass would simply compute extra traces based on the new heuristic.
>
> I have observed the following with the tracer pass optimization.
>
> Tracer Pass:
>
> 1. The tracer pass is FDO optimizations  and is not enabled by default at O2. This optimization is enabled with -fprofile-use.
> 2. The -ftracer flag is used to enable the optimization explicitly in the absence of FDO optimization.
> 3. The tracer pass optimizations is placed at only place before the 
> dominator_pass. Moving the tracer pass before copy_prop Pass gives the following error. " Error : pass tracer does not support cloning".

This is an error from the pass manager, you added a second tracer pass instead of moving it.

> 4. The code for tracer pass is totally based on the FDO related information and the logic is based on the profile data.

>>Yes.

> Path Spliiting pass:
>
> 1. Having the path splitting as a separate pass is enabled by default at  >= O2.

>>Well, it's debatable on whether you want to enable it at -O2.  I seriously doubt that.

> 2. No FDO information is required in the path splitting pass.
> 3. The Path Splitting pass can be placed anywhere well before any 
> optimizations pass. I have placed the path splitting pass before Copy_prop and it works. Also placing before the dominator also works fine.

>>Same for tracer - see above.

> 4. The code for path splitting as a separate pass is purely based on non profile and Non FDO data.
> 5. Placing the path splitting pass as a separate pass  can be placed 
> anywhere in the optimizations. The optimizations that got benefitted with the Path splitting pass are PRE , CCP, DCE and can be placed well before the optimizations.
> 6. At the first phase of path splitting pass I am duplicating the loop 
> latch node to its predecessor and make it SSA and loop structure 
> Preserved. Making the path splitting as separate pass I would like to extend this pass to the multiple latch node with a forwarding block and the multiple latch nodes are edged towards the forwarder block making it one Loop latch edge.
>
> With the above observation, Do you think the existing tracer pass 
> should be modified and the path splitting pass should be incorporated On top of the existing tracer pass.

>>I still think both passes do a fundamentally similar transform and thus should be able to share data structures ('path' representation), parts of analysis (can >>this path be duplicated?) and the transform stage.  Yes, it might be that two instances of the pass will run in the end, one doing path splitting and one doing >>tracing, at different times in the optimization pipeline.

>>We have another similar transform with similar needs on data structures and analysis / transform.  Jump threading.

Thanks. I am going to make the following change and send for review.

Path splitting and the tracer should remain separate passes, and the code common to both should
be abstracted out so that the path splitting pass and the tracer pass both use this shared piece of code.

In the next phase of the path splitting transformation, I would like to handle a basic block that dominates the IF node and has
the IF node as its successor: such a block would be duplicated into the THEN and ELSE paths, similar to head duplication, thus widening
the scope of path splitting to enable more PRE, DCE and CCP optimizations.

Please let me know if there is any concern.

Thanks & Regards
Ajit

>>It would be nice to have a common machinery here, for example to assess cost of doing a trace.

Richard.

> Kindly give your feedback.
>
> Thanks & Regards
> Ajit
>
> Richard.
>
>> Thanks & Regards
>> Ajit
>>
>> Thanks,
>> Richard.
>>
>>>     Signed-off-by:Ajit Agarwal ajitkum@xilinx.com.
>>>
>>> gcc/Makefile.in                              |   1 +
>>>  gcc/common.opt                               |   4 +
>>>  gcc/opts.c                                   |   1 +
>>>  gcc/passes.def                               |   1 +
>>>  gcc/testsuite/gcc.dg/path-split-1.c          |  65 ++++
>>>  gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c |  62 ++++
>>>  gcc/timevar.def                              |   1 +
>>>  gcc/tree-pass.h                              |   1 +
>>>  gcc/tree-ssa-path-split.c                    | 462 +++++++++++++++++++++++++++
>>>  9 files changed, 598 insertions(+)
>>>  create mode 100644 gcc/testsuite/gcc.dg/path-split-1.c
>>>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
>>>  create mode 100644 gcc/tree-ssa-path-split.c
>>>
>>> diff --git a/gcc/Makefile.in b/gcc/Makefile.in index 
>>> 5f9261f..35ac363
>>> 100644
>>> --- a/gcc/Makefile.in
>>> +++ b/gcc/Makefile.in
>>> @@ -1476,6 +1476,7 @@ OBJS = \
>>>         tree-vect-slp.o \
>>>         tree-vectorizer.o \
>>>         tree-vrp.o \
>>> +        tree-ssa-path-split.o \
>>>         tree.o \
>>>         valtrack.o \
>>>         value-prof.o \
>>> diff --git a/gcc/common.opt b/gcc/common.opt index e104269..c63b100
>>> 100644
>>> --- a/gcc/common.opt
>>> +++ b/gcc/common.opt
>>> @@ -2328,6 +2328,10 @@ ftree-vrp
>>>  Common Report Var(flag_tree_vrp) Init(0) Optimization  Perform 
>>> Value Range Propagation on trees
>>>
>>> +ftree-path-split
>>> +Common Report Var(flag_tree_path_split) Init(0) Optimization 
>>> +Perform Path Splitting
>>> +
>>>  funit-at-a-time
>>>  Common Report Var(flag_unit_at_a_time) Init(1) Optimization  
>>> Compile whole compilation unit at a time diff --git a/gcc/opts.c 
>>> b/gcc/opts.c index 8a16116..31947ff 100644
>>> --- a/gcc/opts.c
>>> +++ b/gcc/opts.c
>>> @@ -508,6 +508,7 @@ static const struct default_options default_options_table[] =
>>>      { OPT_LEVELS_2_PLUS, OPT_fisolate_erroneous_paths_dereference, NULL, 1 },
>>>      { OPT_LEVELS_2_PLUS, OPT_fipa_ra, NULL, 1 },
>>>      { OPT_LEVELS_2_PLUS, OPT_flra_remat, NULL, 1 },
>>> +    { OPT_LEVELS_2_PLUS, OPT_ftree_path_split, NULL, 1 },
>>>
>>>      /* -O3 optimizations.  */
>>>      { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 
>>> 1 }, diff --git a/gcc/passes.def b/gcc/passes.def index 
>>> c0ddee4..43618eb
>>> 100644
>>> --- a/gcc/passes.def
>>> +++ b/gcc/passes.def
>>> @@ -155,6 +155,7 @@ along with GCC; see the file COPYING3.  If not see
>>>        NEXT_PASS (pass_ccp);
>>>        /* After CCP we rewrite no longer addressed locals into SSA
>>>          form if possible.  */
>>> +      NEXT_PASS (pass_path_split);
>>>        NEXT_PASS (pass_copy_prop);
>>>        NEXT_PASS (pass_complete_unrolli);
>>>        NEXT_PASS (pass_phiprop);
>>> diff --git a/gcc/testsuite/gcc.dg/path-split-1.c
>>> b/gcc/testsuite/gcc.dg/path-split-1.c
>>> new file mode 100644
>>> index 0000000..075dc87
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.dg/path-split-1.c
>>> @@ -0,0 +1,65 @@
>>> +/* { dg-do run } */
>>> +/* { dg-options "-O2 " } */
>>> +
>>> +#include <stdio.h>
>>> +#include <stdlib.h>
>>> +
>>> +#define RGBMAX 255
>>> +
>>> +int
>>> +test()
>>> +{
>>> +  int i, Pels;
>>> +  unsigned char sum = 0;
>>> +  unsigned char xr, xg, xb;
>>> +  unsigned char xc, xm, xy, xk;
>>> +  unsigned char *ReadPtr, *EritePtr;
>>> +
>>> +  ReadPtr = ( unsigned char *) malloc (sizeof (unsigned char) * 
>>> + 100); EritePtr = ( unsigned char *) malloc (sizeof (unsigned char)
>>> + * 100);
>>> +
>>> +  for (i = 0; i < 100;i++)
>>> +     {
>>> +       ReadPtr[i] = 100 - i;
>>> +     }
>>> +
>>> +  for (i = 0; i < 100; i++)
>>> +     {
>>> +       xr = *ReadPtr++;
>>> +       xg = *ReadPtr++;
>>> +       xb = *ReadPtr++;
>>> +
>>> +       xc = (unsigned char) (RGBMAX - xr);
>>> +       xm = (unsigned char) (RGBMAX - xg);
>>> +       xy = (unsigned char) (RGBMAX - xb);
>>> +
>>> +       if (xc < xm)
>>> +         {
>>> +           xk = (unsigned char) (xc < xy ? xc : xy);
>>> +         }
>>> +       else
>>> +        {
>>> +          xk = (unsigned char) (xm < xy ? xm : xy);
>>> +        }
>>> +
>>> +       xc = (unsigned char) (xc - xk);
>>> +       xm = (unsigned char) (xm - xk);
>>> +       xy = (unsigned char) (xy - xk);
>>> +
>>> +       *EritePtr++ = xc;
>>> +       *EritePtr++ = xm;
>>> +       *EritePtr++ = xy;
>>> +       *EritePtr++ = xk;
>>> +       sum += *EritePtr;
>>> +    }
>>> +  return sum;
>>> +}
>>> +
>>> +int
>>> +main()
>>> +{
>>> +  if (test() != 33)
>>> +    abort();
>>> +
>>> +  return 0;
>>> +}
>>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
>>> b/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
>>> new file mode 100644
>>> index 0000000..19f277c
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
>>> @@ -0,0 +1,62 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-O2 -fdump-tree-path_split" } */
>>> +
>>> +#include <stdio.h>
>>> +#include <stdlib.h>
>>> +
>>> +#define RGBMAX 255
>>> +
>>> +int
>>> +test()
>>> +{
>>> +  int i, Pels;
>>> +  unsigned char sum = 0;
>>> +  unsigned char xr, xg, xb;
>>> +  unsigned char xc, xm, xy, xk;
>>> +  unsigned char *ReadPtr, *EritePtr;
>>> +
>>> +  ReadPtr = (unsigned char *) malloc (sizeof (unsigned char) * 
>>> + 100); EritePtr = ( unsigned char *) malloc (sizeof (unsigned char) 
>>> + * 100);
>>> +
>>> +  for (i = 0; i < 100;i++)
>>> +     {
>>> +       ReadPtr[i] = 100 - i;
>>> +     }
>>> +
>>> +  for (i = 0; i < 100; i++)
>>> +     {
>>> +       xr = *ReadPtr++;
>>> +       xg = *ReadPtr++;
>>> +       xb = *ReadPtr++;
>>> +
>>> +       xc = ( unsigned char) (RGBMAX - xr);
>>> +       xm = ( unsigned char) (RGBMAX - xg);
>>> +       xy = ( unsigned char) (RGBMAX - xb);
>>> +
>>> +       if (xc < xm)
>>> +         {
>>> +           xk = ( unsigned char) (xc < xy ? xc : xy);
>>> +         }
>>> +       else
>>> +         {
>>> +           xk = ( unsigned char) (xm < xy ? xm : xy);
>>> +         }
>>> +
>>> +       xc = (unsigned char) (xc - xk);
>>> +       xm = (unsigned char) (xm - xk);
>>> +       xy = (unsigned char) (xy - xk);
>>> +
>>> +       *EritePtr++ = xc;
>>> +       *EritePtr++ = xm;
>>> +       *EritePtr++ = xy;
>>> +       *EritePtr++ = xk;
>>> +       sum += *EritePtr;
>>> +    }
>>> +  return sum;
>>> +}
>>> +
>>> +/* { dg-final { scan-tree-dump "xc_[0-9][0-9]* -> { xc_[0-9][0-9]* }"
>>> +"path_split"} } */
>>> +/* { dg-final { scan-tree-dump "xm_[0-9][0-9]* -> { xm_[0-9][0-9]* }"
>>> +"path_split"} } */
>>> +/* { dg-final { scan-tree-dump "xy_[0-9][0-9]* -> { xy_[0-9][0-9]* }"
>>> +"path_split"} } */
>>> +/* { dg-final { scan-tree-dump "Merging blocks" "path_split"} } */
>>> +/* { dg-final { cleanup-tree-dump "path_split" } } */
>>> diff --git a/gcc/timevar.def b/gcc/timevar.def index 
>>> 711bbed..6217a8e
>>> 100644
>>> --- a/gcc/timevar.def
>>> +++ b/gcc/timevar.def
>>> @@ -288,3 +288,4 @@ DEFTIMEVAR (TV_JIT_REPLAY        , "replay of JIT client activity")
>>>  DEFTIMEVAR (TV_ASSEMBLE             , "assemble JIT code")
>>>  DEFTIMEVAR (TV_LINK                 , "link JIT code")
>>>  DEFTIMEVAR (TV_LOAD                 , "load JIT result")
>>> +DEFTIMEVAR (TV_TREE_PATH_SPLIT  , "tree path_split")
>>> diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index 
>>> 398ab83..e00639e
>>> 100644
>>> --- a/gcc/tree-pass.h
>>> +++ b/gcc/tree-pass.h
>>> @@ -379,6 +379,7 @@ extern gimple_opt_pass *make_pass_iv_optimize 
>>> (gcc::context *ctxt);  extern gimple_opt_pass 
>>> *make_pass_tree_loop_done (gcc::context *ctxt);  extern 
>>> gimple_opt_pass *make_pass_ch (gcc::context *ctxt);  extern 
>>> gimple_opt_pass *make_pass_ccp (gcc::context *ctxt);
>>> +extern gimple_opt_pass *make_pass_path_split (gcc::context *ctxt);
>>>  extern gimple_opt_pass *make_pass_phi_only_cprop (gcc::context 
>>> *ctxt);  extern gimple_opt_pass *make_pass_build_ssa (gcc::context 
>>> *ctxt);  extern gimple_opt_pass *make_pass_build_alias (gcc::context 
>>> *ctxt); diff --git a/gcc/tree-ssa-path-split.c 
>>> b/gcc/tree-ssa-path-split.c new file mode 100644 index
>>> 0000000..3da7791
>>> --- /dev/null
>>> +++ b/gcc/tree-ssa-path-split.c
>>> @@ -0,0 +1,462 @@
>>> +/* Support routines for Path Splitting.
>>> +   Copyright (C) 2015 Free Software Foundation, Inc.
>>> +   Contributed by Ajit Kumar Agarwal <ajitkum@xilinx.com>.
>>> +
>>> + This file is part of GCC.
>>> +
>>> + GCC is free software; you can redistribute it and/or modify it 
>>> + under the terms of the GNU General Public License as published by 
>>> + the Free Software Foundation; either version 3, or (at your 
>>> + option) any later version.
>>> +
>>> +GCC is distributed in the hope that it will be useful, but WITHOUT 
>>> +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 
>>> +or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public 
>>> +License for more details.
>>> +
>>> +You should have received a copy of the GNU General Public License 
>>> +along with GCC; see the file COPYING3.  If not see 
>>> +<http://www.gnu.org/licenses/>.  */
>>> +
>>> +#include "config.h"
>>> +#include "system.h"
>>> +#include "coretypes.h"
>>> +#include "tm.h"
>>> +#include "flags.h"
>>> +#include "tree.h"
>>> +#include "stor-layout.h"
>>> +#include "calls.h"
>>> +#include "predict.h"
>>> +#include "vec.h"
>>> +#include "hashtab.h"
>>> +#include "hash-set.h"
>>> +#include "machmode.h"
>>> +#include "hard-reg-set.h"
>>> +#include "input.h"
>>> +#include "function.h"
>>> +#include "dominance.h"
>>> +#include "cfg.h"
>>> +#include "cfganal.h"
>>> +#include "basic-block.h"
>>> +#include "tree-ssa-alias.h"
>>> +#include "internal-fn.h"
>>> +#include "gimple-fold.h"
>>> +#include "tree-eh.h"
>>> +#include "gimple-expr.h"
>>> +#include "is-a.h"
>>> +#include "gimple.h"
>>> +#include "gimple-iterator.h"
>>> +#include "gimple-walk.h"
>>> +#include "gimple-ssa.h"
>>> +#include "tree-cfg.h"
>>> +#include "tree-phinodes.h"
>>> +#include "ssa-iterators.h"
>>> +#include "stringpool.h"
>>> +#include "tree-ssanames.h"
>>> +#include "tree-ssa-loop-manip.h"
>>> +#include "tree-ssa-loop-niter.h"
>>> +#include "tree-ssa-loop.h"
>>> +#include "tree-into-ssa.h"
>>> +#include "tree-ssa.h"
>>> +#include "tree-pass.h"
>>> +#include "tree-dump.h"
>>> +#include "gimple-pretty-print.h"
>>> +#include "diagnostic-core.h"
>>> +#include "intl.h"
>>> +#include "cfgloop.h"
>>> +#include "tree-scalar-evolution.h"
>>> +#include "tree-ssa-propagate.h"
>>> +#include "tree-chrec.h"
>>> +#include "tree-ssa-threadupdate.h"
>>> +#include "expr.h"
>>> +#include "insn-codes.h"
>>> +#include "optabs.h"
>>> +#include "tree-ssa-threadedge.h"
>>> +#include "wide-int.h"
>>> +
>>> +/* Replace_uses_phi function propagates the phi results with the
>>> +   first phi argument into each of the copied join blocks wired into
>>> +   its predecessors. This function is called from the replace_uses_phi
>>> +   to replace the uses of first phi arguments with the second
>>> +   phi arguments in the next copy of join block.  */
>>> +
>>> +static void
>>> +replace_use_phi_operand1_with_operand2 (basic_block b,
>>> +                                        tree use1,
>>> +                                        tree use2) {
>>> +  use_operand_p use;
>>> +  ssa_op_iter iter;
>>> +  gimple_stmt_iterator gsi;
>>> +
>>> +  for (gsi = gsi_start_bb (b); !gsi_end_p (gsi);)
>>> +     {
>>> +       gimple stmt = gsi_stmt (gsi);
>>> +       FOR_EACH_SSA_USE_OPERAND (use, stmt, iter, SSA_OP_USE)
>>> +       {
>>> +         tree tuse = USE_FROM_PTR (use);
>>> +          if (use1 == tuse || use1 == NULL_TREE)
>>> +            {
>>> +              propagate_value (use, use2);
>>> +              update_stmt(stmt);
>>> +            }
>>> +        }
>>> +       gsi_next(&gsi);
>>> +     }
>>> +}
>>> +
>>> +/* This function propagates the phi result into the use points with
>>> +   the phi arguments. The join block is copied and wired into the
>>> +   predecessors. Since the use points of the phi results will be same
>>> +   in the each of the copy join blocks in the  predecessors, it
>>> +   propagates the phi arguments in the copy of the join blocks wired
>>> +   into its predecessor.  */
>>> +
>>> +static
>>> +void replace_uses_phi (basic_block b, basic_block temp_bb) {
>>> +  gimple_seq phis = phi_nodes (b);
>>> +  gimple phi = gimple_seq_first_stmt (phis);
>>> +  tree def = gimple_phi_result (phi), use = gimple_phi_arg_def 
>>> +(phi,0);
>>> +  tree use2 = gimple_phi_arg_def (phi,1);
>>> +
>>> +  if (virtual_operand_p (def))
>>> +    {
>>> +      imm_use_iterator iter;
>>> +      use_operand_p use_p;
>>> +      gimple stmt;
>>> +
>>> +      FOR_EACH_IMM_USE_STMT (stmt, iter, def)
>>> +        FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
>>> +          SET_USE (use_p, use);
>>> +      if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (def))
>>> +        SSA_NAME_OCCURS_IN_ABNORMAL_PHI (use) = 1;
>>> +    }
>>> +   else
>>> +     replace_uses_by (def, use);
>>> +   replace_use_phi_operand1_with_operand2 (temp_bb, use, use2); }
>>> +
>>> +/* Returns true if the block bb has label or call statements.
>>> +   Otherwise return false.  */
>>> +
>>> +static bool
>>> +is_block_has_label_call (basic_block bb) {
>>> +  gimple_stmt_iterator gsi;
>>> +
>>> +  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
>>> +     {
>>> +       gimple stmt = gsi_stmt(gsi);
>>> +       if (dyn_cast <glabel *> (stmt))
>>> +         {
>>> +           return true;
>>> +         }
>>> +       if (is_gimple_call (stmt))
>>> +         return true;
>>> +     }
>>> +  return false;
>>> +}
>>> +
>>> +/* This function performs the feasibility tests for path splitting
>>> +   to perform. Return false if the feasibility for path splitting
>>> +   is not done and returns true if the feasbility for path splitting
>>> +   is done. Following feasibility tests are performed.
>>> +
>>> +   1. Return false if the join block has call gimple statements.
>>> +   2. Return false if the join block has rhs casting for assign
>>> +      gimple statements.
>>> +   3. If the number of phis is greater than 1 or the phi node in
>>> +      the join block has virtual operand return false.
>>> +   4. Return false if the number of sequential statements is
>>> +      greater than 2.
>>> +   5. If the predecessors blocks has labels and call statements
>>> +      return false.
>>> +   6. If the phi result in the phi node of the join block is not
>>> +      used inside the same join block return false.
>>> +   7. Otherwise returns true.  */
>>> +
>>> +static bool
>>> +is_feasible_path_splitting (basic_block join_node, basic_block pred1,
>>> +                           basic_block pred2) {
>>> +  int num_stmt = 0, num_phis = 0;
>>> +  gimple_stmt_iterator psi, gsi;
>>> +
>>> +  for (gsi = gsi_start_bb (join_node); !gsi_end_p (gsi); gsi_next (&gsi))
>>> +     {
>>> +       gimple stmt = gsi_stmt(gsi);
>>> +
>>> +       if (gimple_assign_cast_p (stmt))
>>> +         return false;
>>> +
>>> +       if (is_gimple_call (stmt))
>>> +         return false;
>>> +
>>> +       if (!is_gimple_debug(stmt))
>>> +         {
>>> +           num_stmt++;
>>> +         }
>>> +     }
>>> +
>>> +   if (pred1 && pred2 && (num_stmt > 2))
>>> +     {
>>> +       bool found_virtual_result = false;
>>> +
>>> +       for (psi = gsi_start_phis (join_node); !gsi_end_p (psi); )
>>> +          {
>>> +            use_operand_p use_p;
>>> +            imm_use_iterator iter;
>>> +            gimple stmt = gsi_stmt(psi);
>>> +
>>> +            if (!virtual_operand_p (gimple_phi_result (stmt)))
>>> +              num_phis++;
>>> +            else
>>> +              found_virtual_result = true;
>>> +
>>> +            FOR_EACH_IMM_USE_FAST (use_p, iter, gimple_phi_result (stmt))
>>> +            {
>>> +              gimple use_stmt = USE_STMT (use_p);
>>> +
>>> +              if (gimple_bb (use_stmt) != join_node)
>>> +                return false;
>>> +            }
>>> +
>>> +            gsi_next(&psi);
>>> +         }
>>> +
>>> +       if ((num_phis >1) || found_virtual_result)
>>> +          return false;
>>> +
>>> +       if(is_block_has_label_call(pred1) || is_block_has_label_call(pred2))
>>> +         return false;
>>> +
>>> +       return true;
>>> +    }
>>> +  return false;
>>> +}
>>> +
>>> +/* Update the statements in the basic block with the basic
>>> +   basic block.  */
>>> +
>>> +static void
>>> +update_stmt_bb(basic_block b)
>>> +{
>>> +  gimple_stmt_iterator gsi;
>>> +  for(gsi = gsi_start_bb(b); !gsi_end_p(gsi); gsi_next(&gsi))
>>> +   {
>>> +     gimple stmt = gsi_stmt(gsi);
>>> +     gimple_set_bb(stmt,b);
>>> +   }
>>> +}
>>> +
>>> +/* This function gets the join blocks same as the source
>>> +   node of the loop latch nodes and the predecessors of
>>> +   the join block is updated in the pred1 and pred2 passed
>>> +   as the reference arguments into the function. Return
>>> +   the join block.  */
>>> +
>>> +static basic_block
>>> +get_join_blk_same_as_loop_latch (basic_block bb,
>>> +                                 basic_block &pred1,
>>> +                                 basic_block &pred2) {
>>> +  vec<basic_block> bbs;
>>> +  basic_block bb1;
>>> +  unsigned int i;
>>> +  edge_iterator ei;
>>> +  edge e1;
>>> +  bool found = false ,found1;
>>> +  bbs = get_all_dominated_blocks (CDI_DOMINATORS,
>>> +                                  bb );
>>> +  FOR_EACH_VEC_ELT (bbs, i, bb1)
>>> +  {
>>> +    found1 = false;
>>> +    FOR_EACH_EDGE (e1, ei, bb->succs)
>>> +    {
>>> +      if ( bb1 == e1->dest)
>>> +        {
>>> +          found = true;
>>> +          found1 = true;
>>> +        }
>>> +    }
>>> +    if (!found1 && found)
>>> +      {
>>> +        found = false;
>>> +        FOR_EACH_EDGE (e1, ei, bb1->succs)
>>> +        {
>>> +          if (e1->flags & (EDGE_DFS_BACK))
>>> +            found = true;
>>> +        }
>>> +
>>> +        if (found && EDGE_COUNT(bb1->preds) == 2)
>>> +          {
>>> +            unsigned int k = 0;
>>> +            FOR_EACH_EDGE (e1, ei, bb1->preds)
>>> +            {
>>> +              if ((e1->flags & (EDGE_DFS_BACK)))
>>> +                continue;
>>> +
>>> +              if ( k == 1)
>>> +                {
>>> +                  if (single_succ_p(e1->src) &&
>>> +                      single_succ_edge (e1->src)->flags & EDGE_FALLTHRU)
>>> +                    {
>>> +                      pred2 = e1->src;
>>> +                    }
>>> +                }
>>> +                else
>>> +                  {
>>> +                    if (single_succ_p(e1->src) &&
>>> +                        single_succ_edge (e1->src)->flags & EDGE_FALLTHRU)
>>> +                      {
>>> +                        pred1 = e1->src;
>>> +                      }
>>> +                  }
>>> +                k++;
>>> +            }
>>> +            bbs.release();
>>> +            return bb1;
>>> +          }
>>> +       }
>>> +   }
>>> +   bbs.release();
>>> +   return NULL;
>>> +}
>>> +
>>> +/* This is the core function to perform path splitting. The join
>>> +   same as the source of the loop latch node is identified along
>>> +   with their predecessors. Based on the feasibility tests for
>>> +   path splitting the path splitting is performed by wiring the
>>> +   copy of join blocks into the predecessors and propagating the phi
>>> +   result with the corresponding phi arguments into each of the copy
>>> +   of join blocks wired with the original predecessors of the join
>>> +   block.
>>> +
>>> +   The  tree-cfg-cleanup will merge the blocks in the predecessors
>>> +   path and the update-ssa will update the ssa representation after
>>> +   the path splitting is performed.  */
>>> +
>>> +static void
>>> +perform_path_splitting (basic_block bb) {
>>> +  basic_block pred1 = NULL, pred2 = NULL, join_block = NULL;
>>> +
>>> +  join_block = get_join_blk_same_as_loop_latch (bb, pred1, pred2);
>>> +
>>> +  if (join_block  &&
>>> +      is_feasible_path_splitting (join_block, pred1, pred2))
>>> +    {
>>> +      basic_block new_bb1 = NULL, new_bb2 = NULL;
>>> +      gimple_stmt_iterator last;
>>> +      basic_block temp_bb = NULL;
>>> +      edge_iterator ei;
>>> +      edge e1;
>>> +
>>> +      temp_bb = duplicate_block (join_block, NULL, NULL);
>>> +
>>> +      FOR_EACH_EDGE (e1, ei, pred1->succs)
>>> +        new_bb1 = split_edge (e1);
>>> +
>>> +      FOR_EACH_EDGE (e1, ei, pred2->succs)
>>> +        new_bb2 = split_edge (e1);
>>> +
>>> +      last = gsi_start_bb (new_bb1);
>>> +      gsi_insert_seq_after (&last, bb_seq (join_block), GSI_NEW_STMT);
>>> +      last = gsi_start_bb (new_bb2);
>>> +      gsi_insert_seq_after (&last, bb_seq (temp_bb), GSI_NEW_STMT);
>>> +      update_stmt_bb (new_bb1);
>>> +      update_stmt_bb (new_bb2);
>>> +
>>> +      replace_uses_phi (join_block, new_bb2);
>>> +
>>> +      set_bb_seq (join_block, NULL);
>>> +      set_bb_seq(temp_bb,NULL);
>>> +      delete_basic_block (temp_bb);
>>> +      return;
>>> +    }
>>> +}
>>> +
>>> +static unsigned int
>>> +execute_path_split (void)
>>> +{
>>> +  basic_block bb;
>>> +
>>> +  loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS); 
>>> + initialize_original_copy_tables();
>>> +
>>> +  calculate_dominance_info (CDI_DOMINATORS); 
>>> + calculate_dominance_info (CDI_POST_DOMINATORS);
>>> +
>>> +  mark_dfs_back_edges ();
>>> +
>>> +  FOR_EACH_BB_FN (bb, cfun)
>>> +  {
>>> +    gimple last;
>>> +
>>> +    /* We only care about blocks ending in a COND_EXPR. */
>>> +
>>> +    last = gsi_stmt (gsi_last_bb (bb));
>>> +
>>> +    /* We're basically looking for a switch or any kind of conditional with
>>> +       integral or pointer type arguments.  Note the type of the second
>>> +       argument will be the same as the first argument, so no need to
>>> +       check it explicitly.  */
>>> +    if ((last && (gimple_code (last) == GIMPLE_COND
>>> +            && TREE_CODE (gimple_cond_lhs (last)) == SSA_NAME
>>> +            && (INTEGRAL_TYPE_P (TREE_TYPE (gimple_cond_lhs (last)))
>>> +            || POINTER_TYPE_P (TREE_TYPE (gimple_cond_lhs (last))))
>>> +            && (TREE_CODE (gimple_cond_rhs (last)) == SSA_NAME
>>> +            || is_gimple_min_invariant (gimple_cond_rhs (last))))))
>>> +      {
>>> +
>>> +         if (gimple_code(last) == GIMPLE_COND)
>>> +           {
>>> +              perform_path_splitting (bb);
>>> +           }
>>> +      }
>>> +   }
>>> +
>>> +   loop_optimizer_finalize ();
>>> +   free_original_copy_tables ();
>>> +   free_dominance_info (CDI_DOMINATORS);
>>> +   free_dominance_info (CDI_POST_DOMINATORS);
>>> +   return 0;
>>> +}
>>> +
>>> +namespace {
>>> +
>>> +const pass_data pass_data_path_split = {
>>> +   GIMPLE_PASS, /* type */
>>> +   "path_split", /* name */
>>> +    OPTGROUP_NONE, /* optinfo_flags */
>>> +    TV_TREE_PATH_SPLIT, /* tv_id */
>>> +    PROP_ssa, /* properties_required */
>>> +    0, /* properties_provided */
>>> +    0, /* properties_destroyed */
>>> +    0, /* todo_flags_start */
>>> +    ( TODO_cleanup_cfg | TODO_update_ssa ), /* todo_flags_finish */ 
>>> +};
>>> +
>>> +class pass_path_split : public gimple_opt_pass {
>>> +   public:
>>> +    pass_path_split (gcc::context *ctxt)
>>> +      : gimple_opt_pass (pass_data_path_split, ctxt)
>>> +    {}
>>> +
>>> +   /* opt_pass methods: */
>>> +   opt_pass * clone () { return new pass_path_split (m_ctxt); }
>>> +   virtual bool gate (function *) { return flag_tree_path_split != 0; }
>>> +   virtual unsigned int execute (function *) { return 
>>> + execute_path_split (); }
>>> +
>>> +}; // class pass_path_split
>>> +
>>> +} // anon namespace
>>> +
>>> +gimple_opt_pass *
>>> +make_pass_path_split (gcc::context *ctxt) {
>>> +  return new pass_path_split (ctxt); }
>>> --
>>> 1.8.2.1
>>>
>>> Thanks & Regards
>>> Ajit

^ permalink raw reply	[flat|nested] 72+ messages in thread

* RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-07-29  7:44           ` Ajit Kumar Agarwal
@ 2015-08-15 23:13             ` Ajit Kumar Agarwal
  2015-08-19 19:47               ` Jeff Law
  2015-08-19 21:53               ` Jeff Law
  0 siblings, 2 replies; 72+ messages in thread
From: Ajit Kumar Agarwal @ 2015-08-15 23:13 UTC (permalink / raw)
  To: Richard Biener, Jeff Law
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

[-- Attachment #1: Type: text/plain, Size: 38284 bytes --]

All:

Please find the updated patch with the suggestions and feedback incorporated.

Thanks Jeff and Richard for the review comments.

The following changes were made based on the feedback on the RFC comments and the review of the previous patch.

1. The tracer and the path splitting pass are separate passes, so that two instances of the pass will run in the end, one doing path splitting
 and one doing tracing, at different times in the optimization pipeline.
2. The transform code is shared between the tracer and the path splitting pass. The common code is extracted into a function, transform_duplicate,
which is placed in tracer.c, and the path splitting pass uses this transform code.
3. The analysis that populates the basic blocks and traverses them using the Fibonacci heap is common to both passes. It cannot be
factored out into a new function, because the tracer pass does more analysis based on the profile, and different heuristics are used in the tracer
and path splitting passes.
4. The included headers are minimal: only what is required for the path splitting pass is present.
5. The earlier patch did the SSA updating with replace functions to preserve the SSA representation, which was required to move the loop latch node (same as the join
block) to its predecessors, where the loop latch node is just a forwarder block. Such replace functions are not required, as suggested by Jeff. They
go away with this patch, and the transform code is factored into a function which is shared between the tracer and the path splitting pass.

Bootstrapping with the i386 and Microblaze targets works fine. No regression is seen in the DejaGnu tests for Microblaze; there
are fewer failures. The Mibench/EEMBC benchmarks were run for the Microblaze target, and a gain of
9.3% is seen in rgbcmy_lite from the EEMBC benchmarks.

The SPEC 2000 benchmarks were run with the i386 target and the following performance numbers were achieved.

INT benchmarks with path splitting(ratio) Vs INT benchmarks without path splitting(ratio) = 3661.225091 vs 3621.520572
FP benchmarks with path splitting(ratio) Vs FP benchmarks without path splitting(ratio )  =  4339.986209 vs 4339.775527

The maximum gain achieved is 9.03%, with the 252.eon INT benchmark.

ChangeLog:
2015-08-15  Ajit Agarwal  <ajitkum@xilinx.com>

        * gcc/Makefile.in: Add the build of the new file
        tree-ssa-path-split.c
        * gcc/common.opt (ftree-path-split): Add the new flag.
        * gcc/opts.c (OPT_ftree_path_split): Add an entry for
        Path splitting pass with optimization flag greater than or
        equal to O2.
        * gcc/passes.def (path_split): Add new path splitting pass.
        * gcc/timevar.def (TV_TREE_PATH_SPLIT): New.
        * gcc/tree-pass.h (make_pass_path_split): New declaration.
        * gcc/tree-ssa-path-split.c: New.
        * gcc/tracer.c (transform_duplicate): New.
        * gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c: New.
        * gcc/testsuite/gcc.dg/path-split-1.c: New.
        * gcc/doc/invoke.texi
        (ftree-path-split): Document.
        (fdump-tree-path_split): Document.

Signed-off-by: Ajit Agarwal <ajitkum@xilinx.com>

Thanks & Regards
Ajit

-----Original Message-----
From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-owner@gcc.gnu.org] On Behalf Of Ajit Kumar Agarwal
Sent: Wednesday, July 29, 2015 10:13 AM
To: Richard Biener; Jeff Law
Cc: GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
Subject: RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation



-----Original Message-----
From: Richard Biener [mailto:richard.guenther@gmail.com]
Sent: Thursday, July 16, 2015 4:30 PM
To: Ajit Kumar Agarwal
Cc: law@redhat.com; GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation

On Tue, Jul 7, 2015 at 3:22 PM, Ajit Kumar Agarwal <ajit.kumar.agarwal@xilinx.com> wrote:
>
>
> -----Original Message-----
> From: Richard Biener [mailto:richard.guenther@gmail.com]
> Sent: Tuesday, July 07, 2015 2:21 PM
> To: Ajit Kumar Agarwal
> Cc: law@redhat.com; GCC Patches; Vinod Kathail; Shail Aditya Gupta; 
> Vidhumouli Hunsigida; Nagaraju Mekala
> Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on 
> tree ssa representation
>
> On Sat, Jul 4, 2015 at 2:39 PM, Ajit Kumar Agarwal <ajit.kumar.agarwal@xilinx.com> wrote:
>>
>>
>> -----Original Message-----
>> From: Richard Biener [mailto:richard.guenther@gmail.com]
>> Sent: Tuesday, June 30, 2015 4:42 PM
>> To: Ajit Kumar Agarwal
>> Cc: law@redhat.com; GCC Patches; Vinod Kathail; Shail Aditya Gupta; 
>> Vidhumouli Hunsigida; Nagaraju Mekala
>> Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass 
>> on tree ssa representation
>>
>> On Tue, Jun 30, 2015 at 10:16 AM, Ajit Kumar Agarwal <ajit.kumar.agarwal@xilinx.com> wrote:
>>> All:
>>>
>>> The below patch added a new path Splitting optimization pass on SSA 
>>> representation. The Path Splitting optimization Pass moves the join 
>>> block of if-then-else same as loop latch to its predecessors and get merged with the predecessors Preserving the SSA representation.
>>>
>>> The patch is tested for Microblaze and i386 target. The 
>>> EEMBC/Mibench benchmarks is run with the Microblaze target And the 
>>> performance gain of 9.15% and rgbcmy01_lite(EEMBC benchmarks). The Deja GNU tests is run for Mircroblaze Target and no regression is seen for Microblaze target and the new testcase attached are passed.
>>>
>>> For i386 bootstrapping goes through fine and the Spec cpu2000 
>>> benchmarks is run with this patch. Following observation were seen with spec cpu2000 benchmarks.
>>>
>>> Ratio of path splitting change vs Ratio of not having path splitting change is 3653.353 vs 3652.14 for INT benchmarks.
>>> Ratio of path splitting change vs Ratio of not having path splitting change is  4353.812 vs 4345.351 for FP benchmarks.
>>>
>>> Based on comments from RFC patch following changes were done.
>>>
>>> 1. Added a new pass for path splitting changes.
>>> 2. Placed the new path  Splitting Optimization pass before the copy propagation pass.
>>> 3. The join block same as the Loop latch is wired into its 
>>> predecessors so that the CFG Cleanup pass will merge the blocks Wired together.
>>> 4. Copy propagation routines added for path splitting changes is not 
>>> needed as suggested by Jeff. They are removed in the patch as The copy propagation in the copied join blocks will be done by the existing copy propagation pass and the update ssa pass.
>>> 5. Only the propagation of phi results of the join block with the 
>>> phi argument is done which will not be done by the existing update_ssa Or copy propagation pass on tree ssa representation.
>>> 6. Added 2 tests.
>>>     a) compilation check  tests.
>>>    b) execution tests.
>>> 7. Refactoring of the code for the feasibility check and finding the join block same as loop latch node.
>>>
>>>     [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation.
>>>
>>>     Added a new pass on path splitting on tree SSA representation. The path
>>>     splitting optimization does the CFG transformation of join block of the
>>>     if-then-else same as the loop latch node is moved and merged with the
>>>     predecessor blocks after preserving the SSA representation.
>>>
>>>     ChangeLog:
>>>     2015-06-30  Ajit Agarwal  <ajitkum@xilinx.com>
>>>
>>>         * gcc/Makefile.in: Add the build of the new file
>>>         tree-ssa-path-split.c
>>>         * gcc/common.opt: Add the new flag ftree-path-split.
>>>         * gcc/opts.c: Add an entry for Path splitting pass
>>>         with optimization flag greater and equal to O2.
>>>         * gcc/passes.def: Enable and add new pass path splitting.
>>>         * gcc/timevar.def: Add the new entry for TV_TREE_PATH_SPLIT.
>>>         * gcc/tree-pass.h: Extern Declaration of make_pass_path_split.
>>>         * gcc/tree-ssa-path-split.c: New file for path splitting pass.
>>>         * gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c: New testcase.
>>>         * gcc/testsuite/gcc.dg/path-split-1.c: New testcase.
>>
>>>>I'm not 100% sure I understand the transform but what I see from the testcases it tail-duplicates from a conditional up to a loop latch block (not sure if it >>includes it and thus ends up creating a loop nest or not).
>>
>>>>An observation I have is that the pass should at least share the transform stage to some extent with the existing tracer pass (tracer.c) which essentially does >>the same but not restricted to loops in any way.
>>
>> The following piece of code from tracer.c can be shared with the existing path splitting pass.
>>
>> {
>>              e = find_edge (bb, bb2);
>>
>>               copy = duplicate_block (bb2, e, bb);
>>               flush_pending_stmts (e);
>>
>>               add_phi_args_after_copy (&copy, 1, NULL); }
>>
>> Sharing the above code of the transform stage of tracer.c with the path splitting pass has the following limitation.
>>
>> 1. The duplicated loop latch node is wired to its predecessors and 
>> the existing phi node in the loop latch node with the Phi arguments 
>> from its corresponding predecessors is moved to the duplicated loop latch node that is wired into its predecessors. Due To this, the duplicated loop latch nodes wired into its predecessors will not be merged with the original predecessors by CFG cleanup phase .
>>
>>>> So I wonder if your pass could be simply another heuristic to compute paths to trace in the existing tracer pass.
>>
>> Sorry, I am not very clear when you say the above.  I am trying to 
>> figure out whether you expect the existing pass of tracer.c should be modified Or the path splitting pass should coexist.
>
>>>Yes, I was wondering whether tracer.c could be simply modified.  Both transforms are doing something very similar.
>>>Yes, your pass would simply compute extra traces based on the new heuristic.
>
> I have observed the following with the tracer pass optimization.
>
> Tracer Pass:
>
> 1. The tracer pass is FDO optimizations  and is not enabled by default at O2. This optimization is enabled with -fprofile-use.
> 2. The -ftracer flag is used to enable the optimization explicitly in the absence of FDO optimization.
> 3. The tracer pass optimizations is placed at only place before the 
> dominator_pass. Moving the tracer pass before copy_prop Pass gives the following error. " Error : pass tracer does not support cloning".

This is an error from the pass manager, you added a second tracer pass instead of moving it.

> 4. The code for tracer pass is totally based on the FDO related information and the logic is based on the profile data.

>>Yes.

> Path Spliiting pass:
>
> 1. Having the path splitting as a separate pass is enabled by default at  >= O2.

>>Well, it's debatable on whether you want to enable it at -O2.  I seriously doubt that.

> 2. No FDO information is required in the path splitting pass.
> 3. The Path Splitting pass can be placed anywhere well before any 
> optimizations pass. I have placed the path splitting pass before Copy_prop and it works. Also placing before the dominator also works fine.

>>Same for tracer - see above.

> 4. The code for path splitting as a separate pass is purely based on non profile and Non FDO data.
> 5. Placing the path splitting pass as a separate pass  can be placed 
> anywhere in the optimizations. The optimizations that got benefitted with the Path splitting pass are PRE , CCP, DCE and can be placed well before the optimizations.
> 6. At the first phase of path splitting pass I am duplicating the loop 
> latch node to its predecessor and make it SSA and loop structure 
> Preserved. Making the path splitting as separate pass I would like to extend this pass to the multiple latch node with a forwarding block and the multiple latch nodes are edged towards the forwarder block making it one Loop latch edge.
>
> With the above observation, Do you think the existing tracer pass 
> should be modified and the path splitting pass should be incorporated On top of the existing tracer pass.

>>I still think both passes do a fundamentally similar transform and thus should be able to share data structures ('path' representation), parts of analysis (can >>this path be duplicated?) and the transform stage.  Yes, it might be that two instances of the pass will run in the end, one doing path splitting and one doing >>tracing, at different times in the optimization pipeline.

>>We have another similar transform with similar needs on data structures and analysis / transform.  Jump threading.

Thanks. I am going to make the following change and send for review.

 Both path splitting and the tracer should be separate passes, and the common code between them should be abstracted out so that the path splitting pass and the tracer pass both use this common piece of code.

In the next phase of the path splitting transformation, I would like to handle the case where a basic block dominates the IF node and has the IF node as its successor: that block would then be duplicated into the THEN and ELSE paths, similar to head duplication, thus widening the scope of path splitting to enable more PRE, DCE and CCP optimizations.

Please let me know if there is any concern.

Thanks & Regards
Ajit

>>It would be nice to have a common machinery here, for example to assess cost of doing a trace.

Richard.

> Kindly give your feedback.
>
> Thanks & Regards
> Ajit
>
> Richard.
>
>> Thanks & Regards
>> Ajit
>>
>> Thanks,
>> Richard.
>>
>>>     Signed-off-by:Ajit Agarwal ajitkum@xilinx.com.
>>>
>>> gcc/Makefile.in                              |   1 +
>>>  gcc/common.opt                               |   4 +
>>>  gcc/opts.c                                   |   1 +
>>>  gcc/passes.def                               |   1 +
>>>  gcc/testsuite/gcc.dg/path-split-1.c          |  65 ++++
>>>  gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c |  62 ++++
>>>  gcc/timevar.def                              |   1 +
>>>  gcc/tree-pass.h                              |   1 +
>>>  gcc/tree-ssa-path-split.c                    | 462 +++++++++++++++++++++++++++
>>>  9 files changed, 598 insertions(+)
>>>  create mode 100644 gcc/testsuite/gcc.dg/path-split-1.c
>>>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
>>>  create mode 100644 gcc/tree-ssa-path-split.c
>>>
>>> diff --git a/gcc/Makefile.in b/gcc/Makefile.in index
>>> 5f9261f..35ac363
>>> 100644
>>> --- a/gcc/Makefile.in
>>> +++ b/gcc/Makefile.in
>>> @@ -1476,6 +1476,7 @@ OBJS = \
>>>         tree-vect-slp.o \
>>>         tree-vectorizer.o \
>>>         tree-vrp.o \
>>> +        tree-ssa-path-split.o \
>>>         tree.o \
>>>         valtrack.o \
>>>         value-prof.o \
>>> diff --git a/gcc/common.opt b/gcc/common.opt index e104269..c63b100
>>> 100644
>>> --- a/gcc/common.opt
>>> +++ b/gcc/common.opt
>>> @@ -2328,6 +2328,10 @@ ftree-vrp
>>>  Common Report Var(flag_tree_vrp) Init(0) Optimization  Perform 
>>> Value Range Propagation on trees
>>>
>>> +ftree-path-split
>>> +Common Report Var(flag_tree_path_split) Init(0) Optimization 
>>> +Perform Path Splitting
>>> +
>>>  funit-at-a-time
>>>  Common Report Var(flag_unit_at_a_time) Init(1) Optimization Compile 
>>> whole compilation unit at a time diff --git a/gcc/opts.c 
>>> b/gcc/opts.c index 8a16116..31947ff 100644
>>> --- a/gcc/opts.c
>>> +++ b/gcc/opts.c
>>> @@ -508,6 +508,7 @@ static const struct default_options default_options_table[] =
>>>      { OPT_LEVELS_2_PLUS, OPT_fisolate_erroneous_paths_dereference, NULL, 1 },
>>>      { OPT_LEVELS_2_PLUS, OPT_fipa_ra, NULL, 1 },
>>>      { OPT_LEVELS_2_PLUS, OPT_flra_remat, NULL, 1 },
>>> +    { OPT_LEVELS_2_PLUS, OPT_ftree_path_split, NULL, 1 },
>>>
>>>      /* -O3 optimizations.  */
>>>      { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL,
>>> 1 }, diff --git a/gcc/passes.def b/gcc/passes.def index 
>>> c0ddee4..43618eb
>>> 100644
>>> --- a/gcc/passes.def
>>> +++ b/gcc/passes.def
>>> @@ -155,6 +155,7 @@ along with GCC; see the file COPYING3.  If not see
>>>        NEXT_PASS (pass_ccp);
>>>        /* After CCP we rewrite no longer addressed locals into SSA
>>>          form if possible.  */
>>> +      NEXT_PASS (pass_path_split);
>>>        NEXT_PASS (pass_copy_prop);
>>>        NEXT_PASS (pass_complete_unrolli);
>>>        NEXT_PASS (pass_phiprop);
>>> diff --git a/gcc/testsuite/gcc.dg/path-split-1.c
>>> b/gcc/testsuite/gcc.dg/path-split-1.c
>>> new file mode 100644
>>> index 0000000..075dc87
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.dg/path-split-1.c
>>> @@ -0,0 +1,65 @@
>>> +/* { dg-do run } */
>>> +/* { dg-options "-O2 " } */
>>> +
>>> +#include <stdio.h>
>>> +#include <stdlib.h>
>>> +
>>> +#define RGBMAX 255
>>> +
>>> +int
>>> +test()
>>> +{
>>> +  int i, Pels;
>>> +  unsigned char sum = 0;
>>> +  unsigned char xr, xg, xb;
>>> +  unsigned char xc, xm, xy, xk;
>>> +  unsigned char *ReadPtr, *EritePtr;
>>> +
>>> +  ReadPtr = ( unsigned char *) malloc (sizeof (unsigned char) * 
>>> + 100); EritePtr = ( unsigned char *) malloc (sizeof (unsigned char)
>>> + * 100);
>>> +
>>> +  for (i = 0; i < 100;i++)
>>> +     {
>>> +       ReadPtr[i] = 100 - i;
>>> +     }
>>> +
>>> +  for (i = 0; i < 100; i++)
>>> +     {
>>> +       xr = *ReadPtr++;
>>> +       xg = *ReadPtr++;
>>> +       xb = *ReadPtr++;
>>> +
>>> +       xc = (unsigned char) (RGBMAX - xr);
>>> +       xm = (unsigned char) (RGBMAX - xg);
>>> +       xy = (unsigned char) (RGBMAX - xb);
>>> +
>>> +       if (xc < xm)
>>> +         {
>>> +           xk = (unsigned char) (xc < xy ? xc : xy);
>>> +         }
>>> +       else
>>> +        {
>>> +          xk = (unsigned char) (xm < xy ? xm : xy);
>>> +        }
>>> +
>>> +       xc = (unsigned char) (xc - xk);
>>> +       xm = (unsigned char) (xm - xk);
>>> +       xy = (unsigned char) (xy - xk);
>>> +
>>> +       *EritePtr++ = xc;
>>> +       *EritePtr++ = xm;
>>> +       *EritePtr++ = xy;
>>> +       *EritePtr++ = xk;
>>> +       sum += *EritePtr;
>>> +    }
>>> +  return sum;
>>> +}
>>> +
>>> +int
>>> +main()
>>> +{
>>> +  if (test() != 33)
>>> +    abort();
>>> +
>>> +  return 0;
>>> +}
>>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
>>> b/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
>>> new file mode 100644
>>> index 0000000..19f277c
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
>>> @@ -0,0 +1,62 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-O2 -fdump-tree-path_split" } */
>>> +
>>> +#include <stdio.h>
>>> +#include <stdlib.h>
>>> +
>>> +#define RGBMAX 255
>>> +
>>> +int
>>> +test()
>>> +{
>>> +  int i, Pels;
>>> +  unsigned char sum = 0;
>>> +  unsigned char xr, xg, xb;
>>> +  unsigned char xc, xm, xy, xk;
>>> +  unsigned char *ReadPtr, *EritePtr;
>>> +
>>> +  ReadPtr = (unsigned char *) malloc (sizeof (unsigned char) * 
>>> + 100); EritePtr = ( unsigned char *) malloc (sizeof (unsigned char)
>>> + * 100);
>>> +
>>> +  for (i = 0; i < 100;i++)
>>> +     {
>>> +       ReadPtr[i] = 100 - i;
>>> +     }
>>> +
>>> +  for (i = 0; i < 100; i++)
>>> +     {
>>> +       xr = *ReadPtr++;
>>> +       xg = *ReadPtr++;
>>> +       xb = *ReadPtr++;
>>> +
>>> +       xc = ( unsigned char) (RGBMAX - xr);
>>> +       xm = ( unsigned char) (RGBMAX - xg);
>>> +       xy = ( unsigned char) (RGBMAX - xb);
>>> +
>>> +       if (xc < xm)
>>> +         {
>>> +           xk = ( unsigned char) (xc < xy ? xc : xy);
>>> +         }
>>> +       else
>>> +         {
>>> +           xk = ( unsigned char) (xm < xy ? xm : xy);
>>> +         }
>>> +
>>> +       xc = (unsigned char) (xc - xk);
>>> +       xm = (unsigned char) (xm - xk);
>>> +       xy = (unsigned char) (xy - xk);
>>> +
>>> +       *EritePtr++ = xc;
>>> +       *EritePtr++ = xm;
>>> +       *EritePtr++ = xy;
>>> +       *EritePtr++ = xk;
>>> +       sum += *EritePtr;
>>> +    }
>>> +  return sum;
>>> +}
>>> +
>>> +/* { dg-final { scan-tree-dump "xc_[0-9][0-9]* -> { xc_[0-9][0-9]* }"
>>> +"path_split"} } */
>>> +/* { dg-final { scan-tree-dump "xm_[0-9][0-9]* -> { xm_[0-9][0-9]* }"
>>> +"path_split"} } */
>>> +/* { dg-final { scan-tree-dump "xy_[0-9][0-9]* -> { xy_[0-9][0-9]* }"
>>> +"path_split"} } */
>>> +/* { dg-final { scan-tree-dump "Merging blocks" "path_split"} } */
>>> +/* { dg-final { cleanup-tree-dump "path_split" } } */
>>> diff --git a/gcc/timevar.def b/gcc/timevar.def index 
>>> 711bbed..6217a8e
>>> 100644
>>> --- a/gcc/timevar.def
>>> +++ b/gcc/timevar.def
>>> @@ -288,3 +288,4 @@ DEFTIMEVAR (TV_JIT_REPLAY        , "replay of JIT client activity")
>>>  DEFTIMEVAR (TV_ASSEMBLE             , "assemble JIT code")
>>>  DEFTIMEVAR (TV_LINK                 , "link JIT code")
>>>  DEFTIMEVAR (TV_LOAD                 , "load JIT result")
>>> +DEFTIMEVAR (TV_TREE_PATH_SPLIT  , "tree path_split")
>>> diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index 
>>> 398ab83..e00639e
>>> 100644
>>> --- a/gcc/tree-pass.h
>>> +++ b/gcc/tree-pass.h
>>> @@ -379,6 +379,7 @@ extern gimple_opt_pass *make_pass_iv_optimize 
>>> (gcc::context *ctxt);  extern gimple_opt_pass 
>>> *make_pass_tree_loop_done (gcc::context *ctxt);  extern 
>>> gimple_opt_pass *make_pass_ch (gcc::context *ctxt);  extern 
>>> gimple_opt_pass *make_pass_ccp (gcc::context *ctxt);
>>> +extern gimple_opt_pass *make_pass_path_split (gcc::context *ctxt);
>>>  extern gimple_opt_pass *make_pass_phi_only_cprop (gcc::context 
>>> *ctxt);  extern gimple_opt_pass *make_pass_build_ssa (gcc::context 
>>> *ctxt);  extern gimple_opt_pass *make_pass_build_alias (gcc::context 
>>> *ctxt); diff --git a/gcc/tree-ssa-path-split.c 
>>> b/gcc/tree-ssa-path-split.c new file mode 100644 index
>>> 0000000..3da7791
>>> --- /dev/null
>>> +++ b/gcc/tree-ssa-path-split.c
>>> @@ -0,0 +1,462 @@
>>> +/* Support routines for Path Splitting.
>>> +   Copyright (C) 2015 Free Software Foundation, Inc.
>>> +   Contributed by Ajit Kumar Agarwal <ajitkum@xilinx.com>.
>>> +
>>> + This file is part of GCC.
>>> +
>>> + GCC is free software; you can redistribute it and/or modify it 
>>> + under the terms of the GNU General Public License as published by 
>>> + the Free Software Foundation; either version 3, or (at your
>>> + option) any later version.
>>> +
>>> +GCC is distributed in the hope that it will be useful, but WITHOUT 
>>> +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 
>>> +or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public 
>>> +License for more details.
>>> +
>>> +You should have received a copy of the GNU General Public License 
>>> +along with GCC; see the file COPYING3.  If not see 
>>> +<http://www.gnu.org/licenses/>.  */
>>> +
>>> +#include "config.h"
>>> +#include "system.h"
>>> +#include "coretypes.h"
>>> +#include "tm.h"
>>> +#include "flags.h"
>>> +#include "tree.h"
>>> +#include "stor-layout.h"
>>> +#include "calls.h"
>>> +#include "predict.h"
>>> +#include "vec.h"
>>> +#include "hashtab.h"
>>> +#include "hash-set.h"
>>> +#include "machmode.h"
>>> +#include "hard-reg-set.h"
>>> +#include "input.h"
>>> +#include "function.h"
>>> +#include "dominance.h"
>>> +#include "cfg.h"
>>> +#include "cfganal.h"
>>> +#include "basic-block.h"
>>> +#include "tree-ssa-alias.h"
>>> +#include "internal-fn.h"
>>> +#include "gimple-fold.h"
>>> +#include "tree-eh.h"
>>> +#include "gimple-expr.h"
>>> +#include "is-a.h"
>>> +#include "gimple.h"
>>> +#include "gimple-iterator.h"
>>> +#include "gimple-walk.h"
>>> +#include "gimple-ssa.h"
>>> +#include "tree-cfg.h"
>>> +#include "tree-phinodes.h"
>>> +#include "ssa-iterators.h"
>>> +#include "stringpool.h"
>>> +#include "tree-ssanames.h"
>>> +#include "tree-ssa-loop-manip.h"
>>> +#include "tree-ssa-loop-niter.h"
>>> +#include "tree-ssa-loop.h"
>>> +#include "tree-into-ssa.h"
>>> +#include "tree-ssa.h"
>>> +#include "tree-pass.h"
>>> +#include "tree-dump.h"
>>> +#include "gimple-pretty-print.h"
>>> +#include "diagnostic-core.h"
>>> +#include "intl.h"
>>> +#include "cfgloop.h"
>>> +#include "tree-scalar-evolution.h"
>>> +#include "tree-ssa-propagate.h"
>>> +#include "tree-chrec.h"
>>> +#include "tree-ssa-threadupdate.h"
>>> +#include "expr.h"
>>> +#include "insn-codes.h"
>>> +#include "optabs.h"
>>> +#include "tree-ssa-threadedge.h"
>>> +#include "wide-int.h"
>>> +
>>> +/* Replace_uses_phi function propagates the phi results with the
>>> +   first phi argument into each of the copied join blocks wired into
>>> +   its predecessors. This function is called from the replace_uses_phi
>>> +   to replace the uses of first phi arguments with the second
>>> +   phi arguments in the next copy of join block.  */
>>> +
>>> +static void
>>> +replace_use_phi_operand1_with_operand2 (basic_block b,
>>> +                                        tree use1,
>>> +                                        tree use2) {
>>> +  use_operand_p use;
>>> +  ssa_op_iter iter;
>>> +  gimple_stmt_iterator gsi;
>>> +
>>> +  for (gsi = gsi_start_bb (b); !gsi_end_p (gsi);)
>>> +     {
>>> +       gimple stmt = gsi_stmt (gsi);
>>> +       FOR_EACH_SSA_USE_OPERAND (use, stmt, iter, SSA_OP_USE)
>>> +       {
>>> +         tree tuse = USE_FROM_PTR (use);
>>> +          if (use1 == tuse || use1 == NULL_TREE)
>>> +            {
>>> +              propagate_value (use, use2);
>>> +              update_stmt(stmt);
>>> +            }
>>> +        }
>>> +       gsi_next(&gsi);
>>> +     }
>>> +}
>>> +
>>> +/* This function propagates the phi result into the use points with
>>> +   the phi arguments. The join block is copied and wired into the
>>> +   predecessors. Since the use points of the phi results will be same
>>> +   in the each of the copy join blocks in the  predecessors, it
>>> +   propagates the phi arguments in the copy of the join blocks wired
>>> +   into its predecessor.  */
>>> +
>>> +static
>>> +void replace_uses_phi (basic_block b, basic_block temp_bb) {
>>> +  gimple_seq phis = phi_nodes (b);
>>> +  gimple phi = gimple_seq_first_stmt (phis);
>>> +  tree def = gimple_phi_result (phi), use = gimple_phi_arg_def 
>>> +(phi,0);
>>> +  tree use2 = gimple_phi_arg_def (phi,1);
>>> +
>>> +  if (virtual_operand_p (def))
>>> +    {
>>> +      imm_use_iterator iter;
>>> +      use_operand_p use_p;
>>> +      gimple stmt;
>>> +
>>> +      FOR_EACH_IMM_USE_STMT (stmt, iter, def)
>>> +        FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
>>> +          SET_USE (use_p, use);
>>> +      if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (def))
>>> +        SSA_NAME_OCCURS_IN_ABNORMAL_PHI (use) = 1;
>>> +    }
>>> +   else
>>> +     replace_uses_by (def, use);
>>> +   replace_use_phi_operand1_with_operand2 (temp_bb, use, use2); }
>>> +
>>> +/* Returns true if the block bb has label or call statements.
>>> +   Otherwise return false.  */
>>> +
>>> +static bool
>>> +is_block_has_label_call (basic_block bb) {
>>> +  gimple_stmt_iterator gsi;
>>> +
>>> +  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
>>> +     {
>>> +       gimple stmt = gsi_stmt(gsi);
>>> +       if (dyn_cast <glabel *> (stmt))
>>> +         {
>>> +           return true;
>>> +         }
>>> +       if (is_gimple_call (stmt))
>>> +         return true;
>>> +     }
>>> +  return false;
>>> +}
>>> +
>>> +/* This function performs the feasibility tests for path splitting
>>> +   to perform. Return false if the feasibility for path splitting
>>> +   is not done and returns true if the feasbility for path splitting
>>> +   is done. Following feasibility tests are performed.
>>> +
>>> +   1. Return false if the join block has call gimple statements.
>>> +   2. Return false if the join block has rhs casting for assign
>>> +      gimple statements.
>>> +   3. If the number of phis is greater than 1 or the phi node in
>>> +      the join block has virtual operand return false.
>>> +   4. Return false if the number of sequential statements is
>>> +      greater than 2.
>>> +   5. If the predecessors blocks has labels and call statements
>>> +      return false.
>>> +   6. If the phi result in the phi node of the join block is not
>>> +      used inside the same join block return false.
>>> +   7. Otherwise returns true.  */
>>> +
>>> +static bool
>>> +is_feasible_path_splitting (basic_block join_node, basic_block pred1,
>>> +                           basic_block pred2) {
>>> +  int num_stmt = 0, num_phis = 0;
>>> +  gimple_stmt_iterator psi, gsi;
>>> +
>>> +  for (gsi = gsi_start_bb (join_node); !gsi_end_p (gsi); gsi_next (&gsi))
>>> +     {
>>> +       gimple stmt = gsi_stmt(gsi);
>>> +
>>> +       if (gimple_assign_cast_p (stmt))
>>> +         return false;
>>> +
>>> +       if (is_gimple_call (stmt))
>>> +         return false;
>>> +
>>> +       if (!is_gimple_debug(stmt))
>>> +         {
>>> +           num_stmt++;
>>> +         }
>>> +     }
>>> +
>>> +   if (pred1 && pred2 && (num_stmt > 2))
>>> +     {
>>> +       bool found_virtual_result = false;
>>> +
>>> +       for (psi = gsi_start_phis (join_node); !gsi_end_p (psi); )
>>> +          {
>>> +            use_operand_p use_p;
>>> +            imm_use_iterator iter;
>>> +            gimple stmt = gsi_stmt(psi);
>>> +
>>> +            if (!virtual_operand_p (gimple_phi_result (stmt)))
>>> +              num_phis++;
>>> +            else
>>> +              found_virtual_result = true;
>>> +
>>> +            FOR_EACH_IMM_USE_FAST (use_p, iter, gimple_phi_result (stmt))
>>> +            {
>>> +              gimple use_stmt = USE_STMT (use_p);
>>> +
>>> +              if (gimple_bb (use_stmt) != join_node)
>>> +                return false;
>>> +            }
>>> +
>>> +            gsi_next(&psi);
>>> +         }
>>> +
>>> +       if ((num_phis >1) || found_virtual_result)
>>> +          return false;
>>> +
>>> +       if(is_block_has_label_call(pred1) || is_block_has_label_call(pred2))
>>> +         return false;
>>> +
>>> +       return true;
>>> +    }
>>> +  return false;
>>> +}
>>> +
>>> +/* Update the statements in the basic block with the basic
>>> +   basic block.  */
>>> +
>>> +static void
>>> +update_stmt_bb(basic_block b)
>>> +{
>>> +  gimple_stmt_iterator gsi;
>>> +  for(gsi = gsi_start_bb(b); !gsi_end_p(gsi); gsi_next(&gsi))
>>> +   {
>>> +     gimple stmt = gsi_stmt(gsi);
>>> +     gimple_set_bb(stmt,b);
>>> +   }
>>> +}
>>> +
>>> +/* This function gets the join blocks same as the source
>>> +   node of the loop latch nodes and the predecessors of
>>> +   the join block is updated in the pred1 and pred2 passed
>>> +   as the reference arguments into the function. Return
>>> +   the join block.  */
>>> +
>>> +static basic_block
>>> +get_join_blk_same_as_loop_latch (basic_block bb,
>>> +                                 basic_block &pred1,
>>> +                                 basic_block &pred2) {
>>> +  vec<basic_block> bbs;
>>> +  basic_block bb1;
>>> +  unsigned int i;
>>> +  edge_iterator ei;
>>> +  edge e1;
>>> +  bool found = false ,found1;
>>> +  bbs = get_all_dominated_blocks (CDI_DOMINATORS,
>>> +                                  bb );
>>> +  FOR_EACH_VEC_ELT (bbs, i, bb1)
>>> +  {
>>> +    found1 = false;
>>> +    FOR_EACH_EDGE (e1, ei, bb->succs)
>>> +    {
>>> +      if ( bb1 == e1->dest)
>>> +        {
>>> +          found = true;
>>> +          found1 = true;
>>> +        }
>>> +    }
>>> +    if (!found1 && found)
>>> +      {
>>> +        found = false;
>>> +        FOR_EACH_EDGE (e1, ei, bb1->succs)
>>> +        {
>>> +          if (e1->flags & (EDGE_DFS_BACK))
>>> +            found = true;
>>> +        }
>>> +
>>> +        if (found && EDGE_COUNT(bb1->preds) == 2)
>>> +          {
>>> +            unsigned int k = 0;
>>> +            FOR_EACH_EDGE (e1, ei, bb1->preds)
>>> +            {
>>> +              if ((e1->flags & (EDGE_DFS_BACK)))
>>> +                continue;
>>> +
>>> +              if ( k == 1)
>>> +                {
>>> +                  if (single_succ_p(e1->src) &&
>>> +                      single_succ_edge (e1->src)->flags & EDGE_FALLTHRU)
>>> +                    {
>>> +                      pred2 = e1->src;
>>> +                    }
>>> +                }
>>> +                else
>>> +                  {
>>> +                    if (single_succ_p(e1->src) &&
>>> +                        single_succ_edge (e1->src)->flags & EDGE_FALLTHRU)
>>> +                      {
>>> +                        pred1 = e1->src;
>>> +                      }
>>> +                  }
>>> +                k++;
>>> +            }
>>> +            bbs.release();
>>> +            return bb1;
>>> +          }
>>> +       }
>>> +   }
>>> +   bbs.release();
>>> +   return NULL;
>>> +}
>>> +
>>> +/* This is the core function to perform path splitting. The join
>>> +   same as the source of the loop latch node is identified along
>>> +   with their predecessors. Based on the feasibility tests for
>>> +   path splitting the path splitting is performed by wiring the
>>> +   copy of join blocks into the predecessors and propagating the phi
>>> +   result with the corresponding phi arguments into each of the copy
>>> +   of join blocks wired with the original predecessors of the join
>>> +   block.
>>> +
>>> +   The  tree-cfg-cleanup will merge the blocks in the predecessors
>>> +   path and the update-ssa will update the ssa representation after
>>> +   the path splitting is performed.  */
>>> +
>>> +static void
>>> +perform_path_splitting (basic_block bb) {
>>> +  basic_block pred1 = NULL, pred2 = NULL, join_block = NULL;
>>> +
>>> +  join_block = get_join_blk_same_as_loop_latch (bb, pred1, pred2);
>>> +
>>> +  if (join_block  &&
>>> +      is_feasible_path_splitting (join_block, pred1, pred2))
>>> +    {
>>> +      basic_block new_bb1 = NULL, new_bb2 = NULL;
>>> +      gimple_stmt_iterator last;
>>> +      basic_block temp_bb = NULL;
>>> +      edge_iterator ei;
>>> +      edge e1;
>>> +
>>> +      temp_bb = duplicate_block (join_block, NULL, NULL);
>>> +
>>> +      FOR_EACH_EDGE (e1, ei, pred1->succs)
>>> +        new_bb1 = split_edge (e1);
>>> +
>>> +      FOR_EACH_EDGE (e1, ei, pred2->succs)
>>> +        new_bb2 = split_edge (e1);
>>> +
>>> +      last = gsi_start_bb (new_bb1);
>>> +      gsi_insert_seq_after (&last, bb_seq (join_block), GSI_NEW_STMT);
>>> +      last = gsi_start_bb (new_bb2);
>>> +      gsi_insert_seq_after (&last, bb_seq (temp_bb), GSI_NEW_STMT);
>>> +      update_stmt_bb (new_bb1);
>>> +      update_stmt_bb (new_bb2);
>>> +
>>> +      replace_uses_phi (join_block, new_bb2);
>>> +
>>> +      set_bb_seq (join_block, NULL);
>>> +      set_bb_seq(temp_bb,NULL);
>>> +      delete_basic_block (temp_bb);
>>> +      return;
>>> +    }
>>> +}
>>> +
>>> +static unsigned int
>>> +execute_path_split (void)
>>> +{
>>> +  basic_block bb;
>>> +
>>> +  loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS); 
>>> + initialize_original_copy_tables();
>>> +
>>> +  calculate_dominance_info (CDI_DOMINATORS); 
>>> + calculate_dominance_info (CDI_POST_DOMINATORS);
>>> +
>>> +  mark_dfs_back_edges ();
>>> +
>>> +  FOR_EACH_BB_FN (bb, cfun)
>>> +  {
>>> +    gimple last;
>>> +
>>> +    /* We only care about blocks ending in a COND_EXPR. */
>>> +
>>> +    last = gsi_stmt (gsi_last_bb (bb));
>>> +
>>> +    /* We're basically looking for a switch or any kind of conditional with
>>> +       integral or pointer type arguments.  Note the type of the second
>>> +       argument will be the same as the first argument, so no need to
>>> +       check it explicitly.  */
>>> +    if ((last && (gimple_code (last) == GIMPLE_COND
>>> +            && TREE_CODE (gimple_cond_lhs (last)) == SSA_NAME
>>> +            && (INTEGRAL_TYPE_P (TREE_TYPE (gimple_cond_lhs (last)))
>>> +            || POINTER_TYPE_P (TREE_TYPE (gimple_cond_lhs (last))))
>>> +            && (TREE_CODE (gimple_cond_rhs (last)) == SSA_NAME
>>> +            || is_gimple_min_invariant (gimple_cond_rhs (last))))))
>>> +      {
>>> +
>>> +         if (gimple_code(last) == GIMPLE_COND)
>>> +           {
>>> +              perform_path_splitting (bb);
>>> +           }
>>> +      }
>>> +   }
>>> +
>>> +   loop_optimizer_finalize ();
>>> +   free_original_copy_tables ();
>>> +   free_dominance_info (CDI_DOMINATORS);
>>> +   free_dominance_info (CDI_POST_DOMINATORS);
>>> +   return 0;
>>> +}
>>> +
>>> +namespace {
>>> +
>>> +const pass_data pass_data_path_split = {
>>> +   GIMPLE_PASS, /* type */
>>> +   "path_split", /* name */
>>> +    OPTGROUP_NONE, /* optinfo_flags */
>>> +    TV_TREE_PATH_SPLIT, /* tv_id */
>>> +    PROP_ssa, /* properties_required */
>>> +    0, /* properties_provided */
>>> +    0, /* properties_destroyed */
>>> +    0, /* todo_flags_start */
>>> +    ( TODO_cleanup_cfg | TODO_update_ssa ), /* todo_flags_finish */ 
>>> +};
>>> +
>>> +class pass_path_split : public gimple_opt_pass {
>>> +   public:
>>> +    pass_path_split (gcc::context *ctxt)
>>> +      : gimple_opt_pass (pass_data_path_split, ctxt)
>>> +    {}
>>> +
>>> +   /* opt_pass methods: */
>>> +   opt_pass * clone () { return new pass_path_split (m_ctxt); }
>>> +   virtual bool gate (function *) { return flag_tree_path_split != 0; }
>>> +   virtual unsigned int execute (function *) { return 
>>> + execute_path_split (); }
>>> +
>>> +}; // class pass_path_split
>>> +
>>> +} // anon namespace
>>> +
>>> +gimple_opt_pass *
>>> +make_pass_path_split (gcc::context *ctxt) {
>>> +  return new pass_path_split (ctxt); }
>>> --
>>> 1.8.2.1
>>>
>>> Thanks & Regards
>>> Ajit

[-- Attachment #2: path-splitting.patch --]
[-- Type: application/octet-stream, Size: 21987 bytes --]

From cf2b64cc1d6623424d770f2a9ea257eb7e58e887 Mon Sep 17 00:00:00 2001
From: Ajit Kumar Agarwal <ajitkum@xilix.com>
Date: Sat, 15 Aug 2015 18:19:14 +0200
Subject: [PATCH] [Patch,tree-optimization]: Add new path Splitting pass on
 tree ssa representation.

Added a new pass on path splitting on tree SSA representation. The path
splitting optimization does the CFG transformation of join block of the
if-then-else same as the loop latch node is moved and merged with the
predecessor blocks after preserving the SSA representation.

ChangeLog:
2015-08-15  Ajit Agarwal  <ajitkum@xilinx.com>

	* gcc/Makefile.in: Add the build of the new file
	tree-ssa-path-split.c
	* gcc/common.opt (ftree-path-split): Add the new flag.
	* gcc/opts.c (OPT_ftree_path_split) : Add an entry for
	Path splitting pass with optimization flag greater and
	equal to O2.
	* gcc/passes.def (path_split): add new path splitting pass.
	* gcc/timevar.def (TV_TREE_PATH_SPLIT): New.
	* gcc/tree-pass.h (make_pass_path_split): New declaration.
	* gcc/tree-ssa-path-split.c: New.
	* gcc/tracer.c (transform_duplicate): New.
	* gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c: New.
	* gcc/testsuite/gcc.dg/path-split-1.c: New.
	* gcc/doc/invoke.texi
	(ftree-path-split): Document.
	(fdump-tree-path_split): Document.

Signed-off-by: Ajit Agarwal <ajitkum@xilinx.com>
---
 gcc/Makefile.in                              |   1 +
 gcc/common.opt                               |   4 +
 gcc/doc/invoke.texi                          |  16 +-
 gcc/opts.c                                   |   1 +
 gcc/passes.def                               |   1 +
 gcc/testsuite/gcc.dg/path-split-1.c          |  65 ++++++
 gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c |  60 +++++
 gcc/timevar.def                              |   1 +
 gcc/tracer.c                                 |  37 +--
 gcc/tree-pass.h                              |   1 +
 gcc/tree-ssa-path-split.c                    | 330 +++++++++++++++++++++++++++
 11 files changed, 503 insertions(+), 14 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/path-split-1.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
 create mode 100644 gcc/tree-ssa-path-split.c

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index c1cb4ce..f01f885 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1465,6 +1465,7 @@ OBJS = \
 	tree-ssa-loop.o \
 	tree-ssa-math-opts.o \
 	tree-ssa-operands.o \
+        tree-ssa-path-split.o \
 	tree-ssa-phiopt.o \
 	tree-ssa-phiprop.o \
 	tree-ssa-pre.o \
diff --git a/gcc/common.opt b/gcc/common.opt
index e80eadf..1d02582 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -2378,6 +2378,10 @@ ftree-vrp
 Common Report Var(flag_tree_vrp) Init(0) Optimization
 Perform Value Range Propagation on trees
 
+ftree-path-split
+Common Report Var(flag_tree_path_split) Init(0) Optimization
+Perform Path Splitting
+
 funit-at-a-time
 Common Report Var(flag_unit_at_a_time) Init(1)
 Compile whole compilation unit at a time
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index c26cd87..dce31e7 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -350,6 +350,7 @@ Objective-C and Objective-C++ Dialects}.
 -fdump-tree-fre@r{[}-@var{n}@r{]} @gol
 -fdump-tree-vtable-verify @gol
 -fdump-tree-vrp@r{[}-@var{n}@r{]} @gol
+-fdump-tree-path_split@r{[}-@var{n}@r{]} @gol
 -fdump-tree-storeccp@r{[}-@var{n}@r{]} @gol
 -fdump-final-insns=@var{file} @gol
 -fcompare-debug@r{[}=@var{opts}@r{]}  -fcompare-debug-second @gol
@@ -458,7 +459,7 @@ Objective-C and Objective-C++ Dialects}.
 -ftree-parallelize-loops=@var{n} -ftree-pre -ftree-partial-pre -ftree-pta @gol
 -ftree-reassoc -ftree-sink -ftree-slsr -ftree-sra @gol
 -ftree-switch-conversion -ftree-tail-merge -ftree-ter @gol
--ftree-vectorize -ftree-vrp @gol
+-ftree-vectorize -ftree-vrp -ftree-path-split @gol
 -funit-at-a-time -funroll-all-loops -funroll-loops @gol
 -funsafe-loop-optimizations -funsafe-math-optimizations -funswitch-loops @gol
 -fipa-ra -fvariable-expansion-in-unroller -fvect-cost-model -fvpt @gol
@@ -7158,6 +7159,11 @@ is made by appending @file{.slp} to the source file name.
 Dump each function after Value Range Propagation (VRP).  The file name
 is made by appending @file{.vrp} to the source file name.
 
+@item path_split 
+@opindex fdump-tree-path_split
+Dump each function after path splitting.  The file name is made by 
+appending @file{.path_split} to the source file name.
+
 @item all
 @opindex fdump-tree-all
 Enable all the available tree dumps with the flags provided in this option.
@@ -7660,6 +7666,7 @@ also turns on the following optimization flags:
 -ftree-switch-conversion -ftree-tail-merge @gol
 -ftree-pre @gol
 -ftree-vrp @gol
+-ftree-path-split @gol
 -fipa-ra}
 
 Please note the warning under @option{-fgcse} about
@@ -9068,6 +9075,13 @@ enabled by default at @option{-O2} and higher.  Null pointer check
 elimination is only done if @option{-fdelete-null-pointer-checks} is
 enabled.
 
+@item -ftree-path-split
+@opindex ftree-path-split
+Perform path splitting on trees.  The join block of an IF-THEN-ELSE
+that is also the loop latch node is moved to its predecessors, and the
+loop latch node becomes a forwarding block.  This is enabled by default
+at @option{-O2} and higher.
+
 @item -fsplit-ivs-in-unroller
 @opindex fsplit-ivs-in-unroller
 Enables expression of values of induction variables in later iterations
diff --git a/gcc/opts.c b/gcc/opts.c
index 468a802..c92f94e 100644
--- a/gcc/opts.c
+++ b/gcc/opts.c
@@ -506,6 +506,7 @@ static const struct default_options default_options_table[] =
     { OPT_LEVELS_2_PLUS, OPT_fisolate_erroneous_paths_dereference, NULL, 1 },
     { OPT_LEVELS_2_PLUS, OPT_fipa_ra, NULL, 1 },
     { OPT_LEVELS_2_PLUS, OPT_flra_remat, NULL, 1 },
+    { OPT_LEVELS_2_PLUS, OPT_ftree_path_split, NULL, 1 },
 
     /* -O3 optimizations.  */
     { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 },
diff --git a/gcc/passes.def b/gcc/passes.def
index 6b66f8f..20ddf3d 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -82,6 +82,7 @@ along with GCC; see the file COPYING3.  If not see
 	  NEXT_PASS (pass_ccp);
 	  /* After CCP we rewrite no longer addressed locals into SSA
 	     form if possible.  */
+          NEXT_PASS (pass_path_split);
 	  NEXT_PASS (pass_forwprop);
 	  NEXT_PASS (pass_sra_early);
 	  /* pass_build_ealias is a dummy pass that ensures that we
diff --git a/gcc/testsuite/gcc.dg/path-split-1.c b/gcc/testsuite/gcc.dg/path-split-1.c
new file mode 100644
index 0000000..075dc87
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/path-split-1.c
@@ -0,0 +1,65 @@
+/* { dg-do run } */
+/* { dg-options "-O2 " } */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define RGBMAX 255
+
+int
+test()
+{
+  int i, Pels;
+  unsigned char sum = 0;
+  unsigned char xr, xg, xb;
+  unsigned char xc, xm, xy, xk;
+  unsigned char *ReadPtr, *EritePtr;
+
+  ReadPtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
+  EritePtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
+
+  for (i = 0; i < 100;i++)
+     {
+       ReadPtr[i] = 100 - i;
+     }
+
+  for (i = 0; i < 100; i++)
+     {
+       xr = *ReadPtr++;
+       xg = *ReadPtr++;
+       xb = *ReadPtr++;
+
+       xc = (unsigned char) (RGBMAX - xr);
+       xm = (unsigned char) (RGBMAX - xg);
+       xy = (unsigned char) (RGBMAX - xb);
+
+       if (xc < xm)
+         {
+           xk = (unsigned char) (xc < xy ? xc : xy);
+         }
+       else
+        {
+          xk = (unsigned char) (xm < xy ? xm : xy);
+        }
+
+       xc = (unsigned char) (xc - xk);
+       xm = (unsigned char) (xm - xk);
+       xy = (unsigned char) (xy - xk);
+
+       *EritePtr++ = xc;
+       *EritePtr++ = xm;
+       *EritePtr++ = xy;
+       *EritePtr++ = xk;
+       sum += *EritePtr;
+    }
+  return sum;
+}
+
+int
+main()
+{
+  if (test() != 33)
+    abort();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c b/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
new file mode 100644
index 0000000..172570f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-path_split" } */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define RGBMAX 255
+
+int
+test()
+{
+  int i, Pels;
+  unsigned char sum = 0;
+  unsigned char xr, xg, xb;
+  unsigned char xc, xm, xy, xk;
+  unsigned char *ReadPtr, *EritePtr;
+
+  ReadPtr = (unsigned char *) malloc (sizeof (unsigned char) * 100);
+  EritePtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
+
+  for (i = 0; i < 100;i++)
+     {
+       ReadPtr[i] = 100 - i;
+     }
+
+  for (i = 0; i < 100; i++)
+     {
+       xr = *ReadPtr++;
+       xg = *ReadPtr++;
+       xb = *ReadPtr++;
+
+       xc = ( unsigned char) (RGBMAX - xr);
+       xm = ( unsigned char) (RGBMAX - xg);
+       xy = ( unsigned char) (RGBMAX - xb);
+
+       if (xc < xm)
+         {
+           xk = ( unsigned char) (xc < xy ? xc : xy);
+         }
+       else
+         {
+           xk = ( unsigned char) (xm < xy ? xm : xy);
+         }
+
+       xc = (unsigned char) (xc - xk);
+       xm = (unsigned char) (xm - xk);
+       xy = (unsigned char) (xy - xk);
+
+       *EritePtr++ = xc;
+       *EritePtr++ = xm;
+       *EritePtr++ = xy;
+       *EritePtr++ = xk;
+       sum += *EritePtr;
+    }
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump "xc_\[0-9\]\[0-9\]* -> { xc_\[0-9\]\[0-9\]* }" "path_split"} } */
+/* { dg-final { scan-tree-dump "xm_\[0-9\]\[0-9\]* -> { xm_\[0-9\]\[0-9\]* }" "path_split"} } */
+/* { dg-final { scan-tree-dump "xy_\[0-9\]\[0-9\]* -> { xy_\[0-9\]\[0-9\]* }" "path_split"} } */
diff --git a/gcc/timevar.def b/gcc/timevar.def
index ac41075..f2c145e 100644
--- a/gcc/timevar.def
+++ b/gcc/timevar.def
@@ -298,3 +298,4 @@ DEFTIMEVAR (TV_LINK		     , "link JIT code")
 DEFTIMEVAR (TV_LOAD		     , "load JIT result")
 DEFTIMEVAR (TV_JIT_ACQUIRING_MUTEX   , "acquiring JIT mutex")
 DEFTIMEVAR (TV_JIT_CLIENT_CODE   , "JIT client code")
+DEFTIMEVAR (TV_TREE_PATH_SPLIT  , "tree path_split")
diff --git a/gcc/tracer.c b/gcc/tracer.c
index cad7ab1..206692f 100644
--- a/gcc/tracer.c
+++ b/gcc/tracer.c
@@ -58,11 +58,13 @@
 #include "fibonacci_heap.h"
 
 static int count_insns (basic_block);
-static bool ignore_bb_p (const_basic_block);
+bool ignore_bb_p (const_basic_block);
 static bool better_p (const_edge, const_edge);
 static edge find_best_successor (basic_block);
 static edge find_best_predecessor (basic_block);
 static int find_trace (basic_block, basic_block *);
+basic_block transform_duplicate(basic_block bb,
+                                basic_block  bb2);
 
 /* Minimal outgoing edge probability considered for superblock formation.  */
 static int probability_cutoff;
@@ -90,7 +92,7 @@ bb_seen_p (basic_block bb)
 }
 
 /* Return true if we should ignore the basic block for purposes of tracing.  */
-static bool
+bool
 ignore_bb_p (const_basic_block bb)
 {
   gimple g;
@@ -224,6 +226,24 @@ find_trace (basic_block bb, basic_block *trace)
   return i;
 }
 
+/* Transform the block that needs to be duplicated.  */
+
+basic_block
+transform_duplicate(basic_block bb,
+                    basic_block  bb2)
+{
+  edge e;
+  basic_block copy;
+
+  e = find_edge (bb, bb2);
+
+  copy = duplicate_block (bb2, e, bb);
+  flush_pending_stmts (e);
+
+  add_phi_args_after_copy (&copy, 1, NULL);
+
+  return (copy);
+}
 /* Look for basic blocks in frequency order, construct traces and tail duplicate
    if profitable.  */
 
@@ -319,17 +339,8 @@ tail_duplicate (void)
 		 entries or at least rotate the loop.  */
 	      && bb2->loop_father->header != bb2)
 	    {
-	      edge e;
-	      basic_block copy;
-
-	      nduplicated += counts [bb2->index];
-
-	      e = find_edge (bb, bb2);
-
-	      copy = duplicate_block (bb2, e, bb);
-	      flush_pending_stmts (e);
-
-	      add_phi_args_after_copy (&copy, 1, NULL);
+              nduplicated += counts [bb2->index];
+              basic_block copy = transform_duplicate (bb,bb2);
 
 	      /* Reconsider the original copy of block we've duplicated.
 	         Removing the most common predecessor may make it to be
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 7b66a1c..6af7f0d 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -383,6 +383,7 @@ extern gimple_opt_pass *make_pass_tree_loop_done (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_ch (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_ch_vect (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_ccp (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_path_split (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_phi_only_cprop (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_build_ssa (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_build_alias (gcc::context *ctxt);
diff --git a/gcc/tree-ssa-path-split.c b/gcc/tree-ssa-path-split.c
new file mode 100644
index 0000000..5ce4b86
--- /dev/null
+++ b/gcc/tree-ssa-path-split.c
@@ -0,0 +1,330 @@
+/* Support routines for Path Splitting.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   Contributed by Ajit Kumar Agarwal <ajitkum@xilinx.com>.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "cfghooks.h"
+#include "tree.h"
+#include "gimple.h"
+#include "rtl.h"
+#include "ssa.h"
+#include "flags.h"
+#include "alias.h"
+#include "fold-const.h"
+#include "stor-layout.h"
+#include "calls.h"
+#include "cfganal.h"
+#include "internal-fn.h"
+#include "gimple-fold.h"
+#include "tree-eh.h"
+#include "gimple-iterator.h"
+#include "gimple-walk.h"
+#include "tree-cfg.h"
+#include "tree-ssa-loop-manip.h"
+#include "tree-ssa-loop-niter.h"
+#include "tree-ssa-loop.h"
+#include "tree-into-ssa.h"
+#include "tree-ssa.h"
+#include "tree-pass.h"
+#include "tree-dump.h"
+#include "gimple-pretty-print.h"
+#include "diagnostic-core.h"
+#include "intl.h"
+#include "cfgloop.h"
+#include "tree-scalar-evolution.h"
+#include "tree-ssa-propagate.h"
+#include "tree-chrec.h"
+#include "insn-config.h"
+#include "expmed.h"
+#include "dojump.h"
+#include "explow.h"
+#include "emit-rtl.h"
+#include "varasm.h"
+#include "stmt.h"
+#include "expr.h"
+#include "insn-codes.h"
+#include "optabs.h"
+#include "fibonacci_heap.h"
+
+extern basic_block transform_duplicate(basic_block bb, basic_block  bb2);
+extern bool ignore_bb_p (const_basic_block bb);
+
+/* This function gets the join blocks same as the source
+   node of the loop latch nodes and form the trace with
+   the join block and its predecessor.  */
+
+static int
+find_trace_loop_latch_same_as_join_blk (basic_block bb,
+                                        basic_block *trace)
+{
+  vec<basic_block> bbs;
+  basic_block bb1;
+  unsigned int i;
+  edge_iterator ei;
+  edge e1;
+  bool found = false;
+  int n = 0;
+
+  bbs = get_all_dominated_blocks (CDI_DOMINATORS,
+                                  bb );
+  FOR_EACH_VEC_ELT (bbs, i, bb1)
+  {
+    FOR_EACH_EDGE (e1, ei, bb->succs)
+    {
+      if ( bb1 == e1->dest)
+        {
+          found = true;
+        }
+    }
+
+    if (!found && bb1 != bb)
+      {
+        found = false;
+        FOR_EACH_EDGE (e1, ei, bb1->succs)
+        {
+          if (e1->flags & (EDGE_DFS_BACK))
+            {
+              trace[1] = e1->src;
+              n++;
+              found = true;
+            }  
+        }
+
+        if (found && EDGE_COUNT(bb1->preds) == 2)
+          {
+            FOR_EACH_EDGE (e1, ei, bb1->preds)
+            {
+              if (single_succ_p(e1->src) &&
+                  single_succ_edge (e1->src)->flags & EDGE_FALLTHRU)
+                {
+                   trace[0] = e1->src;
+                   n++;
+                   break;
+                }
+            }
+            return n;
+          }
+       }
+   }
+   return n;
+}
+
+/* This function performs the feasibility tests for path splitting.
+   Return true if path splitting is feasible for the trace and
+   false otherwise.  The following feasibility tests are
+   performed.
+
+   1. Return false if the join block has rhs casting for assign
+      gimple statements.
+   2. If the number of phis is greater than 1 or the phi node in
+      the join block has virtual operand return false.
+   3. Return false unless the number of non-debug statements in
+      the join block is greater than 1.
+   4. If the phi result in the phi node of the join block is
+      used outside the join block return false.
+   5. Otherwise returns true.  */
+
+static bool
+is_feasible_trace (basic_block *trace)
+{
+  int num_stmt = 0, num_phis = 0;
+  gimple_stmt_iterator psi, gsi;
+
+  for (gsi = gsi_start_bb (trace[1]); !gsi_end_p (gsi); gsi_next (&gsi))
+     {
+       gimple stmt = gsi_stmt (gsi);
+       if (gimple_assign_cast_p (stmt))
+         return false;
+
+       if (!is_gimple_debug (stmt))
+         num_stmt++;
+    }
+
+   if ((num_stmt > 1))
+     {
+       bool found_virtual_result = false;
+
+       for (psi = gsi_start_phis (trace[1]); !gsi_end_p (psi); )
+          {
+            use_operand_p use_p;
+            imm_use_iterator iter;
+            gimple stmt = gsi_stmt(psi);
+
+            if (!virtual_operand_p (gimple_phi_result (stmt)))
+              num_phis++;
+            else
+              found_virtual_result = true;
+
+             FOR_EACH_IMM_USE_FAST (use_p, iter, gimple_phi_result (stmt))
+             {
+               gimple use_stmt = USE_STMT (use_p);
+
+               if (gimple_bb (use_stmt) != trace[1])
+                 return false;
+             }
+             gsi_next(&psi);
+          }
+
+        if ((num_phis >1) || found_virtual_result)
+          return false;
+
+       return true;
+    }
+
+  return false;
+}
+
+/* This is the core function to perform path splitting. The join
+   same as the source of the loop latch node is identified along
+   with their predecessors. Based on the feasibility tests for
+   path splitting the path splitting is performed by adding the
+   join blocks into the predecessors after propagating the phi
+   result with the corresponding phi arguments for the predecessors.
+   The  tree-cfg-cleanup will merge the blocks in the predecessors
+   path and the update-ssa will update the ssa representation after
+   the path splitting is performed.  */
+
+static bool
+perform_path_splitting ()
+{
+  bool changed = false;
+  basic_block trace[2] = {NULL,NULL};
+  basic_block bb;
+  auto_vec<fibonacci_node<long, basic_block_def>*> blocks;
+  blocks.safe_grow_cleared (last_basic_block_for_fn (cfun));
+  fibonacci_heap<long, basic_block_def> heap (LONG_MIN);
+ 
+  initialize_original_copy_tables();
+  calculate_dominance_info (CDI_DOMINATORS);
+
+  FOR_EACH_BB_FN (bb, cfun)
+  {
+    if (!ignore_bb_p (bb))
+      blocks[bb->index] = heap.insert (-bb->frequency, bb);
+  }
+ 
+  while (!heap.empty())
+  {
+    basic_block bb = heap.extract_min ();
+
+    if (!bb)
+      break;
+
+    blocks[bb->index] = NULL;
+    if (ignore_bb_p (bb))
+        continue;
+
+    gimple last = gsi_stmt (gsi_last_bb (bb));
+
+    if (last && gimple_code (last) != GIMPLE_COND)
+      continue;
+
+    int n = find_trace_loop_latch_same_as_join_blk (bb, trace);
+
+    if (n >= 2 && is_feasible_trace (trace))
+      {
+        if (blocks[trace[0]->index])
+          {
+            heap.delete_node (blocks[trace[0]->index]);
+            blocks[trace[0]->index] = NULL;
+          }
+        if (blocks[trace[1]->index])
+          {
+            heap.delete_node (blocks[trace[1]->index]);
+            blocks[trace[0]->index] = NULL;
+          }
+
+        transform_duplicate (trace[0],trace[1]);
+        printf("path splitting successful \n");
+        changed = true;
+     }
+  }
+ 
+  free_original_copy_tables ();
+  return changed;
+}
+
+static unsigned int
+execute_path_split (void)
+{
+  bool changed;
+
+  if (n_basic_blocks_for_fn (cfun) <= NUM_FIXED_BLOCKS + 1)
+      return 0;
+
+  mark_dfs_back_edges ();
+
+  changed = perform_path_splitting();
+
+  if (changed)
+    {
+       free_dominance_info (CDI_DOMINATORS);
+       /* If we changed the CFG schedule loops for fixup by cleanup_cfg.  */
+       if (current_loops)
+         loops_state_set (LOOPS_NEED_FIXUP);
+    }
+ 
+  return changed ? TODO_cleanup_cfg : 0;
+
+}
+
+static bool
+gate_path_split(void)
+{
+  return flag_tree_path_split != 0;
+}
+
+namespace {
+
+const pass_data pass_data_path_split =
+{
+  GIMPLE_PASS, /* type */
+  "path_split", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_TREE_PATH_SPLIT, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_update_ssa, /* todo_flags_finish */
+};
+
+class pass_path_split : public gimple_opt_pass
+{
+   public:
+    pass_path_split (gcc::context *ctxt)
+      : gimple_opt_pass (pass_data_path_split, ctxt)
+    {}
+   /* opt_pass methods: */
+   opt_pass * clone () { return new pass_path_split (m_ctxt); }
+   virtual bool gate (function *) { return gate_path_split (); }
+   virtual unsigned int execute (function *) { return execute_path_split (); }
+ 
+}; // class pass_path_split
+
+} // anon namespace
+
+gimple_opt_pass *
+make_pass_path_split (gcc::context *ctxt)
+{
+  return new pass_path_split (ctxt);
+}
-- 
1.8.2.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-08-15 23:13             ` Ajit Kumar Agarwal
@ 2015-08-19 19:47               ` Jeff Law
  2015-08-20 15:40                 ` Ajit Kumar Agarwal
  2015-08-19 21:53               ` Jeff Law
  1 sibling, 1 reply; 72+ messages in thread
From: Jeff Law @ 2015-08-19 19:47 UTC (permalink / raw)
  To: Ajit Kumar Agarwal, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

On 08/15/2015 11:01 AM, Ajit Kumar Agarwal wrote:
> All:
>
> Please find the updated patch with suggestion and feedback
> incorporated.
>
> Thanks Jeff and Richard for the review comments.
>
> Following changes were done based on the feedback on RFC comments.
> and the review for the previous patch.
>
> 1. Both tracer and path splitting pass are separate passes so  that
> two instances of the pass will run in the end, one doing path
> splitting and one doing  tracing, at different times in the
> optimization pipeline.
I'll have to think about this.  I'm not sure I agree totally with
Richi's assertion that we should share code with the tracer pass, but
I'll give it a good looksie.



> 2. Transform code is shared for tracer and path splitting pass. The
> common code in extracted in a given function transform_duplicate And
> place the function in tracer.c and the path splitting pass uses the
> transform code.
OK.  I'll take a good look at that.


> 3. Analysis for the basic block population and traversing the basic
> block using the Fibonacci heap is commonly used. This cannot be
> Factored out into new function as the tracer pass does more analysis
> based on the profile and the different heuristics is used in tracer
> And path splitting pass.
Understood.


> 4. The include headers is minimal and presence of what is required
> for the path splitting pass.
Thanks.


> 5. The earlier patch does the SSA updating  with replace function to
> preserve the SSA representation required to move the loop latch node
> same as join Block to its predecessors and the loop latch node is
> just forward block. Such replace function are not required as
> suggested by the Jeff. Such replace Function goes away with this
> patch and the transformed code is factored into a given function
> which is shared between tracer and path splitting pass.
Sounds good.

>
> Bootstrapping with i386 and Microblaze target works fine. No
> regression is seen in Deja GNU tests for Microblaze. There are
> lesser failures. Mibench/EEMBC benchmarks were run for Microblaze
> target and the gain of 9.3% is seen in rgbcmy_lite the EEMBC
> benchmarks.
What do you mean by there are "lesser failures"?  Are you saying there 
are cases where path splitting generates incorrect code, or cases where 
path splitting produces code that is less efficient, or something else?

>
> SPEC 2000 benchmarks were run with i386 target and the following
> performance number is achieved.
>
> INT benchmarks with path splitting(ratio) Vs INT benchmarks without
> path splitting(ratio) = 3661.225091 vs 3621.520572
That's an impressive improvement.

Anyway, I'll start taking a close look at this momentarily.

Jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-08-15 23:13             ` Ajit Kumar Agarwal
  2015-08-19 19:47               ` Jeff Law
@ 2015-08-19 21:53               ` Jeff Law
  2015-08-20 15:41                 ` Ajit Kumar Agarwal
  2015-09-04 18:07                 ` Ajit Kumar Agarwal
  1 sibling, 2 replies; 72+ messages in thread
From: Jeff Law @ 2015-08-19 21:53 UTC (permalink / raw)
  To: Ajit Kumar Agarwal, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

On 08/15/2015 11:01 AM, Ajit Kumar Agarwal wrote:
>
>
>  From cf2b64cc1d6623424d770f2a9ea257eb7e58e887 Mon Sep 17 00:00:00 2001
> From: Ajit Kumar Agarwal<ajitkum@xilix.com>
> Date: Sat, 15 Aug 2015 18:19:14 +0200
> Subject: [PATCH] [Patch,tree-optimization]: Add new path Splitting pass on
>   tree ssa representation.
>
> Added a new pass on path splitting on tree SSA representation. The path
> splitting optimization does the CFG transformation of join block of the
> if-then-else same as the loop latch node is moved and merged with the
> predecessor blocks after preserving the SSA representation.
>
> ChangeLog:
> 2015-08-15  Ajit Agarwal<ajitkum@xilinx.com>
>
> 	* gcc/Makefile.in: Add the build of the new file
> 	tree-ssa-path-split.c
Instead:

	* Makefile.in (OBJS): Add tree-ssa-path-split.o.


> 	* gcc/opts.c (OPT_ftree_path_split) : Add an entry for
> 	Path splitting pass with optimization flag greater and
> 	equal to O2.

	* opts.c (default_options_table): Add entry for path splitting
	optimization at -O2 and above.



> 	* gcc/passes.def (path_split): add new path splitting pass.
Capitalize "add".




> 	* gcc/tree-ssa-path-split.c: New.
Use "New file".

> 	* gcc/tracer.c (transform_duplicate): New.
Use "New function".

> 	* gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c: New.
> 	* gcc/testsuite/gcc.dg/path-split-1.c: New.
These belong in gcc/testsuite/ChangeLog and remove the "gcc/testsuite" 
prefix.

> 	* gcc/doc/invoke.texi
> 	(ftree-path-split): Document.
> 	(fdump-tree-path_split): Document.
Should just be two lines instead of three.

And more generally, there's no need to prefix ChangeLog entries with "gcc/".

Now that the ChangeLog nits are out of the way, let's get to stuff 
that's more interesting.



>
> Signed-off-by:Ajit Agarwalajitkum@xilinx.com
> ---
>   gcc/Makefile.in                              |   1 +
>   gcc/common.opt                               |   4 +
>   gcc/doc/invoke.texi                          |  16 +-
>   gcc/opts.c                                   |   1 +
>   gcc/passes.def                               |   1 +
>   gcc/testsuite/gcc.dg/path-split-1.c          |  65 ++++++
>   gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c |  60 +++++
>   gcc/timevar.def                              |   1 +
>   gcc/tracer.c                                 |  37 +--
>   gcc/tree-pass.h                              |   1 +
>   gcc/tree-ssa-path-split.c                    | 330 +++++++++++++++++++++++++++
>   11 files changed, 503 insertions(+), 14 deletions(-)
>   create mode 100644 gcc/testsuite/gcc.dg/path-split-1.c
>   create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
>   create mode 100644 gcc/tree-ssa-path-split.c
>
> diff --git a/gcc/common.opt b/gcc/common.opt
> index e80eadf..1d02582 100644
> --- a/gcc/common.opt
> +++ b/gcc/common.opt
> @@ -2378,6 +2378,10 @@ ftree-vrp
>   Common Report Var(flag_tree_vrp) Init(0) Optimization
>   Perform Value Range Propagation on trees
>
> +ftree-path-split
> +Common Report Var(flag_tree_path_split) Init(0) Optimization
> +Perform Path Splitting
Maybe "Perform Path Splitting for loop backedges" or something which is 
a little more descriptive.  The above isn't exactly right, so don't use 
it as-is.



> @@ -9068,6 +9075,13 @@ enabled by default at @option{-O2} and higher.  Null pointer check
>   elimination is only done if @option{-fdelete-null-pointer-checks} is
>   enabled.
>
> +@item -ftree-path-split
> +@opindex ftree-path-split
> +Perform Path Splitting  on trees.  The join blocks of IF-THEN-ELSE same
> +as loop latch node is moved to its predecessor and the loop latch node
> +will be forwarding block.  This is enabled by default at @option{-O2}
> +and higher.
Needs some work.  Maybe something along the lines of

When two paths of execution merge immediately before a loop latch node, 
try to duplicate the merge node into the two paths.

> diff --git a/gcc/passes.def b/gcc/passes.def
> index 6b66f8f..20ddf3d 100644
> --- a/gcc/passes.def
> +++ b/gcc/passes.def
> @@ -82,6 +82,7 @@ along with GCC; see the file COPYING3.  If not see
>   	  NEXT_PASS (pass_ccp);
>   	  /* After CCP we rewrite no longer addressed locals into SSA
>   	     form if possible.  */
> +          NEXT_PASS (pass_path_split);
>   	  NEXT_PASS (pass_forwprop);
>   	  NEXT_PASS (pass_sra_early);
I can't recall if we've discussed the location of the pass at all.  I'm 
not objecting to this location, but would like to hear why you chose 
this particular location in the optimization pipeline.

>   	  /* pass_build_ealias is a dummy pass that ensures that we
> diff --git a/gcc/testsuite/gcc.dg/path-split-1.c b/gcc/testsuite/gcc.dg/path-split-1.c
ISTM the two tests should be combined into a single test.  I didn't see 
a functional difference in the test() function between those two tests.

I believe you can still create/scan debugging dumps with dg-do run test.


> +DEFTIMEVAR (TV_TREE_PATH_SPLIT  , "tree path_split")
tree path split rather than using underscores

> diff --git a/gcc/tracer.c b/gcc/tracer.c
> index cad7ab1..206692f 100644
> --- a/gcc/tracer.c
> +++ b/gcc/tracer.c
> @@ -58,11 +58,13 @@
>   #include "fibonacci_heap.h"
>
>   static int count_insns (basic_block);
> -static bool ignore_bb_p (const_basic_block);
> +bool ignore_bb_p (const_basic_block);
>   static bool better_p (const_edge, const_edge);
>   static edge find_best_successor (basic_block);
>   static edge find_best_predecessor (basic_block);
>   static int find_trace (basic_block, basic_block *);
> +basic_block transform_duplicate(basic_block bb,
> +                                basic_block  bb2);
Please create a tracer.h and put the newly exported prototypes in 
tracer.h.  Then include tracer.h in tracer.c and tree-ssa-path-split.c.

> @@ -224,6 +226,24 @@ find_trace (basic_block bb, basic_block *trace)
>     return i;
>   }
>
> +/* Transform the block that needs to be duplicated.  */
> +
> +basic_block
> +transform_duplicate(basic_block bb,
> +                    basic_block  bb2)
Space between the name of the function and first paren.  It looks like 
these two lines should be joined.  Single space between the type and the 
name of the argument.

Ultimately there's not a lot of shared code between the tracer and path 
splitting, which is basically what I expected.  Nevertheless, sharing a 
single implementation of those routines is probably wise.


> +
> +#include "config.h"
> +#include "system.h"
> +#include "coretypes.h"
> +#include "backend.h"
> +#include "cfghooks.h"
> +#include "tree.h"
> +#include "gimple.h"
> +#include "rtl.h"
> +#include "ssa.h"
> +#include "flags.h"
> +#include "alias.h"
> +#include "fold-const.h"
> +#include "stor-layout.h"
> +#include "calls.h"
> +#include "cfganal.h"
> +#include "internal-fn.h"
> +#include "gimple-fold.h"
> +#include "tree-eh.h"
> +#include "gimple-iterator.h"
> +#include "gimple-walk.h"
> +#include "tree-cfg.h"
> +#include "tree-ssa-loop-manip.h"
> +#include "tree-ssa-loop-niter.h"
> +#include "tree-ssa-loop.h"
> +#include "tree-into-ssa.h"
> +#include "tree-ssa.h"
> +#include "tree-pass.h"
> +#include "tree-dump.h"
> +#include "gimple-pretty-print.h"
> +#include "diagnostic-core.h"
> +#include "intl.h"
> +#include "cfgloop.h"
> +#include "tree-scalar-evolution.h"
> +#include "tree-ssa-propagate.h"
> +#include "tree-chrec.h"
> +#include "insn-config.h"
> +#include "expmed.h"
> +#include "dojump.h"
> +#include "explow.h"
> +#include "emit-rtl.h"
> +#include "varasm.h"
> +#include "stmt.h"
> +#include "expr.h"
> +#include "insn-codes.h"
> +#include "optabs.h"
> +#include "fibonacci_heap.h"
I'm having a hard time seeing how all these are needed.  Especially the 
RTL related includes.  These really need to be trimmed.


> +
> +extern basic_block transform_duplicate(basic_block bb, basic_block  bb2);
> +extern bool ignore_bb_p (const_basic_block bb);
With the prototypes moved to tracer.h, you won't want the extern 
declarations in here.

> +
> +/* This function gets the join blocks same as the source
> +   node of the loop latch nodes and form the trace with
> +   the join block and its predecessor.  */
I'm having a hard time parsing this comment. Please try to rewrite it to 
be clearer.

I suspect this routine is more complicated than it needs to be because 
of how you're searching for our subgraphs.


> +
> +static int
> +find_trace_loop_latch_same_as_join_blk (basic_block bb,
> +                                        basic_block *trace)
> +{
> +  vec<basic_block> bbs;
> +  basic_block bb1;
> +  unsigned int i;
> +  edge_iterator ei;
> +  edge e1;
> +  bool found = false;
> +  int n = 0;
> +
> +  bbs = get_all_dominated_blocks (CDI_DOMINATORS,
> +                                  bb );
> +  FOR_EACH_VEC_ELT (bbs, i, bb1)
> +  {
> +    FOR_EACH_EDGE (e1, ei, bb->succs)
> +    {
> +      if ( bb1 == e1->dest)
> +        {
> +          found = true;
> +        }
> +    }
> +
> +    if (!found && bb1 != bb)
> +      {
> +        found = false;
> +        FOR_EACH_EDGE (e1, ei, bb1->succs)
> +        {
> +          if (e1->flags & (EDGE_DFS_BACK))
> +            {
> +              trace[1] = e1->src;
> +              n++;
> +              found = true;
> +            }
> +        }
It seems to me all this can be changed to look for the backedges via the 
loop tree.



> +
> +        if (found && EDGE_COUNT(bb1->preds) == 2)
Space between EDGE_COUNT and the open paren.

You'd want to keep this test, then do some kind of dominance test like 
we talked about earlier in the discussion of this patch.

Jeff
> +          {
> +            FOR_EACH_EDGE (e1, ei, bb1->preds)
> +            {
> +              if (single_succ_p(e1->src) &&
> +                  single_succ_edge (e1->src)->flags & EDGE_FALLTHRU)
And you'd keep these tests in some form.

> +
> +/* This function performs the feasibility tests for path splitting
> +   to perform. Return false if the feasibility for path splitting
> +   is not done and returns true if the feasbility for path splitting
> +   is done. Following feasibility tests are performed.
> +
> +   1. Return false if the join block has rhs casting for assign
> +      gimple statements.
> +   2. If the number of phis is greater than 1 or the phi node in
> +      the join block has virtual operand return false.
> +   3. Return true if the number of sequential statements is
> +      greater than 2.
> +   4. If the phi result in the phi node of the join block is not
> +      used inside the same join block return false.
> +   7. Otherwise returns true.  */
These seem totally arbitrary.  What's the reason behind each of these 
restrictions?  None should be a correctness requirement AFAICT.  Others 
(like the number of statements) should probably be conditionalized on 
size vs speed optimizations.



  The join
> +   same as the source of the loop latch node is identified along
> +   with their predecessors.
I couldn't parse this sentence.



Based on the feasibility tests for
> +   path splitting the path splitting is performed by adding the
> +   join blocks into the predecessors after propagating the phi
> +   result with the corresponding phi arguments for the predecessors.
> +   The  tree-cfg-cleanup will merge the blocks in the predecessors
> +   path and the update-ssa will update the ssa representation after
> +   the path splitting is performed.  */
It would probably help to show a visual representation of what's 
happening.  ie, show a little CFG before and after.



> +
> +static bool
> +perform_path_splitting ()
> +{
> +  bool changed = false;
> +  basic_block trace[2] = {NULL,NULL};
> +  basic_block bb;
> +  auto_vec<fibonacci_node<long, basic_block_def>*> blocks;
> +  blocks.safe_grow_cleared (last_basic_block_for_fn (cfun));
> +  fibonacci_heap<long, basic_block_def> heap (LONG_MIN);
> +
> +  initialize_original_copy_tables();
> +  calculate_dominance_info (CDI_DOMINATORS);

> +
> +  FOR_EACH_BB_FN (bb, cfun)
> +  {
> +    if (!ignore_bb_p (bb))
> +      blocks[bb->index] = heap.insert (-bb->frequency, bb);
> +  }
> +
> +  while (!heap.empty())
Wouldn't it be easier to walk the loop tree?  We've got a fairly 
specific subgraph that we're looking for.  Why not walk the loop tree, 
using the latch as a point to search backwards for the shape of the 
subgraph we're interested in transforming?

That would seem to be a lot less expensive than looking at every block 
looking for the start of the subgraphs we're interested in.  That'd also 
seem to eliminate all the fibheap stuff.

> +
> +static unsigned int
> +execute_path_split (void)
> +{
> +  bool changed;
> +
> +  if (n_basic_blocks_for_fn (cfun) <= NUM_FIXED_BLOCKS + 1)
> +      return 0;
> +
> +  mark_dfs_back_edges ();
ISTM you could check the return value from mark_dfs_back_edges and do 
nothing if no back edges were found.

> +
> +static bool
> +gate_path_split(void)
> +{
> +  return flag_tree_path_split != 0;
> +}
> +
> +namespace {
> +
> +const pass_data pass_data_path_split =
> +{
> +  GIMPLE_PASS, /* type */
> +  "path_split", /* name */
> +  OPTGROUP_NONE, /* optinfo_flags */
> +  TV_TREE_PATH_SPLIT, /* tv_id */
> +  0, /* properties_required */
> +  0, /* properties_provided */
> +  0, /* properties_destroyed */
> +  0, /* todo_flags_start */
> +  TODO_update_ssa, /* todo_flags_finish */
It seems to me that you're probably missing some properties_required. 
You depend on a proper CFG and SSA graph, so at the least PROP_cfg and 
PROP_ssa.

Presumably you don't set TODO_cleanup_cfg so that you can avoid the cfg 
cleanup if nothing gets changed.  Is there some reason you don't do the 
same for the SSA graph update?

Jeff



^ permalink raw reply	[flat|nested] 72+ messages in thread

* RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-08-19 19:47               ` Jeff Law
@ 2015-08-20 15:40                 ` Ajit Kumar Agarwal
  2015-08-20 15:49                   ` Jeff Law
  0 siblings, 1 reply; 72+ messages in thread
From: Ajit Kumar Agarwal @ 2015-08-20 15:40 UTC (permalink / raw)
  To: Jeff Law, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala



-----Original Message-----
From: Jeff Law [mailto:law@redhat.com] 
Sent: Thursday, August 20, 2015 1:13 AM
To: Ajit Kumar Agarwal; Richard Biener
Cc: GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation

On 08/15/2015 11:01 AM, Ajit Kumar Agarwal wrote:
> All:
>
> Please find the updated patch with suggestion and feedback 
> incorporated.
>
> Thanks Jeff and Richard for the review comments.
>
> Following changes were done based on the feedback on RFC comments.
> and the review for the previous patch.
>
> 1. Both tracer and path splitting pass are separate passes so  that 
> two instances of the pass will run in the end, one doing path 
> splitting and one doing  tracing, at different times in the 
> optimization pipeline.
>>I'll have to think about this.  I'm not sure I agree totally with Richi's assertion that we should share code with the tracer pass, but I'll give it a good looksie.



> 2. Transform code is shared for tracer and path splitting pass. The 
> common code in extracted in a given function transform_duplicate And 
> place the function in tracer.c and the path splitting pass uses the 
> transform code.
>>OK.  I'll take a good look at that.


> 3. Analysis for the basic block population and traversing the basic 
> block using the Fibonacci heap is commonly used. This cannot be 
> Factored out into new function as the tracer pass does more analysis 
> based on the profile and the different heuristics is used in tracer 
> And path splitting pass.
>>Understood.


> 4. The include headers is minimal and presence of what is required for 
> the path splitting pass.
>>Thanks.


> 5. The earlier patch does the SSA updating  with replace function to 
> preserve the SSA representation required to move the loop latch node 
> same as join Block to its predecessors and the loop latch node is just 
> forward block. Such replace function are not required as suggested by 
> the Jeff. Such replace Function goes away with this patch and the 
> transformed code is factored into a given function which is shared 
> between tracer and path splitting pass.
>>Sounds good.

>
> Bootstrapping with i386 and Microblaze target works fine. No 
> regression is seen in Deja GNU tests for Microblaze. There are lesser 
> failures. Mibench/EEMBC benchmarks were run for Microblaze target and 
> the gain of 9.3% is seen in rgbcmy_lite the EEMBC benchmarks.
>>What do you mean by there are "lesser failures"?  Are you saying there are cases where path splitting generates incorrect code, or cases where path splitting produces code that is less efficient, or something else?

I meant that more Deja GNU testcases pass with the path splitting changes.

>
> SPEC 2000 benchmarks were run with i386 target and the following
> performance number is achieved.
>
> INT benchmarks with path splitting(ratio) Vs INT benchmarks without
> path splitting(ratio) = 3661.225091 vs 3621.520572
>>That's an impressive improvement.

>>Anyway, I'll start taking a close look at this momentarily.

Thanks & Regards
Ajit

Jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-08-19 21:53               ` Jeff Law
@ 2015-08-20 15:41                 ` Ajit Kumar Agarwal
  2015-09-04 18:07                 ` Ajit Kumar Agarwal
  1 sibling, 0 replies; 72+ messages in thread
From: Ajit Kumar Agarwal @ 2015-08-20 15:41 UTC (permalink / raw)
  To: Jeff Law, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala



-----Original Message-----
From: Jeff Law [mailto:law@redhat.com] 
Sent: Thursday, August 20, 2015 3:16 AM
To: Ajit Kumar Agarwal; Richard Biener
Cc: GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation

On 08/15/2015 11:01 AM, Ajit Kumar Agarwal wrote:
>
>
>  From cf2b64cc1d6623424d770f2a9ea257eb7e58e887 Mon Sep 17 00:00:00 
> 2001
> From: Ajit Kumar Agarwal<ajitkum@xilix.com>
> Date: Sat, 15 Aug 2015 18:19:14 +0200
> Subject: [PATCH] [Patch,tree-optimization]: Add new path Splitting pass on
>   tree ssa representation.
>
> Added a new pass on path splitting on tree SSA representation. The 
> path splitting optimization does the CFG transformation of join block 
> of the if-then-else same as the loop latch node is moved and merged 
> with the predecessor blocks after preserving the SSA representation.
>
> ChangeLog:
> 2015-08-15  Ajit Agarwal<ajitkum@xilinx.com>
>
> 	* gcc/Makefile.in: Add the build of the new file
> 	tree-ssa-path-split.c
Instead:

>>	* Makefile.in (OBJS): Add tree-ssa-path-split.o.


> 	* gcc/opts.c (OPT_ftree_path_split) : Add an entry for
> 	Path splitting pass with optimization flag greater and
> 	equal to O2.

>>	* opts.c (default_options_table): Add entry for path splitting
>>	optimization at -O2 and above.



> 	* gcc/passes.def (path_split): add new path splitting pass.
>>Capitalize "add".




> 	* gcc/tree-ssa-path-split.c: New.
>>Use "New file".

> 	* gcc/tracer.c (transform_duplicate): New.
>>Use "New function".

> 	* gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c: New.
> 	* gcc/testsuite/gcc.dg/path-split-1.c: New.
>>These belong in gcc/testsuite/ChangeLog and remove the "gcc/testsuite" 
>>prefix.

> 	* gcc/doc/invoke.texi
> 	(ftree-path-split): Document.
> 	(fdump-tree-path_split): Document.
>>Should just be two lines instead of three.

>>And more generally, there's no need to prefix ChangeLog entries with "gcc/".

>>Now that the ChangeLog nits are out of the way, let's get to stuff that's more interesting.

I will incorporate all the above changes  in the upcoming patches.

>
> Signed-off-by:Ajit Agarwalajitkum@xilinx.com
> ---
>   gcc/Makefile.in                              |   1 +
>   gcc/common.opt                               |   4 +
>   gcc/doc/invoke.texi                          |  16 +-
>   gcc/opts.c                                   |   1 +
>   gcc/passes.def                               |   1 +
>   gcc/testsuite/gcc.dg/path-split-1.c          |  65 ++++++
>   gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c |  60 +++++
>   gcc/timevar.def                              |   1 +
>   gcc/tracer.c                                 |  37 +--
>   gcc/tree-pass.h                              |   1 +
>   gcc/tree-ssa-path-split.c                    | 330 +++++++++++++++++++++++++++
>   11 files changed, 503 insertions(+), 14 deletions(-)
>   create mode 100644 gcc/testsuite/gcc.dg/path-split-1.c
>   create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
>   create mode 100644 gcc/tree-ssa-path-split.c
>
> diff --git a/gcc/common.opt b/gcc/common.opt
> index e80eadf..1d02582 100644
> --- a/gcc/common.opt
> +++ b/gcc/common.opt
> @@ -2378,6 +2378,10 @@ ftree-vrp
>   Common Report Var(flag_tree_vrp) Init(0) Optimization
>   Perform Value Range Propagation on trees
>
> +ftree-path-split
> +Common Report Var(flag_tree_path_split) Init(0) Optimization
> +Perform Path Splitting
>>Maybe "Perform Path Splitting for loop backedges" or something which is 
>>a little more descriptive.  The above isn't exactly right, so don't use 
>>it as-is.



> @@ -9068,6 +9075,13 @@ enabled by default at @option{-O2} and higher.  Null pointer check
>   elimination is only done if @option{-fdelete-null-pointer-checks} is
>   enabled.
>
> +@item -ftree-path-split
> +@opindex ftree-path-split
> +Perform Path Splitting  on trees.  The join blocks of IF-THEN-ELSE same
> +as loop latch node is moved to its predecessor and the loop latch node
> +will be forwarding block.  This is enabled by default at @option{-O2}
> +and higher.
>>Needs some work.  Maybe something along the lines of

>>When two paths of execution merge immediately before a loop latch node, 
>>try to duplicate the merge node into the two paths.

I will incorporate all the above changes.

> diff --git a/gcc/passes.def b/gcc/passes.def
> index 6b66f8f..20ddf3d 100644
> --- a/gcc/passes.def
> +++ b/gcc/passes.def
> @@ -82,6 +82,7 @@ along with GCC; see the file COPYING3.  If not see
>   	  NEXT_PASS (pass_ccp);
>   	  /* After CCP we rewrite no longer addressed locals into SSA
>   	     form if possible.  */
> +          NEXT_PASS (pass_path_split);
>   	  NEXT_PASS (pass_forwprop);
>   	  NEXT_PASS (pass_sra_early);
>>I can't recall if we've discussed the location of the pass at all.  I'm 
>>not objecting to this location, but would like to hear why you chose 
>>this particular location in the optimization pipeline.

I have placed the path splitting pass at the above location as per your suggestion, and the maximum gains were achieved by placing the pass at
that location. The other reason is to place the pass before PRE, CCP, copy propagation, forward propagation and any SSA updating pass.

>   	  /* pass_build_ealias is a dummy pass that ensures that we
> diff --git a/gcc/testsuite/gcc.dg/path-split-1.c b/gcc/testsuite/gcc.dg/path-split-1.c
>>ISTM the two tests should be combined into a single test.  I didn't see 
>>a functional difference in the test() function between those two tests.

>>I believe you can still create/scan debugging dumps with dg-do run test.


> +DEFTIMEVAR (TV_TREE_PATH_SPLIT  , "tree path_split")
>>tree path split rather than using underscores

> diff --git a/gcc/tracer.c b/gcc/tracer.c
> index cad7ab1..206692f 100644
> --- a/gcc/tracer.c
> +++ b/gcc/tracer.c
> @@ -58,11 +58,13 @@
>   #include "fibonacci_heap.h"
>
>   static int count_insns (basic_block);
> -static bool ignore_bb_p (const_basic_block);
> +bool ignore_bb_p (const_basic_block);
>   static bool better_p (const_edge, const_edge);
>   static edge find_best_successor (basic_block);
>   static edge find_best_predecessor (basic_block);
>   static int find_trace (basic_block, basic_block *);
> +basic_block transform_duplicate(basic_block bb,
> +                                basic_block  bb2);
>>Please create a tracer.h and put the newly exported prototypes in 
>>tracer.h.  Then include tracer.h in tracer.c and tree-ssa-path-split.c.

> @@ -224,6 +226,24 @@ find_trace (basic_block bb, basic_block *trace)
>     return i;
>   }
>
> +/* Transform the block that needs to be duplicated.  */
> +
> +basic_block
> +transform_duplicate(basic_block bb,
> +                    basic_block  bb2)
>>Space between the name of the function and first paren.  It looks like 
>>these two lines should be joined.  Single space between the type and the 
>>name of the argument.

>>Ultimately there's not a lot of shared code between the tracer and path 
>>splitting, which is basically what I expected.  Nevertheless, sharing a 
>>single implementation of those routines is probably wise.


> +
> +#include "config.h"
> +#include "system.h"
> +#include "coretypes.h"
> +#include "backend.h"
> +#include "cfghooks.h"
> +#include "tree.h"
> +#include "gimple.h"
> +#include "rtl.h"
> +#include "ssa.h"
> +#include "flags.h"
> +#include "alias.h"
> +#include "fold-const.h"
> +#include "stor-layout.h"
> +#include "calls.h"
> +#include "cfganal.h"
> +#include "internal-fn.h"
> +#include "gimple-fold.h"
> +#include "tree-eh.h"
> +#include "gimple-iterator.h"
> +#include "gimple-walk.h"
> +#include "tree-cfg.h"
> +#include "tree-ssa-loop-manip.h"
> +#include "tree-ssa-loop-niter.h"
> +#include "tree-ssa-loop.h"
> +#include "tree-into-ssa.h"
> +#include "tree-ssa.h"
> +#include "tree-pass.h"
> +#include "tree-dump.h"
> +#include "gimple-pretty-print.h"
> +#include "diagnostic-core.h"
> +#include "intl.h"
> +#include "cfgloop.h"
> +#include "tree-scalar-evolution.h"
> +#include "tree-ssa-propagate.h"
> +#include "tree-chrec.h"
> +#include "insn-config.h"
> +#include "expmed.h"
> +#include "dojump.h"
> +#include "explow.h"
> +#include "emit-rtl.h"
> +#include "varasm.h"
> +#include "stmt.h"
> +#include "expr.h"
> +#include "insn-codes.h"
> +#include "optabs.h"
> +#include "fibonacci_heap.h"
>>I'm having a hard time seeing how all these are needed.  Especially the 
>>RTL related includes.  These really need to be trimmed.


> +
> +extern basic_block transform_duplicate(basic_block bb, basic_block  bb2);
> +extern bool ignore_bb_p (const_basic_block bb);
>>With the prototypes moved to tracer.h, you won't want the extern 
>>declarations in here.

> +
> +/* This function gets the join blocks same as the source
> +   node of the loop latch nodes and form the trace with
> +   the join block and its predecessor.  */
>>I'm having a hard time parsing this comment. Please try to rewrite it to 
>>be clearer.

>>I suspect this routine is more complicated than it needs to be because 
>>of how you're searching for our subgraphs.


> +
> +static int
> +find_trace_loop_latch_same_as_join_blk (basic_block bb,
> +                                        basic_block *trace)
> +{
> +  vec<basic_block> bbs;
> +  basic_block bb1;
> +  unsigned int i;
> +  edge_iterator ei;
> +  edge e1;
> +  bool found = false;
> +  int n = 0;
> +
> +  bbs = get_all_dominated_blocks (CDI_DOMINATORS,
> +                                  bb );
> +  FOR_EACH_VEC_ELT (bbs, i, bb1)
> +  {
> +    FOR_EACH_EDGE (e1, ei, bb->succs)
> +    {
> +      if ( bb1 == e1->dest)
> +        {
> +          found = true;
> +        }
> +    }
> +
> +    if (!found && bb1 != bb)
> +      {
> +        found = false;
> +        FOR_EACH_EDGE (e1, ei, bb1->succs)
> +        {
> +          if (e1->flags & (EDGE_DFS_BACK))
> +            {
> +              trace[1] = e1->src;
> +              n++;
> +              found = true;
> +            }
> +        }
>>It seems to me all this can be changed to look for the backedges via the 
>>loop tree.



> +
> +        if (found && EDGE_COUNT(bb1->preds) == 2)
>>Space between EDGE_COUNT and the open paren.

>>You'd want to keep this test, then do some kind of dominance test like 
>>we talked about earlier in the discussion of this patch.

Jeff
> +          {
> +            FOR_EACH_EDGE (e1, ei, bb1->preds)
> +            {
> +              if (single_succ_p(e1->src) &&
> +                  single_succ_edge (e1->src)->flags & EDGE_FALLTHRU)
>>And you'd keep these tests in some form.

> +
> +/* This function performs the feasibility tests for path splitting
> +   to perform. Return false if the feasibility for path splitting
> +   is not done and returns true if the feasbility for path splitting
> +   is done. Following feasibility tests are performed.
> +
> +   1. Return false if the join block has rhs casting for assign
> +      gimple statements.
> +   2. If the number of phis is greater than 1 or the phi node in
> +      the join block has virtual operand return false.
> +   3. Return true if the number of sequential statements is
> +      greater than 2.
> +   4. If the phi result in the phi node of the join block is not
> +      used inside the same join block return false.
> +   7. Otherwise returns true.  */
>>These seem totally arbitrary.  What's the reason behind each of these 
>>restrictions?  None should be a correctness requirement AFAICT.  Others 
>>(like the number of statements) should probably be conditionalized on 
>>size vs speed optimizations.



 >> The join
> +   same as the source of the loop latch node is identified along
> +   with their predecessors.
>>I couldn't parse this sentence.



Based on the feasibility tests for
> +   path splitting the path splitting is performed by adding the
> +   join blocks into the predecessors after propagating the phi
> +   result with the corresponding phi arguments for the predecessors.
> +   The  tree-cfg-cleanup will merge the blocks in the predecessors
> +   path and the update-ssa will update the ssa representation after
> +   the path splitting is performed.  */
>>It would probably help to show a visual representation of what's 
>>happening.  ie, show a little CFG before and after.



> +
> +static bool
> +perform_path_splitting ()
> +{
> +  bool changed = false;
> +  basic_block trace[2] = {NULL,NULL};
> +  basic_block bb;
> +  auto_vec<fibonacci_node<long, basic_block_def>*> blocks;
> +  blocks.safe_grow_cleared (last_basic_block_for_fn (cfun));
> +  fibonacci_heap<long, basic_block_def> heap (LONG_MIN);
> +
> +  initialize_original_copy_tables();
> +  calculate_dominance_info (CDI_DOMINATORS);

> +
> +  FOR_EACH_BB_FN (bb, cfun)
> +  {
> +    if (!ignore_bb_p (bb))
> +      blocks[bb->index] = heap.insert (-bb->frequency, bb);
> +  }
> +
> +  while (!heap.empty())
>>Wouldn't it be easier to walk the loop tree?  We've got a fairly 
>>specific subgraph that we're looking for.  Why not walk the loop tree, 
>>using the latch as a point to search backwards for the shape of the 
>>subgraph we're interested in transforming?

>>That would seem to be a lot less expensive than looking at every block 
>>looking for the start of the subgraphs we're interested in.  That'd also 
>>seem to eliminate all the fibheap stuff.

> +
> +static unsigned int
> +execute_path_split (void)
> +{
> +  bool changed;
> +
> +  if (n_basic_blocks_for_fn (cfun) <= NUM_FIXED_BLOCKS + 1)
> +      return 0;
> +
> +  mark_dfs_back_edges ();
>>ISTM you could check the return value from mark_dfs_back_edges and do 
>>nothing if no back edges were found.

> +
> +static bool
> +gate_path_split(void)
> +{
> +  return flag_tree_path_split != 0;
> +}
> +
> +namespace {
> +
> +const pass_data pass_data_path_split =
> +{
> +  GIMPLE_PASS, /* type */
> +  "path_split", /* name */
> +  OPTGROUP_NONE, /* optinfo_flags */
> +  TV_TREE_PATH_SPLIT, /* tv_id */
> +  0, /* properties_required */
> +  0, /* properties_provided */
> +  0, /* properties_destroyed */
> +  0, /* todo_flags_start */
> +  TODO_update_ssa, /* todo_flags_finish */
>>It seems to me that you're probably missing some properties_required. 
>>You depend on a proper CFG and SSA graph, so at the least PROP_cfg and 
>>PROP_ssa.

I will incorporate all the above suggestions and changes and send the new patch.

>>Presumably you don't set TODO_cleanup_cfg so that you can avoid the cfg 
>>cleanup if nothing gets changed.  Is there some reason you don't do the 
>>same for the SSA graph update?

I am not sure whether the above logic can be used for the SSA graph update, or whether the SSA update and the CFG cleanup can both be
done at the same time when the CFG is changed.

Thanks & Regards
Ajit

Jeff




^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-08-20 15:40                 ` Ajit Kumar Agarwal
@ 2015-08-20 15:49                   ` Jeff Law
  2015-08-27  6:00                     ` Ajit Kumar Agarwal
  0 siblings, 1 reply; 72+ messages in thread
From: Jeff Law @ 2015-08-20 15:49 UTC (permalink / raw)
  To: Ajit Kumar Agarwal, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

On 08/20/2015 09:38 AM, Ajit Kumar Agarwal wrote:

>>
>> Bootstrapping with i386 and Microblaze target works fine. No
>> regression is seen in Deja GNU tests for Microblaze. There are lesser
>> failures. Mibench/EEMBC benchmarks were run for Microblaze target and
>> the gain of 9.3% is seen in rgbcmy_lite the EEMBC benchmarks.
>>> What do you mean by there are "lesser failures"?  Are you saying there are cases where path splitting generates incorrect code, or cases where path >>splitting produces code that is less efficient, or something else?
>
> I meant there are more Deja GNU testcases passes with the path splitting changes.
Ah, in that case, that's definitely good news!

jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-08-20 15:49                   ` Jeff Law
@ 2015-08-27  6:00                     ` Ajit Kumar Agarwal
  2015-09-09 21:45                       ` Jeff Law
  0 siblings, 1 reply; 72+ messages in thread
From: Ajit Kumar Agarwal @ 2015-08-27  6:00 UTC (permalink / raw)
  To: Jeff Law, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala



-----Original Message-----
From: Jeff Law [mailto:law@redhat.com] 
Sent: Thursday, August 20, 2015 9:19 PM
To: Ajit Kumar Agarwal; Richard Biener
Cc: GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation

On 08/20/2015 09:38 AM, Ajit Kumar Agarwal wrote:

>>
>> Bootstrapping with i386 and Microblaze target works fine. No 
>> regression is seen in Deja GNU tests for Microblaze. There are lesser 
>> failures. Mibench/EEMBC benchmarks were run for Microblaze target and 
>> the gain of 9.3% is seen in rgbcmy_lite the EEMBC benchmarks.
>>> What do you mean by there are "lesser failures"?  Are you saying there are cases where path splitting generates incorrect code, or cases where path >>splitting produces code that is less efficient, or something else?
>
> I meant there are more Deja GNU testcases passes with the path splitting changes.
>>Ah, in that case, that's definitely good news!

Thanks. The following testcase testsuite/gcc.dg/tree-ssa/ifc-5.c

void
dct_unquantize_h263_inter_c (short *block, int n, int qscale, int nCoeffs)
{
  int i, level, qmul, qadd;

  qadd = (qscale - 1) | 1;
  qmul = qscale << 1;

  for (i = 0; i <= nCoeffs; i++)
    {
      level = block[i];
      if (level < 0)
        level = level * qmul - qadd;
      else
        level = level * qmul + qadd;
      block[i] = level;
    }
}

The above loop is a candidate for path splitting, as the IF block merges at the latch of the loop, and path splitting duplicates
the latch of the loop, which is the statement block[i] = level, into the predecessor THEN and ELSE blocks.

Due to the above path splitting, IF-conversion is disabled: the above IF-THEN-ELSE is not IF-converted and the test case fails.

There were following review comments from the above patch.

+/* This function performs the feasibility tests for path splitting
> +   to perform. Return false if the feasibility for path splitting
> +   is not done and returns true if the feasibility for path splitting
> +   is done. Following feasibility tests are performed.
> +
> +   1. Return false if the join block has rhs casting for assign
> +      gimple statements.

Comments from Jeff:

>>These seem totally arbitrary.  What's the reason behind each of these 
>>restrictions?  None should be a correctness requirement AFAICT.  

In the above patch I made the check given in point 1 on the loop latch; with that check, path splitting is disabled, IF-conversion
happens, and the test case passes.

I have incorporated the above review comment by dropping the feasibility check of point 1. The above test case then goes
through path splitting, and due to path splitting if-conversion does not happen and the test case fails (it expects the pattern "Applying if conversion"
to be present). With the patch as originally given for review, the feasibility check for a cast assignment in the latch of the loop (point 1)
disables path splitting, if-conversion happens, and the above test case passes.

Please let me know whether to keep the feasibility check given in point 1, or whether some more appropriate change is required for the above
test case scenario of path splitting vs IF-conversion.

Thanks & Regards
Ajit


jeff


^ permalink raw reply	[flat|nested] 72+ messages in thread

* RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-08-19 21:53               ` Jeff Law
  2015-08-20 15:41                 ` Ajit Kumar Agarwal
@ 2015-09-04 18:07                 ` Ajit Kumar Agarwal
  2015-11-11 20:38                   ` Jeff Law
  1 sibling, 1 reply; 72+ messages in thread
From: Ajit Kumar Agarwal @ 2015-09-04 18:07 UTC (permalink / raw)
  To: Jeff Law, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

[-- Attachment #1: Type: text/plain, Size: 15724 bytes --]

All:

Thanks Jeff for the review comments.

The patch attached incorporate all the review comments given below.

Bootstrapped on i386 and Microblaze, and the Deja GNU test results for Microblaze look fine.

[Patch,tree-optimization]: Add new path Splitting pass on
 tree ssa representation.

Added a new path splitting pass on the tree SSA representation. When the two
execution paths of an IF-THEN-ELSE merge at the latch node of a loop, the
path splitting optimization duplicates the merge node into the two paths,
preserving the SSA semantics.

ChangeLog:
2015-09-05  Ajit Agarwal  <ajitkum@xilinx.com>

	* Makefile.in (OBJS): Add tree-ssa-path-split.o.
	* common.opt (ftree-path-split): Add the new flag.
	* opts.c (default_options_table): Add an entry for
	path splitting optimization at -O2 and above.
	* passes.def (path_split): Add new path splitting pass.
	* timevar.def (TV_TREE_PATH_SPLIT): New.
	* tree-pass.h (make_pass_path_split): New declaration.
	* tree-ssa-path-split.c: New file.
	* tracer.c (transform_duplicate): New function.
	* tracer.h: New header file.
	* doc/invoke.texi (ftree-path-split): Document.
	(fdump-tree-path_split): Document.
	* testsuite/gcc.dg/path-split-1.c: New.

Signed-off-by:Ajit Agarwal ajitkum@xilinx.com

Thanks & Regards
Ajit
-----Original Message-----
From: Jeff Law [mailto:law@redhat.com] 
Sent: Thursday, August 20, 2015 3:16 AM
To: Ajit Kumar Agarwal; Richard Biener
Cc: GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation

On 08/15/2015 11:01 AM, Ajit Kumar Agarwal wrote:
>
>
>  From cf2b64cc1d6623424d770f2a9ea257eb7e58e887 Mon Sep 17 00:00:00 
> 2001
> From: Ajit Kumar Agarwal<ajitkum@xilix.com>
> Date: Sat, 15 Aug 2015 18:19:14 +0200
> Subject: [PATCH] [Patch,tree-optimization]: Add new path Splitting pass on
>   tree ssa representation.
>
> Added a new pass on path splitting on tree SSA representation. The 
> path splitting optimization does the CFG transformation of join block 
> of the if-then-else same as the loop latch node is moved and merged 
> with the predecessor blocks after preserving the SSA representation.
>
> ChangeLog:
> 2015-08-15  Ajit Agarwal<ajitkum@xilinx.com>
>
> 	* gcc/Makefile.in: Add the build of the new file
> 	tree-ssa-path-split.c
Instead:

	* Makefile.in (OBJS): Add tree-ssa-path-split.o.


> 	* gcc/opts.c (OPT_ftree_path_split) : Add an entry for
> 	Path splitting pass with optimization flag greater and
> 	equal to O2.

	* opts.c (default_options_table): Add entry for path splitting
	optimization at -O2 and above.



> 	* gcc/passes.def (path_split): add new path splitting pass.
Capitalize "add".




> 	* gcc/tree-ssa-path-split.c: New.
Use "New file".

> 	* gcc/tracer.c (transform_duplicate): New.
Use "New function".

> 	* gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c: New.
> 	* gcc/testsuite/gcc.dg/path-split-1.c: New.
These belong in gcc/testsuite/ChangeLog and remove the "gcc/testsuite" 
prefix.

> 	* gcc/doc/invoke.texi
> 	(ftree-path-split): Document.
> 	(fdump-tree-path_split): Document.
Should just be two lines instead of three.

And more generally, there's no need to prefix ChangeLog entries with "gcc/".

Now that the ChangeLog nits are out of the way, let's get to stuff that's more interesting.



>
> Signed-off-by:Ajit Agarwalajitkum@xilinx.com
> ---
>   gcc/Makefile.in                              |   1 +
>   gcc/common.opt                               |   4 +
>   gcc/doc/invoke.texi                          |  16 +-
>   gcc/opts.c                                   |   1 +
>   gcc/passes.def                               |   1 +
>   gcc/testsuite/gcc.dg/path-split-1.c          |  65 ++++++
>   gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c |  60 +++++
>   gcc/timevar.def                              |   1 +
>   gcc/tracer.c                                 |  37 +--
>   gcc/tree-pass.h                              |   1 +
>   gcc/tree-ssa-path-split.c                    | 330 +++++++++++++++++++++++++++
>   11 files changed, 503 insertions(+), 14 deletions(-)
>   create mode 100644 gcc/testsuite/gcc.dg/path-split-1.c
>   create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/path-split-2.c
>   create mode 100644 gcc/tree-ssa-path-split.c
>
> diff --git a/gcc/common.opt b/gcc/common.opt
> index e80eadf..1d02582 100644
> --- a/gcc/common.opt
> +++ b/gcc/common.opt
> @@ -2378,6 +2378,10 @@ ftree-vrp
>   Common Report Var(flag_tree_vrp) Init(0) Optimization
>   Perform Value Range Propagation on trees
>
> +ftree-path-split
> +Common Report Var(flag_tree_path_split) Init(0) Optimization
> +Perform Path Splitting
Maybe "Perform Path Splitting for loop backedges" or something which is 
a little more descriptive.  The above isn't exactly right, so don't use 
it as-is.



> @@ -9068,6 +9075,13 @@ enabled by default at @option{-O2} and higher.  Null pointer check
>   elimination is only done if @option{-fdelete-null-pointer-checks} is
>   enabled.
>
> +@item -ftree-path-split
> +@opindex ftree-path-split
> +Perform Path Splitting  on trees.  The join blocks of IF-THEN-ELSE same
> +as loop latch node is moved to its predecessor and the loop latch node
> +will be forwarding block.  This is enabled by default at @option{-O2}
> +and higher.
Needs some work.  Maybe something along the lines of

When two paths of execution merge immediately before a loop latch node, 
try to duplicate the merge node into the two paths.

> diff --git a/gcc/passes.def b/gcc/passes.def
> index 6b66f8f..20ddf3d 100644
> --- a/gcc/passes.def
> +++ b/gcc/passes.def
> @@ -82,6 +82,7 @@ along with GCC; see the file COPYING3.  If not see
>   	  NEXT_PASS (pass_ccp);
>   	  /* After CCP we rewrite no longer addressed locals into SSA
>   	     form if possible.  */
> +          NEXT_PASS (pass_path_split);
>   	  NEXT_PASS (pass_forwprop);
>   	  NEXT_PASS (pass_sra_early);
I can't recall if we've discussed the location of the pass at all.  I'm 
not objecting to this location, but would like to hear why you chose 
this particular location in the optimization pipeline.

>   	  /* pass_build_ealias is a dummy pass that ensures that we
> diff --git a/gcc/testsuite/gcc.dg/path-split-1.c b/gcc/testsuite/gcc.dg/path-split-1.c
ISTM the two tests should be combined into a single test.  I didn't see 
a functional difference in the test() function between those two tests.

I believe you can still create/scan debugging dumps with dg-do run test.


> +DEFTIMEVAR (TV_TREE_PATH_SPLIT  , "tree path_split")
tree path split rather than using underscores

> diff --git a/gcc/tracer.c b/gcc/tracer.c
> index cad7ab1..206692f 100644
> --- a/gcc/tracer.c
> +++ b/gcc/tracer.c
> @@ -58,11 +58,13 @@
>   #include "fibonacci_heap.h"
>
>   static int count_insns (basic_block);
> -static bool ignore_bb_p (const_basic_block);
> +bool ignore_bb_p (const_basic_block);
>   static bool better_p (const_edge, const_edge);
>   static edge find_best_successor (basic_block);
>   static edge find_best_predecessor (basic_block);
>   static int find_trace (basic_block, basic_block *);
> +basic_block transform_duplicate(basic_block bb,
> +                                basic_block  bb2);
Please create a tracer.h and put the newly exported prototypes in 
tracer.h.  Then include tracer.h in tracer.c and tree-ssa-path-split.c.

> @@ -224,6 +226,24 @@ find_trace (basic_block bb, basic_block *trace)
>     return i;
>   }
>
> +/* Transform the block that needs to be duplicated.  */
> +
> +basic_block
> +transform_duplicate(basic_block bb,
> +                    basic_block  bb2)
Space between the name of the function and first paren.  It looks like 
these two lines should be joined.  Single space between the type and the 
name of the argument.

Ultimately there's not a lot of shared code between the tracer and path 
splitting, which is basically what I expected.  Nevertheless, sharing a 
single implementation of those routines is probably wise.


> +
> +#include "config.h"
> +#include "system.h"
> +#include "coretypes.h"
> +#include "backend.h"
> +#include "cfghooks.h"
> +#include "tree.h"
> +#include "gimple.h"
> +#include "rtl.h"
> +#include "ssa.h"
> +#include "flags.h"
> +#include "alias.h"
> +#include "fold-const.h"
> +#include "stor-layout.h"
> +#include "calls.h"
> +#include "cfganal.h"
> +#include "internal-fn.h"
> +#include "gimple-fold.h"
> +#include "tree-eh.h"
> +#include "gimple-iterator.h"
> +#include "gimple-walk.h"
> +#include "tree-cfg.h"
> +#include "tree-ssa-loop-manip.h"
> +#include "tree-ssa-loop-niter.h"
> +#include "tree-ssa-loop.h"
> +#include "tree-into-ssa.h"
> +#include "tree-ssa.h"
> +#include "tree-pass.h"
> +#include "tree-dump.h"
> +#include "gimple-pretty-print.h"
> +#include "diagnostic-core.h"
> +#include "intl.h"
> +#include "cfgloop.h"
> +#include "tree-scalar-evolution.h"
> +#include "tree-ssa-propagate.h"
> +#include "tree-chrec.h"
> +#include "insn-config.h"
> +#include "expmed.h"
> +#include "dojump.h"
> +#include "explow.h"
> +#include "emit-rtl.h"
> +#include "varasm.h"
> +#include "stmt.h"
> +#include "expr.h"
> +#include "insn-codes.h"
> +#include "optabs.h"
> +#include "fibonacci_heap.h"
I'm having a hard time seeing how all these are needed.  Especially the 
RTL related includes.  These really need to be trimmed.


> +
> +extern basic_block transform_duplicate(basic_block bb, basic_block  bb2);
> +extern bool ignore_bb_p (const_basic_block bb);
With the prototypes moved to tracer.h, you won't want the extern 
declarations in here.

> +
> +/* This function gets the join blocks same as the source
> +   node of the loop latch nodes and form the trace with
> +   the join block and its predecessor.  */
I'm having a hard time parsing this comment. Please try to rewrite it to 
be clearer.

I suspect this routine is more complicated than it needs to be because 
of how you're searching for our subgraphs.


> +
> +static int
> +find_trace_loop_latch_same_as_join_blk (basic_block bb,
> +                                        basic_block *trace)
> +{
> +  vec<basic_block> bbs;
> +  basic_block bb1;
> +  unsigned int i;
> +  edge_iterator ei;
> +  edge e1;
> +  bool found = false;
> +  int n = 0;
> +
> +  bbs = get_all_dominated_blocks (CDI_DOMINATORS,
> +                                  bb );
> +  FOR_EACH_VEC_ELT (bbs, i, bb1)
> +  {
> +    FOR_EACH_EDGE (e1, ei, bb->succs)
> +    {
> +      if ( bb1 == e1->dest)
> +        {
> +          found = true;
> +        }
> +    }
> +
> +    if (!found && bb1 != bb)
> +      {
> +        found = false;
> +        FOR_EACH_EDGE (e1, ei, bb1->succs)
> +        {
> +          if (e1->flags & (EDGE_DFS_BACK))
> +            {
> +              trace[1] = e1->src;
> +              n++;
> +              found = true;
> +            }
> +        }
It seems to me all this can be changed to look for the backedges via the 
loop tree.



> +
> +        if (found && EDGE_COUNT(bb1->preds) == 2)
Space between EDGE_COUNT and the open paren.

You'd want to keep this test, then do some kind of dominance test like 
we talked about earlier in the discussion of this patch.

Jeff
> +          {
> +            FOR_EACH_EDGE (e1, ei, bb1->preds)
> +            {
> +              if (single_succ_p(e1->src) &&
> +                  single_succ_edge (e1->src)->flags & EDGE_FALLTHRU)
And you'd keep these tests in some form.

> +
> +/* This function performs the feasibility tests for path splitting
> +   to perform. Return false if the feasibility for path splitting
> +   is not done and returns true if the feasbility for path splitting
> +   is done. Following feasibility tests are performed.
> +
> +   1. Return false if the join block has rhs casting for assign
> +      gimple statements.
> +   2. If the number of phis is greater than 1 or the phi node in
> +      the join block has virtual operand return false.
> +   3. Return true if the number of sequential statements is
> +      greater than 2.
> +   4. If the phi result in the phi node of the join block is not
> +      used inside the same join block return false.
> +   7. Otherwise returns true.  */
These seem totally arbitrary.  What's the reason behind each of these 
restrictions?  None should be a correctness requirement AFAICT.  Others 
(like the number of statements) should probably be conditionalized on 
size vs speed optimizations.



  The join
> +   same as the source of the loop latch node is identified along
> +   with their predecessors.
I couldn't parse this sentence.



Based on the feasibility tests for
> +   path splitting the path splitting is performed by adding the
> +   join blocks into the predecessors after propagating the phi
> +   result with the corresponding phi arguments for the predecessors.
> +   The  tree-cfg-cleanup will merge the blocks in the predecessors
> +   path and the update-ssa will update the ssa representation after
> +   the path splitting is performed.  */
It would probably help to show a visual representation of what's 
happening.  ie, show a little CFG before and after.



> +
> +static bool
> +perform_path_splitting ()
> +{
> +  bool changed = false;
> +  basic_block trace[2] = {NULL,NULL};
> +  basic_block bb;
> +  auto_vec<fibonacci_node<long, basic_block_def>*> blocks;
> +  blocks.safe_grow_cleared (last_basic_block_for_fn (cfun));
> +  fibonacci_heap<long, basic_block_def> heap (LONG_MIN);
> +
> +  initialize_original_copy_tables();
> +  calculate_dominance_info (CDI_DOMINATORS);

> +
> +  FOR_EACH_BB_FN (bb, cfun)
> +  {
> +    if (!ignore_bb_p (bb))
> +      blocks[bb->index] = heap.insert (-bb->frequency, bb);
> +  }
> +
> +  while (!heap.empty())
Wouldn't it be easier to walk the loop tree?  We've got a fairly 
specific subgraph that we're looking for.  Why not walk the loop tree, 
using the latch as a point to search backwards for the shape of the 
subgraph we're interested in transforming?

That would seem to be a lot less expensive than looking at every block 
looking for the start of the subgraphs we're interested in.  That'd also 
seem to eliminate all the fibheap stuff.

> +
> +static unsigned int
> +execute_path_split (void)
> +{
> +  bool changed;
> +
> +  if (n_basic_blocks_for_fn (cfun) <= NUM_FIXED_BLOCKS + 1)
> +      return 0;
> +
> +  mark_dfs_back_edges ();
ISTM you could check the return value from mark_dfs_back_edges and do 
nothing if no back edges were found.

> +
> +static bool
> +gate_path_split(void)
> +{
> +  return flag_tree_path_split != 0;
> +}
> +
> +namespace {
> +
> +const pass_data pass_data_path_split =
> +{
> +  GIMPLE_PASS, /* type */
> +  "path_split", /* name */
> +  OPTGROUP_NONE, /* optinfo_flags */
> +  TV_TREE_PATH_SPLIT, /* tv_id */
> +  0, /* properties_required */
> +  0, /* properties_provided */
> +  0, /* properties_destroyed */
> +  0, /* todo_flags_start */
> +  TODO_update_ssa, /* todo_flags_finish */
It seems to me that you're probably missing some properties_required. 
You depend on a proper CFG and SSA graph, so at the least PROP_cfg and 
PROP_ssa.

Presumably you don't set TODO_cleanup_cfg so that you can avoid the cfg 
cleanup if nothing gets changed.  Is there some reason you don't do the 
same for the SSA graph update?

Jeff




[-- Attachment #2: path-splitting.patch.patch --]
[-- Type: application/octet-stream, Size: 18398 bytes --]

From 5280ddaddf71ed2f15340b7e6be12682294134ea Mon Sep 17 00:00:00 2001
From: Ajit Kumar Agarwal <ajitkum@xilix.com>
Date: Fri, 4 Sep 2015 18:50:48 +0200
Subject: [PATCH] [Patch,tree-optimization]: Add new path Splitting pass on
 tree ssa representation.

Added a new path splitting pass on the tree SSA representation. When the two
execution paths of an IF-THEN-ELSE merge at the latch node of a loop, the
path splitting optimization duplicates the merge node into the two paths,
preserving the SSA semantics.

ChangeLog:
2015-09-05  Ajit Agarwal  <ajitkum@xilinx.com>

	* Makefile.in (OBJS): Add tree-ssa-path-split.o.
	* common.opt (ftree-path-split): Add the new flag.
	* opts.c (default_options_table): Add an entry for
	path splitting optimization at -O2 and above.
	* passes.def (path_split): Add new path splitting pass.
	* timevar.def (TV_TREE_PATH_SPLIT): New.
	* tree-pass.h (make_pass_path_split): New declaration.
	* tree-ssa-path-split.c: New file.
	* tracer.c (transform_duplicate): New function.
	* tracer.h: New header file.
	* doc/invoke.texi (ftree-path-split): Document.
	(fdump-tree-path_split): Document.
	* testsuite/gcc.dg/path-split-1.c: New.

Signed-off-by:Ajit Agarwal ajitkum@xilinx.com
---
 gcc/Makefile.in                     |   1 +
 gcc/common.opt                      |   4 +
 gcc/doc/invoke.texi                 |  16 ++-
 gcc/opts.c                          |   1 +
 gcc/passes.def                      |   1 +
 gcc/testsuite/gcc.dg/path-split-1.c |  69 ++++++++++
 gcc/timevar.def                     |   1 +
 gcc/tracer.c                        |  33 +++--
 gcc/tracer.h                        |  26 ++++
 gcc/tree-pass.h                     |   1 +
 gcc/tree-ssa-path-split.c           | 255 ++++++++++++++++++++++++++++++++++++
 11 files changed, 394 insertions(+), 14 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/path-split-1.c
 create mode 100644 gcc/tracer.h
 create mode 100644 gcc/tree-ssa-path-split.c

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 3d1c1e5..7d3abf6 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1462,6 +1462,7 @@ OBJS = \
 	tree-ssa-loop.o \
 	tree-ssa-math-opts.o \
 	tree-ssa-operands.o \
+	tree-ssa-path-split.o \
 	tree-ssa-phiopt.o \
 	tree-ssa-phiprop.o \
 	tree-ssa-pre.o \
diff --git a/gcc/common.opt b/gcc/common.opt
index 94d1d88..da76d74 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -2378,6 +2378,10 @@ ftree-vrp
 Common Report Var(flag_tree_vrp) Init(0) Optimization
 Perform Value Range Propagation on trees
 
+ftree-path-split
+Common Report Var(flag_tree_path_split) Init(0) Optimization
+Perform Path Splitting on trees for loop backedges
+
 funit-at-a-time
 Common Report Var(flag_unit_at_a_time) Init(1)
 Compile whole compilation unit at a time
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index d7dc64e..c5e06e2 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -349,6 +349,7 @@ Objective-C and Objective-C++ Dialects}.
 -fdump-tree-fre@r{[}-@var{n}@r{]} @gol
 -fdump-tree-vtable-verify @gol
 -fdump-tree-vrp@r{[}-@var{n}@r{]} @gol
+-fdump-tree-path_split@r{[}-@var{n}@r{]} @gol
 -fdump-tree-storeccp@r{[}-@var{n}@r{]} @gol
 -fdump-final-insns=@var{file} @gol
 -fcompare-debug@r{[}=@var{opts}@r{]}  -fcompare-debug-second @gol
@@ -456,7 +457,7 @@ Objective-C and Objective-C++ Dialects}.
 -ftree-parallelize-loops=@var{n} -ftree-pre -ftree-partial-pre -ftree-pta @gol
 -ftree-reassoc -ftree-sink -ftree-slsr -ftree-sra @gol
 -ftree-switch-conversion -ftree-tail-merge -ftree-ter @gol
--ftree-vectorize -ftree-vrp @gol
+-ftree-vectorize -ftree-vrp @gol -ftree-path-split @gol
 -funit-at-a-time -funroll-all-loops -funroll-loops @gol
 -funsafe-loop-optimizations -funsafe-math-optimizations -funswitch-loops @gol
 -fipa-ra -fvariable-expansion-in-unroller -fvect-cost-model -fvpt @gol
@@ -7154,6 +7155,11 @@ is made by appending @file{.slp} to the source file name.
 Dump each function after Value Range Propagation (VRP).  The file name
 is made by appending @file{.vrp} to the source file name.
 
+@item path_split
+@opindex fdump-tree-path_split
+Dump each function after path splitting.  The file name is made by
+appending @file{.path_split} to the source file name.
+
 @item all
 @opindex fdump-tree-all
 Enable all the available tree dumps with the flags provided in this option.
@@ -7656,6 +7662,7 @@ also turns on the following optimization flags:
 -ftree-switch-conversion -ftree-tail-merge @gol
 -ftree-pre @gol
 -ftree-vrp @gol
+-ftree-path-split @gol
 -fipa-ra}
 
 Please note the warning under @option{-fgcse} about
@@ -8957,6 +8964,13 @@ enabled by default at @option{-O2} and higher.  Null pointer check
 elimination is only done if @option{-fdelete-null-pointer-checks} is
 enabled.
 
+@item -ftree-path-split
+@opindex ftree-path-split
+Perform path splitting on trees.  When the two execution paths of an
+if-then-else merge at the loop latch node, try to duplicate the
+merge node into the two paths.  This is enabled by default at @option{-O2}
+and above.
+
 @item -fsplit-ivs-in-unroller
 @opindex fsplit-ivs-in-unroller
 Enables expression of values of induction variables in later iterations
diff --git a/gcc/opts.c b/gcc/opts.c
index f1a9acd..b75becc 100644
--- a/gcc/opts.c
+++ b/gcc/opts.c
@@ -506,6 +506,7 @@ static const struct default_options default_options_table[] =
     { OPT_LEVELS_2_PLUS, OPT_fisolate_erroneous_paths_dereference, NULL, 1 },
     { OPT_LEVELS_2_PLUS, OPT_fipa_ra, NULL, 1 },
     { OPT_LEVELS_2_PLUS, OPT_flra_remat, NULL, 1 },
+    { OPT_LEVELS_2_PLUS, OPT_ftree_path_split, NULL, 1 },
 
     /* -O3 optimizations.  */
     { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 },
diff --git a/gcc/passes.def b/gcc/passes.def
index 64fc4d9..536ef32 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -81,6 +81,7 @@ along with GCC; see the file COPYING3.  If not see
 	  NEXT_PASS (pass_ccp);
 	  /* After CCP we rewrite no longer addressed locals into SSA
 	     form if possible.  */
+          NEXT_PASS (pass_path_split);
 	  NEXT_PASS (pass_forwprop);
 	  NEXT_PASS (pass_sra_early);
 	  /* pass_build_ealias is a dummy pass that ensures that we
diff --git a/gcc/testsuite/gcc.dg/path-split-1.c b/gcc/testsuite/gcc.dg/path-split-1.c
new file mode 100644
index 0000000..af01e49
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/path-split-1.c
@@ -0,0 +1,69 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fdump-tree-path_split " } */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define RGBMAX 255
+
+int
+test()
+{
+  int i, Pels;
+  unsigned char sum = 0;
+  unsigned char xr, xg, xb;
+  unsigned char xc, xm, xy, xk;
+  unsigned char *ReadPtr, *EritePtr;
+
+  ReadPtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
+  EritePtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
+
+  for (i = 0; i < 100;i++)
+     {
+       ReadPtr[i] = 100 - i;
+     }
+
+  for (i = 0; i < 100; i++)
+     {
+       xr = *ReadPtr++;
+       xg = *ReadPtr++;
+       xb = *ReadPtr++;
+
+       xc = (unsigned char) (RGBMAX - xr);
+       xm = (unsigned char) (RGBMAX - xg);
+       xy = (unsigned char) (RGBMAX - xb);
+
+       if (xc < xm)
+         {
+           xk = (unsigned char) (xc < xy ? xc : xy);
+         }
+       else
+        {
+          xk = (unsigned char) (xm < xy ? xm : xy);
+        }
+
+       xc = (unsigned char) (xc - xk);
+       xm = (unsigned char) (xm - xk);
+       xy = (unsigned char) (xy - xk);
+
+       *EritePtr++ = xc;
+       *EritePtr++ = xm;
+       *EritePtr++ = xy;
+       *EritePtr++ = xk;
+       sum += *EritePtr;
+    }
+  return sum;
+}
+
+int
+main()
+{
+  if (test() != 33)
+    abort();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "xc_\[0-9\]\[0-9\]* -> { xc_\[0-9\]\[0-9\]* }" "path_split"} } */
+/* { dg-final { scan-tree-dump "xm_\[0-9\]\[0-9\]* -> { xm_\[0-9\]\[0-9\]* }" "path_split"} } */
+/* { dg-final { scan-tree-dump "xy_\[0-9\]\[0-9\]* -> { xy_\[0-9\]\[0-9\]* }" "path_split"} } */
diff --git a/gcc/timevar.def b/gcc/timevar.def
index ac41075..e57e2ab 100644
--- a/gcc/timevar.def
+++ b/gcc/timevar.def
@@ -298,3 +298,4 @@ DEFTIMEVAR (TV_LINK		     , "link JIT code")
 DEFTIMEVAR (TV_LOAD		     , "load JIT result")
 DEFTIMEVAR (TV_JIT_ACQUIRING_MUTEX   , "acquiring JIT mutex")
 DEFTIMEVAR (TV_JIT_CLIENT_CODE   , "JIT client code")
+DEFTIMEVAR (TV_TREE_PATH_SPLIT  , "tree path split")
diff --git a/gcc/tracer.c b/gcc/tracer.c
index cad7ab1..fb618d6 100644
--- a/gcc/tracer.c
+++ b/gcc/tracer.c
@@ -56,9 +56,9 @@
 #include "tree-inline.h"
 #include "cfgloop.h"
 #include "fibonacci_heap.h"
+#include "tracer.h"
 
 static int count_insns (basic_block);
-static bool ignore_bb_p (const_basic_block);
 static bool better_p (const_edge, const_edge);
 static edge find_best_successor (basic_block);
 static edge find_best_predecessor (basic_block);
@@ -90,7 +90,7 @@ bb_seen_p (basic_block bb)
 }
 
 /* Return true if we should ignore the basic block for purposes of tracing.  */
-static bool
+bool
 ignore_bb_p (const_basic_block bb)
 {
   gimple g;
@@ -224,6 +224,22 @@ find_trace (basic_block bb, basic_block *trace)
   return i;
 }
 
+basic_block
+transform_duplicate (basic_block bb, basic_block bb2)
+{
+  edge e;
+  basic_block copy;
+
+  e = find_edge (bb, bb2);
+
+  copy = duplicate_block (bb2, e, bb);
+  flush_pending_stmts (e);
+
+  add_phi_args_after_copy (&copy, 1, NULL);
+
+  return (copy);
+}
+
 /* Look for basic blocks in frequency order, construct traces and tail duplicate
    if profitable.  */
 
@@ -319,17 +335,8 @@ tail_duplicate (void)
 		 entries or at least rotate the loop.  */
 	      && bb2->loop_father->header != bb2)
 	    {
-	      edge e;
-	      basic_block copy;
-
-	      nduplicated += counts [bb2->index];
-
-	      e = find_edge (bb, bb2);
-
-	      copy = duplicate_block (bb2, e, bb);
-	      flush_pending_stmts (e);
-
-	      add_phi_args_after_copy (&copy, 1, NULL);
+              nduplicated += counts [bb2->index];
+              basic_block copy = transform_duplicate (bb, bb2);
 
 	      /* Reconsider the original copy of block we've duplicated.
 	         Removing the most common predecessor may make it to be
diff --git a/gcc/tracer.h b/gcc/tracer.h
new file mode 100644
index 0000000..454d3b7
--- /dev/null
+++ b/gcc/tracer.h
@@ -0,0 +1,26 @@
+/* Header file for Tracer.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_TRACER_H
+#define GCC_TRACER_H
+
+extern basic_block transform_duplicate (basic_block bb, basic_block bb2);
+extern bool ignore_bb_p (const_basic_block bb);
+
+#endif /* GCC_TRACER_H */
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 7b66a1c..6af7f0d 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -383,6 +383,7 @@ extern gimple_opt_pass *make_pass_tree_loop_done (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_ch (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_ch_vect (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_ccp (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_path_split (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_phi_only_cprop (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_build_ssa (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_build_alias (gcc::context *ctxt);
diff --git a/gcc/tree-ssa-path-split.c b/gcc/tree-ssa-path-split.c
new file mode 100644
index 0000000..f8fd098
--- /dev/null
+++ b/gcc/tree-ssa-path-split.c
@@ -0,0 +1,255 @@
+/* Support routines for Path Splitting.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   Contributed by Ajit Kumar Agarwal <ajitkum@xilinx.com>.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "cfghooks.h"
+#include "tree.h"
+#include "gimple.h"
+#include "rtl.h"
+#include "ssa.h"
+#include "flags.h"
+#include "alias.h"
+#include "fold-const.h"
+#include "stor-layout.h"
+#include "calls.h"
+#include "cfganal.h"
+#include "internal-fn.h"
+#include "gimple-fold.h"
+#include "tree-eh.h"
+#include "gimple-iterator.h"
+#include "gimple-walk.h"
+#include "tree-cfg.h"
+#include "tree-ssa-loop-manip.h"
+#include "tree-ssa-loop-niter.h"
+#include "tree-ssa-loop.h"
+#include "tree-into-ssa.h"
+#include "tree-ssa.h"
+#include "tree-pass.h"
+#include "tree-dump.h"
+#include "cfgloop.h"
+#include "tree-scalar-evolution.h"
+#include "tree-ssa-propagate.h"
+#include "tree-chrec.h"
+#include "tracer.h"
+
+/* Look for a path-splitting candidate in LOOP.  The latch is treated
+   as the join block of an if-then-else when it has exactly two
+   predecessors and its immediate dominator ends in a GIMPLE_COND.
+   On success TRACE[1] is the latch and TRACE[0] is its first
+   predecessor, provided that block's only outgoing edge falls through
+   to the latch.  Slots that do not apply are set to NULL.  */
+
+static void
+find_trace_loop_latch_same_as_join_blk (loop_p loop, basic_block *trace)
+{
+  basic_block latch = loop->latch;
+
+  /* Clear both slots first so that a partial result left behind by a
+     previously examined loop can never be paired with this latch.  */
+  trace[0] = NULL;
+  trace[1] = NULL;
+
+  if (EDGE_COUNT (latch->preds) != 2)
+    return;
+
+  basic_block bb = get_immediate_dominator (CDI_DOMINATORS, latch);
+  gimple last = gsi_stmt (gsi_last_bb (bb));
+
+  /* An empty dominator block cannot end in a conditional branch.  */
+  if (!last || gimple_code (last) != GIMPLE_COND)
+    return;
+
+  trace[1] = latch;
+
+  /* Only the first incoming edge is examined; the other predecessor
+     is not tried as a fallback.  */
+  basic_block pred = EDGE_PRED (latch, 0)->src;
+  if (single_succ_p (pred)
+      && (single_succ_edge (pred)->flags & EDGE_FALLTHRU))
+    trace[0] = pred;
+}
+
+/* Return true if BB is worth duplicating: it must contain no cast
+   assignment and must hold more than one non-debug statement.  */
+
+static bool
+is_feasible_trace (basic_block bb)
+{
+  int real_stmts = 0;
+
+  for (gimple_stmt_iterator it = gsi_start_bb (bb);
+       !gsi_end_p (it);
+       gsi_next (&it))
+    {
+      gimple cur = gsi_stmt (it);
+      /* A single cast anywhere in the block rejects it outright.  */
+      if (gimple_assign_cast_p (cur))
+        return false;
+
+      /* Debug statements do not count towards the block size.  */
+      real_stmts += is_gimple_debug (cur) ? 0 : 1;
+    }
+
+  return real_stmts > 1;
+}
+
+/* If the immediate dominator of the loop latch is a block
+   that ends in a conditional branch, the loop latch is
+   duplicated into its predecessor's path, preserving the
+   SSA semantics.
+
+   CFG before transformation.
+ 
+   <bb 6>:
+      xk_35 = MIN_EXPR <xy_34, xc_32>;
+      goto <bb 8>;
+
+   <bb 7>:
+      xk_36 = MIN_EXPR <xy_34, xm_33>;
+
+   <bb 8>:
+      # xk_4 = PHI <xk_35(6), xk_36(7)>
+      xc_37 = xc_32 - xk_4;
+      xm_38 = xm_33 - xk_4;
+      xy_39 = xy_34 - xk_4;
+
+   CFG After Path Splitting transformation
+   before cleanup phase.
+
+   <bb 7>:
+     xk_35 = MIN_EXPR <xy_34, xc_32>;
+
+   <bb 8>:
+     # xk_29 = PHI <xk_35(7)>
+     xc_56 = xc_32 - xk_29;
+     xm_57 = xm_33 - xk_29;
+     xy_58 = xy_34 - xk_29;
+     goto <bb 11>;
+
+   <bb 9>:
+     xk_36 = MIN_EXPR <xy_34, xm_33>;
+
+   <bb 10>:
+     # xk_4 = PHI <xk_36(9)>
+     xc_37 = xc_32 - xk_4;
+     xm_38 = xm_33 - xk_4;
+     xy_39 = xy_34 - xk_4;
+
+  <bb 11>: .......  */
+ 
+static bool
+perform_path_splitting ()
+{
+  bool changed = false;  /* Set once any latch has been duplicated.  */
+  loop_p loop;
+
+  loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS);
+  initialize_original_copy_tables ();
+  calculate_dominance_info (CDI_DOMINATORS);
+
+  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
+  {
+    /* Use a fresh trace per loop: the finder may fill only TRACE[1],
+       so a stale TRACE[0] must never be paired with a new latch.  */
+    basic_block trace[2] = {NULL, NULL};
+    if (ignore_bb_p (loop->latch))
+      continue;
+    find_trace_loop_latch_same_as_join_blk (loop, trace);
+
+    if (trace[0] && trace[1] && is_feasible_trace (trace[1]))
+      {
+        transform_duplicate (trace[0], trace[1]);
+        changed = true;
+      }
+  }
+  loop_optimizer_finalize ();
+  free_original_copy_tables ();
+  return changed;
+}
+
+/* Entry point of the pass.  Return TODO_cleanup_cfg if any loop
+   latch was duplicated, or 0 if the function was left unchanged.  */
+
+static unsigned int
+execute_path_split (void)
+{
+  /* Give up early when the function has at most one block beyond the
+     fixed ones, or when the depth-first walk reports no back edge.  */
+  if (n_basic_blocks_for_fn (cfun) <= NUM_FIXED_BLOCKS + 1
+      || !mark_dfs_back_edges ())
+    return 0;
+
+  if (!perform_path_splitting ())
+    return 0;
+
+  /* The CFG changed, so the dominator tree is stale.  */
+  free_dominance_info (CDI_DOMINATORS);
+
+  /* If we changed the CFG schedule loops for fixup by cleanup_cfg.  */
+  if (current_loops)
+    loops_state_set (LOOPS_NEED_FIXUP);
+
+  return TODO_cleanup_cfg;
+}
+
+static bool
+gate_path_split(void)
+{
+  return flag_tree_path_split != 0;  /* Run only when the flag is set.  */
+}
+
+namespace {
+
+/* Pass descriptor: needs SSA form and requests an SSA update on exit.  */
+const pass_data pass_data_path_split =
+{
+  GIMPLE_PASS, /* type */
+  "path_split", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_TREE_PATH_SPLIT, /* tv_id */
+  PROP_ssa, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_update_ssa, /* todo_flags_finish */
+};
+
+class pass_path_split : public gimple_opt_pass
+{
+   public:
+    pass_path_split (gcc::context *ctxt)
+      : gimple_opt_pass (pass_data_path_split, ctxt)
+    {}
+   /* opt_pass methods: */
+   opt_pass * clone () { return new pass_path_split (m_ctxt); }
+   virtual bool gate (function *) { return gate_path_split (); }
+   virtual unsigned int execute (function *) { return execute_path_split (); }
+}; // class pass_path_split
+
+} // anon namespace
+
+gimple_opt_pass *
+make_pass_path_split (gcc::context *ctxt)
+{
+  return new pass_path_split (ctxt);  /* Fresh pass object for CTXT.  */
+}
-- 
1.8.2.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-08-27  6:00                     ` Ajit Kumar Agarwal
@ 2015-09-09 21:45                       ` Jeff Law
  2015-09-12 12:05                         ` Ajit Kumar Agarwal
  0 siblings, 1 reply; 72+ messages in thread
From: Jeff Law @ 2015-09-09 21:45 UTC (permalink / raw)
  To: Ajit Kumar Agarwal, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

On 08/26/2015 11:29 PM, Ajit Kumar Agarwal wrote:
>
> Thanks. The following testcase testsuite/gcc.dg/tree-ssa/ifc-5.c
>
> void dct_unquantize_h263_inter_c (short *block, int n, int qscale,
> int nCoeffs) { int i, level, qmul, qadd;
>
> qadd = (qscale - 1) | 1; qmul = qscale << 1;
>
> for (i = 0; i <= nCoeffs; i++) { level = block[i]; if (level < 0)
> level = level * qmul - qadd; else level = level * qmul + qadd;
> block[i] = level; } }
>
> The above Loop is a candidate of path splitting as the IF block
> merges at the latch of the Loop and the path splitting duplicates The
> latch of the loop which is the statement block[i] = level into the
> predecessors THEN and ELSE block.
>
> Due to above path splitting,  the IF conversion is disabled and the
> above IF-THEN-ELSE is not IF-converted and the test case fails.
So I think the question then becomes which of the two styles generally 
results in better code?  The path-split version or the older 
if-converted version.

If the latter, then this may suggest that we've got the path splitting 
code at the wrong stage in the optimizer pipeline or that we need better 
heuristics for when to avoid applying path splitting.



>
> There were following review comments from the above patch.
>
> +/* This function performs the feasibility tests for path splitting
>> +   to perform. Return false if the feasibility for path splitting
>> +   is not done and returns true if the feasibility for path
>> splitting +   is done. Following feasibility tests are performed.
>> + +   1. Return false if the join block has rhs casting for assign
>> +      gimple statements.
>
> Comments from Jeff:
>
>>> These seem totally arbitrary.  What's the reason behind each of
>>> these restrictions?  None should be a correctness requirement
>>> AFAICT.
>
> In the above patch I have made a check given in point 1. in the loop
> latch and the Path splitting is disabled and the IF-conversion
> happens and the test case passes.
That sounds more like a work-around/hack.  There's nothing inherent with 
a type conversion that should disable path splitting.

What happens if we delay path splitting to a point after if-conversion 
is complete?

Alternately, could if-conversion export a routine which indicates if a 
particular sub-graph is likely to be if-convertable?  The path splitting 
pass could then use that routine to help determine if the path ought to 
be split or if it should instead rely on if-conversion.


Jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-09-09 21:45                       ` Jeff Law
@ 2015-09-12 12:05                         ` Ajit Kumar Agarwal
  2015-10-20 16:05                           ` Ajit Kumar Agarwal
  2015-11-11  7:01                           ` Jeff Law
  0 siblings, 2 replies; 72+ messages in thread
From: Ajit Kumar Agarwal @ 2015-09-12 12:05 UTC (permalink / raw)
  To: Jeff Law, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala



-----Original Message-----
From: Jeff Law [mailto:law@redhat.com] 
Sent: Thursday, September 10, 2015 3:10 AM
To: Ajit Kumar Agarwal; Richard Biener
Cc: GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation

On 08/26/2015 11:29 PM, Ajit Kumar Agarwal wrote:
>
> Thanks. The following testcase testsuite/gcc.dg/tree-ssa/ifc-5.c
>
> void dct_unquantize_h263_inter_c (short *block, int n, int qscale, int 
> nCoeffs) { int i, level, qmul, qadd;
>
> qadd = (qscale - 1) | 1; qmul = qscale << 1;
>
> for (i = 0; i <= nCoeffs; i++) { level = block[i]; if (level < 0) 
> level = level * qmul - qadd; else level = level * qmul + qadd; 
> block[i] = level; } }
>
> The above Loop is a candidate of path splitting as the IF block merges 
> at the latch of the Loop and the path splitting duplicates The latch 
> of the loop which is the statement block[i] = level into the 
> predecessors THEN and ELSE block.
>
> Due to above path splitting,  the IF conversion is disabled and the 
> above IF-THEN-ELSE is not IF-converted and the test case fails.
>>So I think the question then becomes which of the two styles generally results in better code?  The path-split version or the older if-converted version.

>>If the latter, then this may suggest that we've got the path splitting code at the wrong stage in the optimizer pipeline or that we need better heuristics for >>when to avoid applying path splitting.

The code generated by Path Splitting is useful when it exposes DCE, PRE and CCP candidates, whereas IF-conversion is useful
when it exposes vectorization candidates. If if-conversion doesn't expose vectorization and path splitting doesn't
expose DCE or PRE redundancy candidates, it's hard to predict which is better. If if-conversion does not expose vectorization but
path splitting does expose DCE, PRE and CCP redundancy candidates, then path splitting is useful. Also, path splitting increases the granularity of the
THEN and ELSE paths, which makes for better register allocation and code scheduling.

The suggestion for keeping the path splitting later in the pipeline after the if-conversion and the vectorization is useful as it doesn't break the
Existing Deja GNU tests. Also useful to keep the path splitting later in the pipeline after the if-conversion and vectorization is that path splitting
Can always duplicate the merge node into its predecessor after the if-conversion and vectorization pass, if the if-conversion and vectorization
Is not applicable to the Loops. But this suppresses the CCP, PRE candidates which are earlier in the optimization pipeline.


>
> There were following review comments from the above patch.
>
> +/* This function performs the feasibility tests for path splitting
>> +   to perform. Return false if the feasibility for path splitting
>> +   is not done and returns true if the feasibility for path
>> splitting +   is done. Following feasibility tests are performed.
>> + +   1. Return false if the join block has rhs casting for assign
>> +      gimple statements.
>
> Comments from Jeff:
>
>>> These seem totally arbitrary.  What's the reason behind each of 
>>> these restrictions?  None should be a correctness requirement 
>>> AFAICT.
>
> In the above patch I have made a check given in point 1. in the loop 
> latch and the Path splitting is disabled and the IF-conversion happens 
> and the test case passes.
>>That sounds more like a work-around/hack.  There's nothing inherent with a type conversion that should disable path splitting.

I have sent the patch with this change and I will remove the above check from the patch.

>>What happens if we delay path splitting to a point after if-conversion is complete?

This is better suggestion as explained above, but gains achieved through path splitting by keeping earlier in the pipeline before if-conversion
, tree-vectorization, tree-vrp is suppressed if the following optimization after path splitting is not applicable for the above loops.

I have made the above changes and the existing set up doesn't break but the gains achieved in the benchmarks like rgbcmy_lite(EEMBC)
Benchmarks is suppressed. The path splitting for the above EEMBC benchmarks give gains of 9% and for such loops if-conversion and
Vectorization is not applicable  exposing gain with path splitting optimizations.

>>Alternately, could if-conversion export a routine which indicates if a particular sub-graph is likely to be if-convertable?  The path splitting pass could then use >>that routine to help determine if the path ought to be split or if it should instead rely on if-conversion.

Exporting the above routine from IF-conversion is not useful, as the heuristics used in IF-conversion populate the data dependence through
scalar evolution, which is triggered much later in the optimization pipeline. Populating such info at an earlier stage of the optimizations will not work, as the
data dependence through scalar evolution is triggered much later in the optimization pipeline. The structure of the basic block hierarchy required for if-conversion
looks similar to that of path splitting, and exporting such a function would disable path splitting for the cases where if-conversion is not applicable, as there
are more data dependency checks that distinguish if-conversion from path splitting.

Considering all the above, Keeping path splitting after if-conversion and vectorization looks better suggestion though this suppresses gains achieved 
Given above because of having path splitting after  tree-vrp, PRE, CCP. There is a dce path after the tree-vrp, if-conversion and tree-vectorize
So the path splitting exposing DCE will not be affected.

Please suggest.

Thanks & Regards
Ajit

Jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-09-12 12:05                         ` Ajit Kumar Agarwal
@ 2015-10-20 16:05                           ` Ajit Kumar Agarwal
  2015-11-11  7:01                           ` Jeff Law
  1 sibling, 0 replies; 72+ messages in thread
From: Ajit Kumar Agarwal @ 2015-10-20 16:05 UTC (permalink / raw)
  To: Jeff Law, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

Hello Jeff:

Did you get a chance to look at the below response. Please let me know your opinion on the below.

Thanks & Regards
Ajit

-----Original Message-----
From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-owner@gcc.gnu.org] On Behalf Of Ajit Kumar Agarwal
Sent: Saturday, September 12, 2015 4:09 PM
To: Jeff Law; Richard Biener
Cc: GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
Subject: RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation



-----Original Message-----
From: Jeff Law [mailto:law@redhat.com]
Sent: Thursday, September 10, 2015 3:10 AM
To: Ajit Kumar Agarwal; Richard Biener
Cc: GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation

On 08/26/2015 11:29 PM, Ajit Kumar Agarwal wrote:
>
> Thanks. The following testcase testsuite/gcc.dg/tree-ssa/ifc-5.c
>
> void dct_unquantize_h263_inter_c (short *block, int n, int qscale, int
> nCoeffs) { int i, level, qmul, qadd;
>
> qadd = (qscale - 1) | 1; qmul = qscale << 1;
>
> for (i = 0; i <= nCoeffs; i++) { level = block[i]; if (level < 0) 
> level = level * qmul - qadd; else level = level * qmul + qadd; 
> block[i] = level; } }
>
> The above Loop is a candidate of path splitting as the IF block merges 
> at the latch of the Loop and the path splitting duplicates The latch 
> of the loop which is the statement block[i] = level into the 
> predecessors THEN and ELSE block.
>
> Due to above path splitting,  the IF conversion is disabled and the 
> above IF-THEN-ELSE is not IF-converted and the test case fails.
>>So I think the question then becomes which of the two styles generally results in better code?  The path-split version or the older if-converted version.

>>If the latter, then this may suggest that we've got the path splitting code at the wrong stage in the optimizer pipeline or that we need better heuristics for >>when to avoid applying path splitting.

The code generated by the Path Splitting is useful when it exposes the DCE, PRE,CCP candidates. Whereas the IF-conversion is useful When the if-conversion exposes the vectorization candidates. If the  if-conversion doesn't exposes the vectorization and the path splitting doesn't Exposes the DCE, PRE redundancy candidates, it's hard to predict. If the if-conversion does not exposes the vectorization and in the similar case Path splitting exposes the DCE , PRE  and CCP redundancy candidates then path splitting is useful. Also the path splitting increases the granularity of the THEN and ELSE path makes better register allocation and code scheduling.

The suggestion for keeping the path splitting later in the pipeline after the if-conversion and the vectorization is useful as it doesn't break the Existing Deja GNU tests. Also useful to keep the path splitting later in the pipeline after the if-conversion and vectorization is that path splitting Can always duplicate the merge node into its predecessor after the if-conversion and vectorization pass, if the if-conversion and vectorization Is not applicable to the Loops. But this suppresses the CCP, PRE candidates which are earlier in the optimization pipeline.


>
> There were following review comments from the above patch.
>
> +/* This function performs the feasibility tests for path splitting
>> +   to perform. Return false if the feasibility for path splitting
>> +   is not done and returns true if the feasibility for path
>> splitting +   is done. Following feasibility tests are performed.
>> + +   1. Return false if the join block has rhs casting for assign
>> +      gimple statements.
>
> Comments from Jeff:
>
>>> These seem totally arbitrary.  What's the reason behind each of 
>>> these restrictions?  None should be a correctness requirement 
>>> AFAICT.
>
> In the above patch I have made a check given in point 1. in the loop 
> latch and the Path splitting is disabled and the IF-conversion happens 
> and the test case passes.
>>That sounds more like a work-around/hack.  There's nothing inherent with a type conversion that should disable path splitting.

I have sent the patch with this change and I will remove the above check from the patch.

>>What happens if we delay path splitting to a point after if-conversion is complete?

This is better suggestion as explained above, but gains achieved through path splitting by keeping earlier in the pipeline before if-conversion , tree-vectorization, tree-vrp is suppressed if the following optimization after path splitting is not applicable for the above loops.

I have made the above changes and the existing set up doesn't break but the gains achieved in the benchmarks like rgbcmy_lite(EEMBC) Benchmarks is suppressed. The path splitting for the above EEMBC benchmarks give gains of 9% and for such loops if-conversion and Vectorization is not applicable  exposing gain with path splitting optimizations.

>>Alternately, could if-conversion export a routine which indicates if a particular sub-graph is likely to be if-convertable?  The path splitting pass could then use >>that routine to help determine if the path ought to be split or if it should instead rely on if-conversion.

Exporting the above routine from IF-conversion is not useful as the heuristics used in IF-conversion populates the Data Dependence through Scalar evolution which is trigger much later in the optimization pipeline. Populating such info in the earlier stage of the optimizations will not work as the Data Dependence through scalar evolution is trigger much later in the optimization  pipeline. Structure of basic block hierarchy required for if-conversion Looks similar to path splitting and exporting such function disables the path splitting for the cases where if-conversion is not applicable as there Are more data dependency check that distinguishes the if-conversion and path splitting.

Considering all the above, Keeping path splitting after if-conversion and vectorization looks better suggestion though this suppresses gains achieved Given above because of having path splitting after  tree-vrp, PRE, CCP. There is a dce path after the tree-vrp, if-conversion and tree-vectorize So the path splitting exposing DCE will not be affected.

Please suggest.

Thanks & Regards
Ajit

Jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-09-12 12:05                         ` Ajit Kumar Agarwal
  2015-10-20 16:05                           ` Ajit Kumar Agarwal
@ 2015-11-11  7:01                           ` Jeff Law
  1 sibling, 0 replies; 72+ messages in thread
From: Jeff Law @ 2015-11-11  7:01 UTC (permalink / raw)
  To: Ajit Kumar Agarwal, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

On 09/12/2015 04:38 AM, Ajit Kumar Agarwal wrote:
>
>
> -----Original Message----- From: Jeff Law [mailto:law@redhat.com]
> Sent: Thursday, September 10, 2015 3:10 AM To: Ajit Kumar Agarwal;
> Richard Biener Cc: GCC Patches; Vinod Kathail; Shail Aditya Gupta;
> Vidhumouli Hunsigida; Nagaraju Mekala Subject: Re:
> [Patch,tree-optimization]: Add new path Splitting pass on tree ssa
> representation
>
> On 08/26/2015 11:29 PM, Ajit Kumar Agarwal wrote:
>>
>> Thanks. The following testcase testsuite/gcc.dg/tree-ssa/ifc-5.c
>>
>> void dct_unquantize_h263_inter_c (short *block, int n, int qscale,
>> int nCoeffs) { int i, level, qmul, qadd;
>>
>> qadd = (qscale - 1) | 1; qmul = qscale << 1;
>>
>> for (i = 0; i <= nCoeffs; i++) { level = block[i]; if (level < 0)
>> level = level * qmul - qadd; else level = level * qmul + qadd;
>> block[i] = level; } }
>>
>> The above Loop is a candidate of path splitting as the IF block
>> merges at the latch of the Loop and the path splitting duplicates
>> The latch of the loop which is the statement block[i] = level into
>> the predecessors THEN and ELSE block.
So coming back to this patch -- which IMHO is a candidate for including 
given it was posted well before stage1 close.

I wonder if as an initial heuristic, we could avoid path splitting when 
the THEN/ELSE blocks are relatively small and have a well defined 
structure.  If we look at ifc-5.c we have this in the ivcanon dumps for 
the two key blocks:

;;   basic block 6, loop depth 1, count 0, freq 2457, maybe hot
;;    prev block 5, next block 7, flags: (NEW, REACHABLE)
;;    pred:       5 [27.0%]  (TRUE_VALUE,EXECUTABLE)
   _16 = qmul_7 * level_15;
   level_17 = _16 - qadd_6;
   goto <bb 8>;
;;    succ:       8 [100.0%]  (FALLTHRU,EXECUTABLE)

;;   basic block 7, loop depth 1, count 0, freq 6643, maybe hot
;;    prev block 6, next block 8, flags: (NEW, REACHABLE)
;;    pred:       5 [73.0%]  (FALSE_VALUE,EXECUTABLE)
   _18 = qmul_7 * level_15;
   level_19 = qadd_6 + _18;
;;    succ:       8 [100.0%]  (FALLTHRU,EXECUTABLE)


There's a lot of commonality there.

Alternately, we could look at those blocks and the merge point;


;;   basic block 8, loop depth 1, count 0, freq 9100, maybe hot
;;    prev block 7, next block 9, flags: (NEW, REACHABLE)
;;    pred:       6 [100.0%]  (FALLTHRU,EXECUTABLE)
;;                7 [100.0%]  (FALLTHRU,EXECUTABLE)
   # level_2 = PHI <level_17(6), level_19(7)>
   _20 = (short int) level_2;
   *_13 = _20;
   i_22 = i_24 + 1;
   if (nCoeffs_9(D) >= i_22)
     goto <bb 10>;
   else
     goto <bb 9>;
;;    succ:       10 [91.0%]  (TRUE_VALUE,EXECUTABLE)


And make a guess that there's not going to be any DCE/CSE opportunities 
if we path split.  It might help to see some sample codes where path 
splitting helps to see if there's patterns in the code that we can look for.



>>
>> Due to above path splitting,  the IF conversion is disabled and
>> the above IF-THEN-ELSE is not IF-converted and the test case
>> fails.
>>> So I think the question then becomes which of the two styles
>>> generally results in better code?  The path-split version or the
>>> older if-converted version.
>
>>> If the latter, then this may suggest that we've got the path
>>> splitting code at the wrong stage in the optimizer pipeline or
>>> that we need better heuristics for >>when to avoid applying path
>>> splitting.
>
> The code generated by the Path Splitting is useful when it exposes
> the DCE, PRE,CCP candidates. Whereas the IF-conversion is useful When
> the if-conversion exposes the vectorization candidates.
So perhaps two path splitting passes.  One early in the pipeline that 
runs when we're not vectorizing, and one late that runs if we are 
vectorizing.

I don't generally like conditionalizing passes like that, but it might 
be a reasonable way to go in this case if we can't come up with a good 
heuristic around when to split paths or leave them alone.

Thoughts?

>
> The suggestion for keeping the path splitting later in the pipeline
> after the if-conversion and the vectorization is useful as it doesn't
> break the Existing Deja GNU tests. Also useful to keep the path
> splitting later in the pipeline after the if-conversion and
> vectorization is that path splitting Can always duplicate the merge
> node into its predecessor after the if-conversion and vectorization
> pass, if the if-conversion and vectorization Is not applicable to the
> Loops. But this suppresses the CCP, PRE candidates which are earlier
> in the optimization pipeline.
Just to be clear, I'm less concerned about specific tests in the dejagnu 
suite than I am overall performance.

Anyway, I'm going to drop the patch into my local tree and try to get it 
updated to work with the trunk so I can poke around a bit.



Jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-09-04 18:07                 ` Ajit Kumar Agarwal
@ 2015-11-11 20:38                   ` Jeff Law
  2015-11-12 10:58                     ` Richard Biener
  0 siblings, 1 reply; 72+ messages in thread
From: Jeff Law @ 2015-11-11 20:38 UTC (permalink / raw)
  To: Ajit Kumar Agarwal, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

[-- Attachment #1: Type: text/plain, Size: 4977 bytes --]

On 09/04/2015 11:36 AM, Ajit Kumar Agarwal wrote:

>> diff --git a/gcc/passes.def b/gcc/passes.def
>> index 6b66f8f..20ddf3d 100644
>> --- a/gcc/passes.def
>> +++ b/gcc/passes.def
>> @@ -82,6 +82,7 @@ along with GCC; see the file COPYING3.  If not see
>>    	  NEXT_PASS (pass_ccp);
>>    	  /* After CCP we rewrite no longer addressed locals into SSA
>>    	     form if possible.  */
>> +          NEXT_PASS (pass_path_split);
>>    	  NEXT_PASS (pass_forwprop);
>>    	  NEXT_PASS (pass_sra_early);
> I can't recall if we've discussed the location of the pass at all.  I'm
> not objecting to this location, but would like to hear why you chose
> this particular location in the optimization pipeline.
So returning to the question of where this would live in the 
optimization pipeline and how it interacts with if-conversion and 
vectorization.

The concern with moving it to late in the pipeline was that we'd miss 
VRP/DCE/CSE opportunities.  I'm not sure if you're aware, but we 
actually run those passes more than once.  So it would be possible to 
run path splitting after if-conversion & vectorization, but before the 
second pass of VRP & DOM.  But trying that seems to result in something 
scrambling the loop enough that the path splitting opportunity is 
missed.  That might be worth deeper investigation if we can't come up 
with some kind of heuristics to fire or suppress path splitting.

Other random notes as I look over the code:

Call the pass "path-split", not "path_split".  I don't think we have any 
passes with underscores in their names, dump files, etc.

You factored out the code for transform_duplicate.  When you create new 
functions, they should all have a block comment indicating what they do, 
return values, etc.

I asked you to trim down the #includes in tree-ssa-path-split.c.  Most 
were ultimately unnecessary.  The trimmed list is just 11 headers.

Various functions in tree-ssa-path-split.c were missing their block 
comments.  There were several places in tree-ssa-path-split that I felt 
deserved a comment.  While you are familiar with the code, it's likely 
someone else will have to look at and modify this code at some point in 
the future.  The comments help make that easier.

In find_trace_loop_latch_same_as_join_blk, we find the immediate 
dominator of the latch and verify it ends in a conditional.  That's 
fine.  Then we look at the predecessors of the latch to see if one is 
succeeded only by the latch and falls through to the latch.  That is the 
block we'll end up redirecting to a copy of the latch.  Also fine.

Note how there is no testing for the relationship between the immediate 
dominator of the latch and the predecessors of the latch.  ISTM that we 
can have a fairly arbitrary region in the THEN/ELSE arms of the 
conditional.  Was this intentional?  Would it be advisable to verify 
that the THEN/ELSE arms are single blocks?  Do we want to verify that 
neither the THEN/ELSE arms transfer control other than to the latch?  Do 
we want to verify the predecessors of the latch are immediate successors 
of the latch's immediate dominator?

The is_feasible_trace routine was still checking if the block had a 
conversion and rejecting it.  I removed that check.  It does seem to me 
that we need an upper limit on the number of statements.  I wonder if we 
should factor out the maximum statements to copy code from jump 
threading and use it for both jump threading and path splitting.

Instead of creating a loop with multiple latches, whatever happened to 
the idea of duplicating the latch block twice -- once into each path? 
Remove the control statement in each duplicate.  Then remove everything 
but the control statement in the original latch.


I added some direct dump support.  Essentially anytime we split the 
path, we output something like this:

Split path in loop: latch block 9, predecessor 7.

That allows tests in the testsuite to look for the "Split path in loop" 
string rather than inferring the information from the SSA graph update's 
replacement table.  It also allows us to do things like count how many 
paths get split if we have more complex tests.

On the topic of tests.  Is the one you provided something where path 
splitting results in a significant improvement?  From looking at the 
x86_64 output, I can see the path splitting transformation occur, but 
not any improvement in the final code.

While the existing test is useful, testing on code that actually 
improves as a result of path splitting is better.  Ideally we'd test 
both that path splitting occurred and that the secondary optimizations 
we wanted triggered.

The tests should go into gcc.dg/tree-ssa rather than just gcc.dg.

Anyway, here's my work-in-progress.  Your thoughts on the various 
questions, concerns, ideas noted above would be appreciated.  Obviously 
I'd like to wrap things up quickly and include this patch in gcc6.

Note, I haven't bootstrapped or regression tested this version.







[-- Attachment #2: P --]
[-- Type: text/plain, Size: 17254 bytes --]

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 34d2356..6613e83 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1474,6 +1474,7 @@ OBJS = \
 	tree-ssa-loop.o \
 	tree-ssa-math-opts.o \
 	tree-ssa-operands.o \
+	tree-ssa-path-split.o \
 	tree-ssa-phionlycprop.o \
 	tree-ssa-phiopt.o \
 	tree-ssa-phiprop.o \
diff --git a/gcc/common.opt b/gcc/common.opt
index 757ce85..9cc94e2 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -2403,6 +2403,10 @@ ftree-vrp
 Common Report Var(flag_tree_vrp) Init(0) Optimization
 Perform Value Range Propagation on trees.
 
+ftree-path-split
+Common Report Var(flag_tree_path_split) Init(0) Optimization
+Perform Path Splitting on trees for loop backedges
+
 funit-at-a-time
 Common Report Var(flag_unit_at_a_time) Init(1)
 Compile whole compilation unit at a time.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 213a9d0..b1e95da 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -354,6 +354,7 @@ Objective-C and Objective-C++ Dialects}.
 -fdump-tree-fre@r{[}-@var{n}@r{]} @gol
 -fdump-tree-vtable-verify @gol
 -fdump-tree-vrp@r{[}-@var{n}@r{]} @gol
+-fdump-tree-path-split@r{[}-@var{n}@r{]} @gol
 -fdump-tree-storeccp@r{[}-@var{n}@r{]} @gol
 -fdump-final-insns=@var{file} @gol
 -fcompare-debug@r{[}=@var{opts}@r{]}  -fcompare-debug-second @gol
@@ -462,7 +463,7 @@ Objective-C and Objective-C++ Dialects}.
 -ftree-parallelize-loops=@var{n} -ftree-pre -ftree-partial-pre -ftree-pta @gol
 -ftree-reassoc -ftree-sink -ftree-slsr -ftree-sra @gol
 -ftree-switch-conversion -ftree-tail-merge -ftree-ter @gol
--ftree-vectorize -ftree-vrp @gol
+-ftree-vectorize -ftree-vrp @gol -ftree-path-split @gol
 -funit-at-a-time -funroll-all-loops -funroll-loops @gol
 -funsafe-loop-optimizations -funsafe-math-optimizations -funswitch-loops @gol
 -fipa-ra -fvariable-expansion-in-unroller -fvect-cost-model -fvpt @gol
@@ -7169,6 +7170,11 @@ output on to @file{stderr}. If two conflicting dump filenames are
 given for the same pass, then the latter option overrides the earlier
 one.
 
+@item path-split
+@opindex fdump-tree-path-split
+Dump each function after path splitting.  The file name is made by
+appending @file{.path-split} to the source file name
+
 @item all
 Turn on all options, except @option{raw}, @option{slim}, @option{verbose}
 and @option{lineno}.
@@ -7811,6 +7817,7 @@ also turns on the following optimization flags:
 -ftree-switch-conversion -ftree-tail-merge @gol
 -ftree-pre @gol
 -ftree-vrp @gol
+-ftree-path-split @gol
 -fipa-ra}
 
 Please note the warning under @option{-fgcse} about
@@ -8819,7 +8826,7 @@ currently enabled, but may be enabled by @option{-O2} in the future.
 
 @item -ftree-sink
 @opindex ftree-sink
-Perform forward store motion  on trees.  This flag is
+Perform forward store motion on trees.  This flag is
 enabled by default at @option{-O} and higher.
 
 @item -ftree-bit-ccp
@@ -9125,6 +9132,13 @@ enabled by default at @option{-O2} and higher.  Null pointer check
 elimination is only done if @option{-fdelete-null-pointer-checks} is
 enabled.
 
+@item -ftree-path-split
+@opindex ftree-path-split
+Perform Path Splitting on trees.  When the two execution paths of an
+if-then-else merge at the loop latch node, try to duplicate the
+merge node into two paths. This is enabled by default at @option{-O2}
+and above.
+
 @item -fsplit-ivs-in-unroller
 @opindex fsplit-ivs-in-unroller
 Enables expression of values of induction variables in later iterations
diff --git a/gcc/opts.c b/gcc/opts.c
index 9a3fbb3..9a0b27c 100644
--- a/gcc/opts.c
+++ b/gcc/opts.c
@@ -509,6 +509,7 @@ static const struct default_options default_options_table[] =
     { OPT_LEVELS_2_PLUS, OPT_fisolate_erroneous_paths_dereference, NULL, 1 },
     { OPT_LEVELS_2_PLUS, OPT_fipa_ra, NULL, 1 },
     { OPT_LEVELS_2_PLUS, OPT_flra_remat, NULL, 1 },
+    { OPT_LEVELS_2_PLUS, OPT_ftree_path_split, NULL, 1 },
 
     /* -O3 optimizations.  */
     { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 },
diff --git a/gcc/passes.def b/gcc/passes.def
index c0ab6b9..e0c1cd8 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -79,6 +79,7 @@ along with GCC; see the file COPYING3.  If not see
 	  NEXT_PASS (pass_remove_cgraph_callee_edges);
 	  NEXT_PASS (pass_object_sizes);
 	  NEXT_PASS (pass_ccp);
+	  NEXT_PASS (pass_path_split);
 	  /* After CCP we rewrite no longer addressed locals into SSA
 	     form if possible.  */
 	  NEXT_PASS (pass_forwprop);
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/path-split-1.c b/gcc/testsuite/gcc.dg/tree-ssa/path-split-1.c
new file mode 100644
index 0000000..4b8637b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/path-split-1.c
@@ -0,0 +1,67 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fdump-tree-path-split-details " } */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define RGBMAX 255
+
+int
+test()
+{
+  int i, Pels;
+  unsigned char sum = 0;
+  unsigned char xr, xg, xb;
+  unsigned char xc, xm, xy, xk;
+  unsigned char *ReadPtr, *EritePtr;
+
+  ReadPtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
+  EritePtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
+
+  for (i = 0; i < 100;i++)
+     {
+       ReadPtr[i] = 100 - i;
+     }
+
+  for (i = 0; i < 100; i++)
+     {
+       xr = *ReadPtr++;
+       xg = *ReadPtr++;
+       xb = *ReadPtr++;
+
+       xc = (unsigned char) (RGBMAX - xr);
+       xm = (unsigned char) (RGBMAX - xg);
+       xy = (unsigned char) (RGBMAX - xb);
+
+       if (xc < xm)
+         {
+           xk = (unsigned char) (xc < xy ? xc : xy);
+         }
+       else
+        {
+          xk = (unsigned char) (xm < xy ? xm : xy);
+        }
+
+       xc = (unsigned char) (xc - xk);
+       xm = (unsigned char) (xm - xk);
+       xy = (unsigned char) (xy - xk);
+
+       *EritePtr++ = xc;
+       *EritePtr++ = xm;
+       *EritePtr++ = xy;
+       *EritePtr++ = xk;
+       sum += *EritePtr;
+    }
+  return sum;
+}
+
+int
+main()
+{
+  if (test() != 33)
+    abort();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "Split path in loop:" "path-split" } } */
diff --git a/gcc/timevar.def b/gcc/timevar.def
index b429faf..3dba68b 100644
--- a/gcc/timevar.def
+++ b/gcc/timevar.def
@@ -300,3 +300,4 @@ DEFTIMEVAR (TV_LINK		     , "link JIT code")
 DEFTIMEVAR (TV_LOAD		     , "load JIT result")
 DEFTIMEVAR (TV_JIT_ACQUIRING_MUTEX   , "acquiring JIT mutex")
 DEFTIMEVAR (TV_JIT_CLIENT_CODE   , "JIT client code")
+DEFTIMEVAR (TV_TREE_PATH_SPLIT  , "tree path split")
diff --git a/gcc/tracer.c b/gcc/tracer.c
index 941dc20..c7b5150 100644
--- a/gcc/tracer.c
+++ b/gcc/tracer.c
@@ -51,9 +51,9 @@
 #include "tree-inline.h"
 #include "cfgloop.h"
 #include "fibonacci_heap.h"
+#include "tracer.h"
 
 static int count_insns (basic_block);
-static bool ignore_bb_p (const_basic_block);
 static bool better_p (const_edge, const_edge);
 static edge find_best_successor (basic_block);
 static edge find_best_predecessor (basic_block);
@@ -85,7 +85,7 @@ bb_seen_p (basic_block bb)
 }
 
 /* Return true if we should ignore the basic block for purposes of tracing.  */
-static bool
+bool
 ignore_bb_p (const_basic_block bb)
 {
   if (bb->index < NUM_FIXED_BLOCKS)
@@ -226,6 +226,24 @@ find_trace (basic_block bb, basic_block *trace)
   return i;
 }
 
+/* Duplicate block BB2, placing it after BB in the CFG.  Return the
+   newly created block.  */
+basic_block
+transform_duplicate (basic_block bb, basic_block bb2)
+{
+  edge e;
+  basic_block copy;
+
+  e = find_edge (bb, bb2);
+
+  copy = duplicate_block (bb2, e, bb);
+  flush_pending_stmts (e);
+
+  add_phi_args_after_copy (&copy, 1, NULL);
+
+  return (copy);
+}
+
 /* Look for basic blocks in frequency order, construct traces and tail duplicate
    if profitable.  */
 
@@ -321,17 +339,8 @@ tail_duplicate (void)
 		 entries or at least rotate the loop.  */
 	      && bb2->loop_father->header != bb2)
 	    {
-	      edge e;
-	      basic_block copy;
-
-	      nduplicated += counts [bb2->index];
-
-	      e = find_edge (bb, bb2);
-
-	      copy = duplicate_block (bb2, e, bb);
-	      flush_pending_stmts (e);
-
-	      add_phi_args_after_copy (&copy, 1, NULL);
+              nduplicated += counts [bb2->index];
+              basic_block copy = transform_duplicate (bb, bb2);
 
 	      /* Reconsider the original copy of block we've duplicated.
 	         Removing the most common predecessor may make it to be
diff --git a/gcc/tracer.h b/gcc/tracer.h
new file mode 100644
index 0000000..454d3b7
--- /dev/null
+++ b/gcc/tracer.h
@@ -0,0 +1,26 @@
+/* Header file for Tracer.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_TRACER_H
+#define GCC_TRACER_H
+
+extern basic_block transform_duplicate (basic_block bb, basic_block bb2);
+extern bool ignore_bb_p (const_basic_block bb);
+
+#endif /* GCC_TRACER_H */
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 49e22a9..6963acc 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -390,6 +390,7 @@ extern gimple_opt_pass *make_pass_tree_loop_done (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_ch (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_ch_vect (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_ccp (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_path_split (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_phi_only_cprop (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_build_ssa (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_build_alias (gcc::context *ctxt);
diff --git a/gcc/tree-ssa-path-split.c b/gcc/tree-ssa-path-split.c
new file mode 100644
index 0000000..33dbc4d
--- /dev/null
+++ b/gcc/tree-ssa-path-split.c
@@ -0,0 +1,254 @@
+/* Support routines for Path Splitting.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   Contributed by Ajit Kumar Agarwal <ajitkum@xilinx.com>.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "tree.h"
+#include "gimple.h"
+#include "tree-pass.h"
+#include "cfganal.h"
+#include "cfgloop.h"
+#include "gimple-iterator.h"
+#include "tracer.h"
+
+/* Given LOOP, if its latch has an immediate dominator that ends
+   in a simple conditional, then search for a block that is a
+   predecessor of the latch that falls through to the latch and
+   has no other successors.
+
+   When found, put that block into TRACE[0] and the latch block into
+   TRACE[1].  Otherwise do nothing.  */
+
+static void
+find_trace_loop_latch_same_as_join_blk (loop_p loop, basic_block *trace)
+{
+  basic_block latch = loop->latch;
+
+  /* We don't support path splitting if the latch has more than two
+     predecessors.  */
+  if (EDGE_COUNT (latch->preds) == 2)
+    {
+      basic_block bb = get_immediate_dominator (CDI_DOMINATORS, latch);
+      gimple *last = gsi_stmt (gsi_last_bb (bb));
+
+      /* The immediate dominator of the latch must end in a conditional.  */
+      if (last && gimple_code (last) != GIMPLE_COND)
+	return;
+
+      /* This looks for a predecessor of the latch which has a single
+	 fallthru successor (that is the latch).  Only one of the blocks
+	 can match that pattern. 
+
+	 Do we need to check that E->src is a successor of BB?  */
+      edge_iterator ei;
+      edge e;
+      FOR_EACH_EDGE (e, ei, latch->preds)
+	{
+	  if (!single_succ_p (e->src)
+	      || !(single_succ_edge (e->src)->flags & EDGE_FALLTHRU))
+	    break;
+	  else
+	    {
+	      trace[0] = e->src;
+	      trace[1] = latch;
+	      break;
+	    }
+	}
+    }
+}
+
+/* Return TRUE if BB is a reasonable block to duplicate by examining
+   its size, false otherwise.  BB will always be a loop latch block.
+
+   Should this use the same tests as we do for jump threading?  */
+
+static bool
+is_feasible_trace (basic_block bb)
+{
+  int num_stmt = 0;
+  gimple_stmt_iterator gsi;
+
+  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gimple *stmt = gsi_stmt (gsi);
+      if (!is_gimple_debug (stmt))
+	num_stmt++;
+    }
+
+  /* We may want to limit how many statements we copy.  */
+  if (num_stmt > 1)
+    return true;
+
+  return false;
+}
+
+/* If the immediate dominator of the latch of the loop is
+   block with conditional branch, then the loop latch  is
+   duplicated to its predecessors path preserving the SSA
+   semantics.
+
+   CFG before transformation.
+
+   <bb 6>:
+      xk_35 = MIN_EXPR <xy_34, xc_32>;
+      goto <bb 8>;
+
+   <bb 7>:
+      xk_36 = MIN_EXPR <xy_34, xm_33>;
+
+   <bb 8>:
+      # xk_4 = PHI <xk_35(6), xk_36(7)>
+      xc_37 = xc_32 - xk_4;
+      xm_38 = xm_33 - xk_4;
+      xy_39 = xy_34 - xk_4;
+
+   CFG After Path Splitting transformation
+   before cleanup phase.
+
+   <bb 7>:
+     xk_35 = MIN_EXPR <xy_34, xc_32>;
+
+   <bb 8>:
+     # xk_29 = PHI <xk_35(7)>
+     xc_56 = xc_32 - xk_29;
+     xm_57 = xm_33 - xk_29;
+     xy_58 = xy_34 - xk_29;
+     goto <bb 11>;
+
+   <bb 9>:
+     xk_36 = MIN_EXPR <xy_34, xm_33>;
+
+   <bb 10>:
+     # xk_4 = PHI <xk_36(9)>
+     xc_37 = xc_32 - xk_4;
+     xm_38 = xm_33 - xk_4;
+     xy_39 = xy_34 - xk_4;
+
+  <bb 11>: .......  */
+
+static bool
+perform_path_splitting ()
+{
+  bool changed = false;
+  basic_block trace[2] = {NULL, NULL};
+  loop_p loop;
+
+  loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS);
+  initialize_original_copy_tables ();
+  calculate_dominance_info (CDI_DOMINATORS);
+
+  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
+    {
+      /* If we're optimizing for size, or should not duplicate
+	 the latch block for some other reason, then ignore this
+	 loop.  */
+      if (ignore_bb_p (loop->latch))
+	continue;
+
+      /* Get the latch's predecessor and the latch node, storing them into
+	 trace[0] and trace[1] respectively.  */
+      find_trace_loop_latch_same_as_join_blk (loop, trace);
+
+      if (trace[0] && trace[1] && is_feasible_trace (trace[1]))
+	{
+          if (dump_file && (dump_flags & TDF_DETAILS)) 
+            fprintf (dump_file,
+		     "Split path in loop: latch block %d, predecessor %d.\n",
+		     trace[1]->index, trace[0]->index);
+	  transform_duplicate (trace[0], trace[1]);
+	  trace[0] = NULL;
+	  trace[1] = NULL;
+	  changed = true;
+	}
+    }
+
+  loop_optimizer_finalize ();
+  free_original_copy_tables ();
+  return changed;
+}
+
+/* Main entry point for path splitting.  Returns TODO_cleanup_cfg if any
+   paths were split, otherwise returns zero.  */
+
+static unsigned int
+execute_path_split (void)
+{
+  /* If we don't have at least 2 real blocks and backedges in the
+     CFG, then there's no point in trying to perform path splitting.  */
+  if (n_basic_blocks_for_fn (cfun) <= NUM_FIXED_BLOCKS + 1
+      || !mark_dfs_back_edges ())
+    return 0;
+
+  bool changed = perform_path_splitting();
+  if (changed)
+    {
+      free_dominance_info (CDI_DOMINATORS);
+      /* If we changed the CFG schedule loops for fixup by cleanup_cfg.  */
+      if (current_loops)
+	loops_state_set (LOOPS_NEED_FIXUP);
+    }
+
+  return changed ? TODO_cleanup_cfg : 0;
+
+}
+
+static bool
+gate_path_split(void)
+{
+  return flag_tree_path_split != 0;
+}
+
+namespace {
+
+const pass_data pass_data_path_split =
+{
+  GIMPLE_PASS, /* type */
+  "path-split", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_TREE_PATH_SPLIT, /* tv_id */
+  PROP_ssa, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_update_ssa, /* todo_flags_finish */
+};
+
+class pass_path_split : public gimple_opt_pass
+{
+   public:
+    pass_path_split (gcc::context *ctxt)
+      : gimple_opt_pass (pass_data_path_split, ctxt)
+    {}
+   /* opt_pass methods: */
+   opt_pass * clone () { return new pass_path_split (m_ctxt); }
+   virtual bool gate (function *) { return gate_path_split (); }
+   virtual unsigned int execute (function *) { return execute_path_split (); }
+
+}; // class pass_path_split
+
+} // anon namespace
+
+gimple_opt_pass *
+make_pass_path_split (gcc::context *ctxt)
+{
+  return new pass_path_split (ctxt);
+}

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-11-11 20:38                   ` Jeff Law
@ 2015-11-12 10:58                     ` Richard Biener
  2015-11-12 17:05                       ` Jeff Law
  0 siblings, 1 reply; 72+ messages in thread
From: Richard Biener @ 2015-11-12 10:58 UTC (permalink / raw)
  To: Jeff Law
  Cc: Ajit Kumar Agarwal, GCC Patches, Vinod Kathail,
	Shail Aditya Gupta, Vidhumouli Hunsigida, Nagaraju Mekala

On Wed, Nov 11, 2015 at 9:38 PM, Jeff Law <law@redhat.com> wrote:
> On 09/04/2015 11:36 AM, Ajit Kumar Agarwal wrote:
>
>>> diff --git a/gcc/passes.def b/gcc/passes.def
>>> index 6b66f8f..20ddf3d 100644
>>> --- a/gcc/passes.def
>>> +++ b/gcc/passes.def
>>> @@ -82,6 +82,7 @@ along with GCC; see the file COPYING3.  If not see
>>>           NEXT_PASS (pass_ccp);
>>>           /* After CCP we rewrite no longer addressed locals into SSA
>>>              form if possible.  */
>>> +          NEXT_PASS (pass_path_split);
>>>           NEXT_PASS (pass_forwprop);
>>>           NEXT_PASS (pass_sra_early);
>>
>> I can't recall if we've discussed the location of the pass at all.  I'm
>> not objecting to this location, but would like to hear why you chose
>> this particular location in the optimization pipeline.
>
> So returning to the question of where this would live in the optimization
> pipeline and how it interacts with if-conversion and vectorization.

Note that adding passes to the early pipeline that do code duplication
is a no-no.
The early pipeline should be exclusively for things making functions
more suitable
for inlining.

> The concern with moving it to late in the pipeline was that we'd miss
> VRP/DCE/CSE opportunities.  I'm not sure if you're aware, but we actually
> run those passes more than once.  So it would be possible to run path
> splitting after if-conversion & vectorization, but before the second pass
> of VRP & DOM.  But trying that seems to result in something scrambling the
> loop enough that the path splitting opportunity is missed.  That might be
> worth deeper investigation if we can't come up with some kind of heuristics
> to fire or suppress path splitting.

As I still think it is a transform similar to tracer just put it next to that.

But IIRC you mentioned it should enable vectorization or so?  In this case
that's obviously too late.

Richard.

> Other random notes as I look over the code:
>
> Call the pass "path-split", not "path_split".  I don't think we have any
> passes with underscores in their names, dump files, etc.
>
> You factored out the code for transform_duplicate.  When you create new
> functions, they should all have a block comment indicating what they do,
> return values, etc.
>
> I asked you to trim down the #includes in tree-ssa-path-split.c  Most were
> ultimately unnecessary.  The trimmed list is just 11 headers.
>
> Various functions in tree-ssa-path-split.c were missing their block
> comments.  There were several places in tree-ssa-path-split that I felt
> deserved a comment.  While you are familiar with the code, it's likely
> someone else will have to look at and modify this code at some point in the
> future.  The comments help make that easier.
>
> In find_trace_loop_latch_same_as_join_blk, we find the immediate dominator
> of the latch and verify it ends in a conditional.  That's fine.  Then we
> look at the predecessors of the latch to see if one is succeeded only by the
> latch and falls through to the latch.  That is the block we'll end up
> redirecting to a copy of the latch.  Also fine.
>
> Note how there is no testing for the relationship between the immediate
> dominator of the latch and the predecessors of the latch.  ISTM that we can
> have a fairly arbitrary region in the THEN/ELSE arms of the conditional.
> Was this intentional?  Would it be advisable to verify that the THEN/ELSE
> arms are single blocks?  Do we want to verify that neither the THEN/ELSE
> arms transfer control other than to the latch?  Do we want to verify the
> predecessors of the latch are immediate successors of the latch's immediate
> dominator?
>
> The is_feasible_trace routine was still checking if the block had a
> conversion and rejecting it.  I removed that check.  It does seem to me that
> we need an upper limit on the number of statements.  I wonder if we should
> factor out the maximum statements to copy code from jump threading and use
> it for both jump threading and path splitting.
>
> Instead of creating loop with multiple latches, what ever happened to the
> idea of duplicating the latch block twice -- once into each path. Remove the
> control statement in each duplicate.  Then remove everything but the control
> statement in the original latch.
>
>
> I added some direct dump support.  Essentially anytime we split the path, we
> output something like this:
>
> Split path in loop: latch block 9, predecessor 7.
>
> That allows tests in the testsuite to look for the "Split path in loop"
> string rather than inferring the information from the SSA graph update's
> replacement table.  It also allows us to do things like count how many paths
> get split if we have more complex tests.
>
> On the topic of tests.  Is the one you provided something where path
> splitting results in a significant improvement?  From looking at the x86_64
> output, I can see the path splitting transformation occur, but not any
> improvement in the final code.
>
> While the existing test is useful, testing on code that actually improves as
> a result of path splitting is better.  Ideally we'd test both that path
> splitting occurred and that the secondary optimizations we wanted triggered.
>
> The tests should go into gcc.dg/tree-ssa rather than just gcc.dg.
>
> Anyway, here's my work-in-progress.  Your thoughts on the various questions,
> concerns, ideas noted above would be appreciated.  Obviously I'd like to
> wrap things up quickly and include this patch in gcc6.
>
> Note, I haven't bootstrapped or regression tested this version.
>
>
>
>
>
>
>
> diff --git a/gcc/Makefile.in b/gcc/Makefile.in
> index 34d2356..6613e83 100644
> --- a/gcc/Makefile.in
> +++ b/gcc/Makefile.in
> @@ -1474,6 +1474,7 @@ OBJS = \
>         tree-ssa-loop.o \
>         tree-ssa-math-opts.o \
>         tree-ssa-operands.o \
> +       tree-ssa-path-split.o \
>         tree-ssa-phionlycprop.o \
>         tree-ssa-phiopt.o \
>         tree-ssa-phiprop.o \
> diff --git a/gcc/common.opt b/gcc/common.opt
> index 757ce85..9cc94e2 100644
> --- a/gcc/common.opt
> +++ b/gcc/common.opt
> @@ -2403,6 +2403,10 @@ ftree-vrp
>  Common Report Var(flag_tree_vrp) Init(0) Optimization
>  Perform Value Range Propagation on trees.
>
> +ftree-path-split
> +Common Report Var(flag_tree_path_split) Init(0) Optimization
> +Perform Path Splitting on trees for loop backedges
> +
>  funit-at-a-time
>  Common Report Var(flag_unit_at_a_time) Init(1)
>  Compile whole compilation unit at a time.
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 213a9d0..b1e95da 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -354,6 +354,7 @@ Objective-C and Objective-C++ Dialects}.
>  -fdump-tree-fre@r{[}-@var{n}@r{]} @gol
>  -fdump-tree-vtable-verify @gol
>  -fdump-tree-vrp@r{[}-@var{n}@r{]} @gol
> +-fdump-tree-path-split@r{[}-@var{n}@r{]} @gol
>  -fdump-tree-storeccp@r{[}-@var{n}@r{]} @gol
>  -fdump-final-insns=@var{file} @gol
>  -fcompare-debug@r{[}=@var{opts}@r{]}  -fcompare-debug-second @gol
> @@ -462,7 +463,7 @@ Objective-C and Objective-C++ Dialects}.
>  -ftree-parallelize-loops=@var{n} -ftree-pre -ftree-partial-pre -ftree-pta
> @gol
>  -ftree-reassoc -ftree-sink -ftree-slsr -ftree-sra @gol
>  -ftree-switch-conversion -ftree-tail-merge -ftree-ter @gol
> --ftree-vectorize -ftree-vrp @gol
> +-ftree-vectorize -ftree-vrp @gol -ftree-path-split @gol
>  -funit-at-a-time -funroll-all-loops -funroll-loops @gol
>  -funsafe-loop-optimizations -funsafe-math-optimizations -funswitch-loops
> @gol
>  -fipa-ra -fvariable-expansion-in-unroller -fvect-cost-model -fvpt @gol
> @@ -7169,6 +7170,11 @@ output on to @file{stderr}. If two conflicting dump
> filenames are
>  given for the same pass, then the latter option overrides the earlier
>  one.
>
> +@item path-split
> +@opindex fdump-tree-path-split
> +Dump each function after path splitting.  The file name is made by
> +appending @file{.path-split} to the source file name
> +
>  @item all
>  Turn on all options, except @option{raw}, @option{slim}, @option{verbose}
>  and @option{lineno}.
> @@ -7811,6 +7817,7 @@ also turns on the following optimization flags:
>  -ftree-switch-conversion -ftree-tail-merge @gol
>  -ftree-pre @gol
>  -ftree-vrp @gol
> +-ftree-path-split @gol
>  -fipa-ra}
>
>  Please note the warning under @option{-fgcse} about
> @@ -8819,7 +8826,7 @@ currently enabled, but may be enabled by @option{-O2}
> in the future.
>
>  @item -ftree-sink
>  @opindex ftree-sink
> -Perform forward store motion  on trees.  This flag is
> +Perform forward store motion on trees.  This flag is
>  enabled by default at @option{-O} and higher.
>
>  @item -ftree-bit-ccp
> @@ -9125,6 +9132,13 @@ enabled by default at @option{-O2} and higher.  Null
> pointer check
>  elimination is only done if @option{-fdelete-null-pointer-checks} is
>  enabled.
>
> +@item -ftree-path-split
> +@opindex ftree-path-split
> +Perform Path Splitting on trees.  When the two execution paths of an
> +if-then-else merge at the loop latch node, try to duplicate the
> +merge node into two paths. This is enabled by default at @option{-O2}
> +and above.
> +
>  @item -fsplit-ivs-in-unroller
>  @opindex fsplit-ivs-in-unroller
>  Enables expression of values of induction variables in later iterations
> diff --git a/gcc/opts.c b/gcc/opts.c
> index 9a3fbb3..9a0b27c 100644
> --- a/gcc/opts.c
> +++ b/gcc/opts.c
> @@ -509,6 +509,7 @@ static const struct default_options
> default_options_table[] =
>      { OPT_LEVELS_2_PLUS, OPT_fisolate_erroneous_paths_dereference, NULL, 1
> },
>      { OPT_LEVELS_2_PLUS, OPT_fipa_ra, NULL, 1 },
>      { OPT_LEVELS_2_PLUS, OPT_flra_remat, NULL, 1 },
> +    { OPT_LEVELS_2_PLUS, OPT_ftree_path_split, NULL, 1 },
>
>      /* -O3 optimizations.  */
>      { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 },
> diff --git a/gcc/passes.def b/gcc/passes.def
> index c0ab6b9..e0c1cd8 100644
> --- a/gcc/passes.def
> +++ b/gcc/passes.def
> @@ -79,6 +79,7 @@ along with GCC; see the file COPYING3.  If not see
>           NEXT_PASS (pass_remove_cgraph_callee_edges);
>           NEXT_PASS (pass_object_sizes);
>           NEXT_PASS (pass_ccp);
> +         NEXT_PASS (pass_path_split);
>           /* After CCP we rewrite no longer addressed locals into SSA
>              form if possible.  */
>           NEXT_PASS (pass_forwprop);
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/path-split-1.c
> b/gcc/testsuite/gcc.dg/tree-ssa/path-split-1.c
> new file mode 100644
> index 0000000..4b8637b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/path-split-1.c
> @@ -0,0 +1,67 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fdump-tree-path-split-details " } */
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +
> +#define RGBMAX 255
> +
> +int
> +test()
> +{
> +  int i, Pels;
> +  unsigned char sum = 0;
> +  unsigned char xr, xg, xb;
> +  unsigned char xc, xm, xy, xk;
> +  unsigned char *ReadPtr, *EritePtr;
> +
> +  ReadPtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
> +  EritePtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
> +
> +  for (i = 0; i < 100;i++)
> +     {
> +       ReadPtr[i] = 100 - i;
> +     }
> +
> +  for (i = 0; i < 100; i++)
> +     {
> +       xr = *ReadPtr++;
> +       xg = *ReadPtr++;
> +       xb = *ReadPtr++;
> +
> +       xc = (unsigned char) (RGBMAX - xr);
> +       xm = (unsigned char) (RGBMAX - xg);
> +       xy = (unsigned char) (RGBMAX - xb);
> +
> +       if (xc < xm)
> +         {
> +           xk = (unsigned char) (xc < xy ? xc : xy);
> +         }
> +       else
> +        {
> +          xk = (unsigned char) (xm < xy ? xm : xy);
> +        }
> +
> +       xc = (unsigned char) (xc - xk);
> +       xm = (unsigned char) (xm - xk);
> +       xy = (unsigned char) (xy - xk);
> +
> +       *EritePtr++ = xc;
> +       *EritePtr++ = xm;
> +       *EritePtr++ = xy;
> +       *EritePtr++ = xk;
> +       sum += *EritePtr;
> +    }
> +  return sum;
> +}
> +
> +int
> +main()
> +{
> +  if (test() != 33)
> +    abort();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump "Split path in loop:" "path-split" } } */
> diff --git a/gcc/timevar.def b/gcc/timevar.def
> index b429faf..3dba68b 100644
> --- a/gcc/timevar.def
> +++ b/gcc/timevar.def
> @@ -300,3 +300,4 @@ DEFTIMEVAR (TV_LINK              , "link JIT code")
>  DEFTIMEVAR (TV_LOAD                 , "load JIT result")
>  DEFTIMEVAR (TV_JIT_ACQUIRING_MUTEX   , "acquiring JIT mutex")
>  DEFTIMEVAR (TV_JIT_CLIENT_CODE   , "JIT client code")
> +DEFTIMEVAR (TV_TREE_PATH_SPLIT  , "tree path split")
> diff --git a/gcc/tracer.c b/gcc/tracer.c
> index 941dc20..c7b5150 100644
> --- a/gcc/tracer.c
> +++ b/gcc/tracer.c
> @@ -51,9 +51,9 @@
>  #include "tree-inline.h"
>  #include "cfgloop.h"
>  #include "fibonacci_heap.h"
> +#include "tracer.h"
>
>  static int count_insns (basic_block);
> -static bool ignore_bb_p (const_basic_block);
>  static bool better_p (const_edge, const_edge);
>  static edge find_best_successor (basic_block);
>  static edge find_best_predecessor (basic_block);
> @@ -85,7 +85,7 @@ bb_seen_p (basic_block bb)
>  }
>
>  /* Return true if we should ignore the basic block for purposes of tracing.
> */
> -static bool
> +bool
>  ignore_bb_p (const_basic_block bb)
>  {
>    if (bb->index < NUM_FIXED_BLOCKS)
> @@ -226,6 +226,24 @@ find_trace (basic_block bb, basic_block *trace)
>    return i;
>  }
>
> +/* Duplicate block BB2, placing it after BB in the CFG.  Return the
> +   newly created block.  */
> +basic_block
> +transform_duplicate (basic_block bb, basic_block bb2)
> +{
> +  edge e;
> +  basic_block copy;
> +
> +  e = find_edge (bb, bb2);
> +
> +  copy = duplicate_block (bb2, e, bb);
> +  flush_pending_stmts (e);
> +
> +  add_phi_args_after_copy (&copy, 1, NULL);
> +
> +  return (copy);
> +}
> +
>  /* Look for basic blocks in frequency order, construct traces and tail
> duplicate
>     if profitable.  */
>
> @@ -321,17 +339,8 @@ tail_duplicate (void)
>                  entries or at least rotate the loop.  */
>               && bb2->loop_father->header != bb2)
>             {
> -             edge e;
> -             basic_block copy;
> -
> -             nduplicated += counts [bb2->index];
> -
> -             e = find_edge (bb, bb2);
> -
> -             copy = duplicate_block (bb2, e, bb);
> -             flush_pending_stmts (e);
> -
> -             add_phi_args_after_copy (&copy, 1, NULL);
> +              nduplicated += counts [bb2->index];
> +              basic_block copy = transform_duplicate (bb, bb2);
>
>               /* Reconsider the original copy of block we've duplicated.
>                  Removing the most common predecessor may make it to be
> diff --git a/gcc/tracer.h b/gcc/tracer.h
> new file mode 100644
> index 0000000..454d3b7
> --- /dev/null
> +++ b/gcc/tracer.h
> @@ -0,0 +1,26 @@
> +/* Header file for Tracer.
> +   Copyright (C) 2015 Free Software Foundation, Inc.
> +
> +This file is part of GCC.
> +
> +GCC is free software; you can redistribute it and/or modify it under
> +the terms of the GNU General Public License as published by the Free
> +Software Foundation; either version 3, or (at your option) any later
> +version.
> +
> +GCC is distributed in the hope that it will be useful, but WITHOUT ANY
> +WARRANTY; without even the implied warranty of MERCHANTABILITY or
> +FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
> + for more details.
> +
> +You should have received a copy of the GNU General Public License
> +along with GCC; see the file COPYING3.  If not see
> +<http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_TRACER_H
> +#define GCC_TRACER_H
> +
> +extern basic_block transform_duplicate (basic_block bb, basic_block bb2);
> +extern bool ignore_bb_p (const_basic_block bb);
> +
> +#endif /* GCC_TRACER_H */
> diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
> index 49e22a9..6963acc 100644
> --- a/gcc/tree-pass.h
> +++ b/gcc/tree-pass.h
> @@ -390,6 +390,7 @@ extern gimple_opt_pass *make_pass_tree_loop_done
> (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_ch (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_ch_vect (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_ccp (gcc::context *ctxt);
> +extern gimple_opt_pass *make_pass_path_split (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_phi_only_cprop (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_build_ssa (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_build_alias (gcc::context *ctxt);
> diff --git a/gcc/tree-ssa-path-split.c b/gcc/tree-ssa-path-split.c
> new file mode 100644
> index 0000000..33dbc4d
> --- /dev/null
> +++ b/gcc/tree-ssa-path-split.c
> @@ -0,0 +1,254 @@
> +/* Support routines for Path Splitting.
> +   Copyright (C) 2015 Free Software Foundation, Inc.
> +   Contributed by Ajit Kumar Agarwal <ajitkum@xilinx.com>.
> +
> + This file is part of GCC.
> +
> + GCC is free software; you can redistribute it and/or modify
> + it under the terms of the GNU General Public License as published by
> + the Free Software Foundation; either version 3, or (at your option)
> + any later version.
> +
> +GCC is distributed in the hope that it will be useful,
> +but WITHOUT ANY WARRANTY; without even the implied warranty of
> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +GNU General Public License for more details.
> +
> +You should have received a copy of the GNU General Public License
> +along with GCC; see the file COPYING3.  If not see
> +<http://www.gnu.org/licenses/>.  */
> +
> +#include "config.h"
> +#include "system.h"
> +#include "coretypes.h"
> +#include "backend.h"
> +#include "tree.h"
> +#include "gimple.h"
> +#include "tree-pass.h"
> +#include "cfganal.h"
> +#include "cfgloop.h"
> +#include "gimple-iterator.h"
> +#include "tracer.h"
> +
> +/* Given LOOP, if its latch has an immediate dominator that ends
> +   in a simple conditional, then search for a block that is a
> +   predecessor of the latch that falls through to the latch and
> +   has no other successors.
> +
> +   When found, put that block into TRACE[0] and the latch block into
> +   TRACE[1].  Otherwise do nothing.  */
> +
> +static void
> +find_trace_loop_latch_same_as_join_blk (loop_p loop, basic_block *trace)
> +{
> +  basic_block latch = loop->latch;
> +
> +  /* We don't support path splitting if the latch has more than two
> +     predecessors.  */
> +  if (EDGE_COUNT (latch->preds) == 2)
> +    {
> +      basic_block bb = get_immediate_dominator (CDI_DOMINATORS, latch);
> +      gimple *last = gsi_stmt (gsi_last_bb (bb));
> +
> +      /* The immediate dominator of the latch must end in a conditional.
> */
> +      if (last && gimple_code (last) != GIMPLE_COND)
> +       return;
> +
> +      /* This looks for a predecessor of the latch which has a single
> +        fallthru successor (that is the latch).  Only one of the blocks
> +        can match that pattern.
> +
> +        Do we need to check that E->src is a successor of BB?  */
> +      edge_iterator ei;
> +      edge e;
> +      FOR_EACH_EDGE (e, ei, latch->preds)
> +       {
> +         if (!single_succ_p (e->src)
> +             || !(single_succ_edge (e->src)->flags & EDGE_FALLTHRU))
> +           break;
> +         else
> +           {
> +             trace[0] = e->src;
> +             trace[1] = latch;
> +             break;
> +           }
> +       }
> +    }
> +}
> +
> +/* Return TRUE if BB is a reasonable block to duplicate by examining
> +   its size, false otherwise.  BB will always be a loop latch block.
> +
> +   Should this use the same tests as we do for jump threading?  */
> +
> +static bool
> +is_feasible_trace (basic_block bb)
> +{
> +  int num_stmt = 0;
> +  gimple_stmt_iterator gsi;
> +
> +  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> +    {
> +      gimple *stmt = gsi_stmt (gsi);
> +      if (!is_gimple_debug (stmt))
> +       num_stmt++;
> +    }
> +
> +  /* We may want to limit how many statements we copy.  */
> +  if (num_stmt > 1)
> +    return true;
> +
> +  return false;
> +}
> +
> +/* If the immediate dominator of the latch of the loop is a
> +   block with a conditional branch, then the loop latch is
> +   duplicated into its predecessor paths, preserving the SSA
> +   semantics.
> +
> +   CFG before transformation.
> +
> +   <bb 6>:
> +      xk_35 = MIN_EXPR <xy_34, xc_32>;
> +      goto <bb 8>;
> +
> +   <bb 7>:
> +      xk_36 = MIN_EXPR <xy_34, xm_33>;
> +
> +   <bb 8>:
> +      # xk_4 = PHI <xk_35(6), xk_36(7)>
> +      xc_37 = xc_32 - xk_4;
> +      xm_38 = xm_33 - xk_4;
> +      xy_39 = xy_34 - xk_4;
> +
> +   CFG After Path Splitting transformation
> +   before cleanup phase.
> +
> +   <bb 7>:
> +     xk_35 = MIN_EXPR <xy_34, xc_32>;
> +
> +   <bb 8>:
> +     # xk_29 = PHI <xk_35(7)>
> +     xc_56 = xc_32 - xk_29;
> +     xm_57 = xm_33 - xk_29;
> +     xy_58 = xy_34 - xk_29;
> +     goto <bb 11>;
> +
> +   <bb 9>:
> +     xk_36 = MIN_EXPR <xy_34, xm_33>;
> +
> +   <bb 10>:
> +     # xk_4 = PHI <xk_36(9)>
> +     xc_37 = xc_32 - xk_4;
> +     xm_38 = xm_33 - xk_4;
> +     xy_39 = xy_34 - xk_4;
> +
> +  <bb 11>: .......  */
> +
> +static bool
> +perform_path_splitting ()
> +{
> +  bool changed = false;
> +  basic_block trace[2] = {NULL, NULL};
> +  loop_p loop;
> +
> +  loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS);
> +  initialize_original_copy_tables ();
> +  calculate_dominance_info (CDI_DOMINATORS);
> +
> +  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
> +    {
> +      /* If we're optimizing for size, or should not duplicate
> +        the latch block for some other reason, then ignore this
> +        loop.  */
> +      if (ignore_bb_p (loop->latch))
> +       continue;
> +
> +      /* Get the latch's predecessor and the latch node, storing them
> +        into trace[0] and trace[1] respectively.  */
> +      find_trace_loop_latch_same_as_join_blk (loop, trace);
> +
> +      if (trace[0] && trace[1] && is_feasible_trace (trace[1]))
> +       {
> +          if (dump_file && (dump_flags & TDF_DETAILS))
> +            fprintf (dump_file,
> +                    "Split path in loop: latch block %d, predecessor
> %d.\n",
> +                    trace[1]->index, trace[0]->index);
> +         transform_duplicate (trace[0], trace[1]);
> +         trace[0] = NULL;
> +         trace[1] = NULL;
> +         changed = true;
> +       }
> +    }
> +
> +  loop_optimizer_finalize ();
> +  free_original_copy_tables ();
> +  return changed;
> +}
> +
> +/* Main entry point for path splitting.  Returns TODO_cleanup_cfg if any
> +   paths were split, otherwise return zero.  */
> +
> +static unsigned int
> +execute_path_split (void)
> +{
> +  /* If we don't have at least 2 real blocks and backedges in the
> +     CFG, then there's no point in trying to perform path splitting.  */
> +  if (n_basic_blocks_for_fn (cfun) <= NUM_FIXED_BLOCKS + 1
> +      || !mark_dfs_back_edges ())
> +    return 0;
> +
> +  bool changed = perform_path_splitting();
> +  if (changed)
> +    {
> +      free_dominance_info (CDI_DOMINATORS);
> +      /* If we changed the CFG schedule loops for fixup by cleanup_cfg.  */
> +      if (current_loops)
> +       loops_state_set (LOOPS_NEED_FIXUP);
> +    }
> +
> +  return changed ? TODO_cleanup_cfg : 0;
> +
> +}
> +
> +static bool
> +gate_path_split(void)
> +{
> +  return flag_tree_path_split != 0;
> +}
> +
> +namespace {
> +
> +const pass_data pass_data_path_split =
> +{
> +  GIMPLE_PASS, /* type */
> +  "path-split", /* name */
> +  OPTGROUP_NONE, /* optinfo_flags */
> +  TV_TREE_PATH_SPLIT, /* tv_id */
> +  PROP_ssa, /* properties_required */
> +  0, /* properties_provided */
> +  0, /* properties_destroyed */
> +  0, /* todo_flags_start */
> +  TODO_update_ssa, /* todo_flags_finish */
> +};
> +
> +class pass_path_split : public gimple_opt_pass
> +{
> +   public:
> +    pass_path_split (gcc::context *ctxt)
> +      : gimple_opt_pass (pass_data_path_split, ctxt)
> +    {}
> +   /* opt_pass methods: */
> +   opt_pass * clone () { return new pass_path_split (m_ctxt); }
> +   virtual bool gate (function *) { return gate_path_split (); }
> +   virtual unsigned int execute (function *) { return execute_path_split
> (); }
> +
> +}; // class pass_path_split
> +
> +} // anon namespace
> +
> +gimple_opt_pass *
> +make_pass_path_split (gcc::context *ctxt)
> +{
> +  return new pass_path_split (ctxt);
> +}
>

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-11-12 10:58                     ` Richard Biener
@ 2015-11-12 17:05                       ` Jeff Law
  2015-11-12 18:33                         ` Jeff Law
  0 siblings, 1 reply; 72+ messages in thread
From: Jeff Law @ 2015-11-12 17:05 UTC (permalink / raw)
  To: Richard Biener
  Cc: Ajit Kumar Agarwal, GCC Patches, Vinod Kathail,
	Shail Aditya Gupta, Vidhumouli Hunsigida, Nagaraju Mekala

On 11/12/2015 03:58 AM, Richard Biener wrote:
> On Wed, Nov 11, 2015 at 9:38 PM, Jeff Law <law@redhat.com> wrote:
>> On 09/04/2015 11:36 AM, Ajit Kumar Agarwal wrote:
>>
>>>> diff --git a/gcc/passes.def b/gcc/passes.def
>>>> index 6b66f8f..20ddf3d 100644
>>>> --- a/gcc/passes.def
>>>> +++ b/gcc/passes.def
>>>> @@ -82,6 +82,7 @@ along with GCC; see the file COPYING3.  If not see
>>>>            NEXT_PASS (pass_ccp);
>>>>            /* After CCP we rewrite no longer addressed locals into SSA
>>>>               form if possible.  */
>>>> +          NEXT_PASS (pass_path_split);
>>>>            NEXT_PASS (pass_forwprop);
>>>>            NEXT_PASS (pass_sra_early);
>>>
>>> I can't recall if we've discussed the location of the pass at all.  I'm
>>> not objecting to this location, but would like to hear why you chose
>>> this particular location in the optimization pipeline.
>>
>> So returning to the question of where this would live in the optimization
>> pipeline and how it interacts with if-conversion and vectorization.
>
> Note that adding passes to the early pipeline that do code duplication
> is a no-no.
> The early pipeline should be exclusively for things making functions
> more suitable for inlining.
I'd been experimenting with moving it down in the pipeline.  It 
certainly doesn't seem to need to be in the early optimizations.   At 
some point we force latches to have single successors which spoils the 
simplistic region recognition of the path splitting pass.

>
>> The concern with moving it to late in the pipeline was that we'd miss
>> VRP/DCE/CSE opportunities.  I'm not sure if you're aware, but we actually
>> run those passes more than once.  So it would be possible to run path
>> splitting after if-conversion & vectorization, but before the second pass
>> of VRP & DOM.  But trying that seems to result in something scrambling the
>> loop enough that the path splitting opportunity is missed.  That might be
>> worth deeper investigation if we can't come up with some kind of heuristics
>> to fire or suppress path splitting.
>
> As I still think it is a transform similar to tracer just put it next to that.
The CFG has changed shape significantly by that point.  So some 
adjustments would be needed.  Essentially it's no longer the latch that 
needs to be duplicated into the THEN/ELSE clauses, but the join point 
that's the predecessor of the latch.

But that's probably a good change to make anyway because we end up doing 
less damage to the overall shape of the CFG.  Essentially path splitting 
would look like creating superblocks by target duplication, and that's 
kind of what I expected this to look like all-along.


>
> But IIRC you mentioned it should enable vectorization or so?  In this case
> that's obviously too late.
The opposite.  Path splitting interferes with if-conversion & 
vectorization.  Path splitting mucks up the CFG enough that 
if-conversion won't fire and as a result vectorization is inhibited.  It 
also creates multi-latch loops, which isn't a great situation either.

It *may* be the case that dropping it that far down in the pipeline and 
making the modifications necessary to handle simple latches may in turn 
make the path splitting code play better with if-conversion and 
vectorization and avoid creation of multi-latch loops.  At least that's 
how it looks on paper when I draw out the CFG manipulations.

I'll do some experiments.

Jeff


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-11-12 17:05                       ` Jeff Law
@ 2015-11-12 18:33                         ` Jeff Law
  2015-11-12 19:40                           ` Richard Biener
  2015-11-12 21:58                           ` Jeff Law
  0 siblings, 2 replies; 72+ messages in thread
From: Jeff Law @ 2015-11-12 18:33 UTC (permalink / raw)
  To: Richard Biener
  Cc: Ajit Kumar Agarwal, GCC Patches, Vinod Kathail,
	Shail Aditya Gupta, Vidhumouli Hunsigida, Nagaraju Mekala

On 11/12/2015 10:05 AM, Jeff Law wrote:
>> But IIRC you mentioned it should enable vectorization or so?  In this
>> case
>> that's obviously too late.
> The opposite.  Path splitting interferes with if-conversion &
> vectorization.  Path splitting mucks up the CFG enough that
> if-conversion won't fire and as a result vectorization is inhibited.  It
> also creates multi-latch loops, which isn't a great situation either.
>
> It *may* be the case that dropping it that far down in the pipeline and
> making the modifications necessary to handle simple latches may in turn
> make the path splitting code play better with if-conversion and
> vectorization and avoid creation of multi-latch loops.  At least that's
> how it looks on paper when I draw out the CFG manipulations.
>
> I'll do some experiments.
It doesn't look too terrible to revamp the recognition code to work 
later in the pipeline with simple latches.  Sadly that doesn't seem to 
have fixed the bad interactions with if-conversion.

*But* that does open up the possibility of moving the path splitting 
pass even deeper in the pipeline -- in particular we can move it past 
the vectorizer.  Which may be a win.

So the big question is whether or not we'll still see enough benefits 
from having it so late in the pipeline.  It's still early enough that we 
get DOM, VRP, reassoc, forwprop, phiopt, etc.

Ajit, I'll pass along an updated patch after doing some more testing.

Jeff


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-11-12 18:33                         ` Jeff Law
@ 2015-11-12 19:40                           ` Richard Biener
  2015-11-12 19:52                             ` Jeff Law
  2015-11-12 21:58                           ` Jeff Law
  1 sibling, 1 reply; 72+ messages in thread
From: Richard Biener @ 2015-11-12 19:40 UTC (permalink / raw)
  To: Jeff Law
  Cc: Ajit Kumar Agarwal, GCC Patches, Vinod Kathail,
	Shail Aditya Gupta, Vidhumouli Hunsigida, Nagaraju Mekala

On November 12, 2015 7:32:57 PM GMT+01:00, Jeff Law <law@redhat.com> wrote:
>On 11/12/2015 10:05 AM, Jeff Law wrote:
>>> But IIRC you mentioned it should enable vectorization or so?  In
>this
>>> case
>>> that's obviously too late.
>> The opposite.  Path splitting interferes with if-conversion &
>> vectorization.  Path splitting mucks up the CFG enough that
>> if-conversion won't fire and as a result vectorization is inhibited. 
>It
>> also creates multi-latch loops, which isn't a great situation either.
>>
>> It *may* be the case that dropping it that far down in the pipeline
>and
>> making the modifications necessary to handle simple latches may in
>turn
>> make the path splitting code play better with if-conversion and
>> vectorization and avoid creation of multi-latch loops.  At least
>that's
>> how it looks on paper when I draw out the CFG manipulations.
>>
>> I'll do some experiments.
>It doesn't look too terrible to revamp the recognition code to work 
>later in the pipeline with simple latches.  Sadly that doesn't seem to 
>have fixed the bad interactions with if-conversion.
>
>*But* that does open up the possibility of moving the path splitting 
>pass even deeper in the pipeline -- in particular we can move it past 
>the vectorizer.  Which may be a win.
>
>So the big question is whether or not we'll still see enough benefits 
>from having it so late in the pipeline.  It's still early enough that
>we 
>get DOM, VRP, reassoc, forwprop, phiopt, etc.
>
>Ajit, I'll pass along an updated patch after doing some more testing.

BTW, if you do not use LOOPS_NORMAL for loop init you don't get simple latches forced (and cfg-cleanup will remove them).

Richard.

>Jeff


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-11-12 19:40                           ` Richard Biener
@ 2015-11-12 19:52                             ` Jeff Law
  0 siblings, 0 replies; 72+ messages in thread
From: Jeff Law @ 2015-11-12 19:52 UTC (permalink / raw)
  To: Richard Biener
  Cc: Ajit Kumar Agarwal, GCC Patches, Vinod Kathail,
	Shail Aditya Gupta, Vidhumouli Hunsigida, Nagaraju Mekala

On 11/12/2015 12:40 PM, Richard Biener wrote:
> On November 12, 2015 7:32:57 PM GMT+01:00, Jeff Law <law@redhat.com>
> wrote:
>> On 11/12/2015 10:05 AM, Jeff Law wrote:
>>>> But IIRC you mentioned it should enable vectorization or so?
>>>> In
>> this
>>>> case that's obviously too late.
>>> The opposite.  Path splitting interferes with if-conversion &
>>> vectorization.  Path splitting mucks up the CFG enough that
>>> if-conversion won't fire and as a result vectorization is
>>> inhibited.
>> It
>>> also creates multi-latch loops, which isn't a great situation
>>> either.
>>>
>>> It *may* be the case that dropping it that far down in the
>>> pipeline
>> and
>>> making the modifications necessary to handle simple latches may
>>> in
>> turn
>>> make the path splitting code play better with if-conversion and
>>> vectorization and avoid creation of multi-latch loops.  At least
>> that's
>>> how it looks on paper when I draw out the CFG manipulations.
>>>
>>> I'll do some experiments.
>> It doesn't look too terrible to revamp the recognition code to
>> work later in the pipeline with simple latches.  Sadly that doesn't
>> seem to have fixed the bad interactions with if-conversion.
>>
>> *But* that does open up the possibility of moving the path
>> splitting pass even deeper in the pipeline -- in particular we can
>> move it past the vectorizer.  Which may be a win.
>>
>> So the big question is whether or not we'll still see enough
>> benefits from having it so late in the pipeline.  It's still early
>> enough that we get DOM, VRP, reassoc, forwprop, phiopt, etc.
>>
>> Ajit, I'll pass along an updated patch after doing some more
>> testing.
>
> BTW, if you do not use LOOPS_NORMAL for loop init you don't get simple
> latches forced (and cfg-cleanup will remove them)
I think I'd prefer to have loops start in simple-latches form and 
preserve the simple-latches form.    Detection is slightly harder, but 
transformation without creating multiple latches is easier.

jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-11-12 18:33                         ` Jeff Law
  2015-11-12 19:40                           ` Richard Biener
@ 2015-11-12 21:58                           ` Jeff Law
  2015-11-13 10:13                             ` Richard Biener
  2015-11-13 13:19                             ` Ajit Kumar Agarwal
  1 sibling, 2 replies; 72+ messages in thread
From: Jeff Law @ 2015-11-12 21:58 UTC (permalink / raw)
  To: Richard Biener
  Cc: Ajit Kumar Agarwal, GCC Patches, Vinod Kathail,
	Shail Aditya Gupta, Vidhumouli Hunsigida, Nagaraju Mekala

[-- Attachment #1: Type: text/plain, Size: 2516 bytes --]

On 11/12/2015 11:32 AM, Jeff Law wrote:
> On 11/12/2015 10:05 AM, Jeff Law wrote:
>>> But IIRC you mentioned it should enable vectorization or so?  In this
>>> case
>>> that's obviously too late.
>> The opposite.  Path splitting interferes with if-conversion &
>> vectorization.  Path splitting mucks up the CFG enough that
>> if-conversion won't fire and as a result vectorization is inhibited.  It
>> also creates multi-latch loops, which isn't a great situation either.
>>
>> It *may* be the case that dropping it that far down in the pipeline and
>> making the modifications necessary to handle simple latches may in turn
>> make the path splitting code play better with if-conversion and
>> vectorization and avoid creation of multi-latch loops.  At least that's
>> how it looks on paper when I draw out the CFG manipulations.
>>
>> I'll do some experiments.
> It doesn't look too terrible to revamp the recognition code to work
> later in the pipeline with simple latches.  Sadly that doesn't seem to
> have fixed the bad interactions with if-conversion.
>
> *But* that does open up the possibility of moving the path splitting
> pass even deeper in the pipeline -- in particular we can move it past
> the vectorizer.  Which may be a win.
>
> So the big question is whether or not we'll still see enough benefits
> from having it so late in the pipeline.  It's still early enough that we
> get DOM, VRP, reassoc, forwprop, phiopt, etc.
>
> Ajit, I'll pass along an updated patch after doing some more testing.
So here's what I'm working with.  It runs after the vectorizer now.

Ajit, if you could benchmark this it would be greatly appreciated.  I 
know you saw significant improvements on one or more benchmarks in the 
past.  It'd be good to know that the updated placement of the pass 
doesn't invalidate the gains you saw.

With the updated pass placement, we don't have to worry about switching 
the pass on/off based on whether or not the vectorizer & if-conversion 
are enabled.  So that hackery is gone.

I think I've beefed up the test to identify the diamond patterns we want 
so that it's stricter in what we accept.  The call to ignore_bb_p is a 
part of that test so that we're actually looking at the right block in a 
world where we're doing this transformation with simple latches.

I've also put a graphical comment before perform_path_splitting which 
hopefully shows the CFG transformation we're making a bit clearer.

This bootstraps and regression tests cleanly on x86_64-linux-gnu.



[-- Attachment #2: patch --]
[-- Type: text/plain, Size: 18505 bytes --]

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 34d2356..6613e83 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1474,6 +1474,7 @@ OBJS = \
 	tree-ssa-loop.o \
 	tree-ssa-math-opts.o \
 	tree-ssa-operands.o \
+	tree-ssa-path-split.o \
 	tree-ssa-phionlycprop.o \
 	tree-ssa-phiopt.o \
 	tree-ssa-phiprop.o \
diff --git a/gcc/common.opt b/gcc/common.opt
index 757ce85..3e946ca 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -2403,6 +2403,10 @@ ftree-vrp
 Common Report Var(flag_tree_vrp) Init(0) Optimization
 Perform Value Range Propagation on trees.
 
+ftree-path-split
+Common Report Var(flag_tree_path_split) Init(0) Optimization
+Perform Path Splitting on trees for loop backedges.
+
 funit-at-a-time
 Common Report Var(flag_unit_at_a_time) Init(1)
 Compile whole compilation unit at a time.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 213a9d0..b1e95da 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -354,6 +354,7 @@ Objective-C and Objective-C++ Dialects}.
 -fdump-tree-fre@r{[}-@var{n}@r{]} @gol
 -fdump-tree-vtable-verify @gol
 -fdump-tree-vrp@r{[}-@var{n}@r{]} @gol
+-fdump-tree-path-split@r{[}-@var{n}@r{]} @gol
 -fdump-tree-storeccp@r{[}-@var{n}@r{]} @gol
 -fdump-final-insns=@var{file} @gol
 -fcompare-debug@r{[}=@var{opts}@r{]}  -fcompare-debug-second @gol
@@ -462,7 +463,7 @@ Objective-C and Objective-C++ Dialects}.
 -ftree-parallelize-loops=@var{n} -ftree-pre -ftree-partial-pre -ftree-pta @gol
 -ftree-reassoc -ftree-sink -ftree-slsr -ftree-sra @gol
 -ftree-switch-conversion -ftree-tail-merge -ftree-ter @gol
--ftree-vectorize -ftree-vrp @gol
+-ftree-vectorize -ftree-vrp @gol -ftree-path-split @gol
 -funit-at-a-time -funroll-all-loops -funroll-loops @gol
 -funsafe-loop-optimizations -funsafe-math-optimizations -funswitch-loops @gol
 -fipa-ra -fvariable-expansion-in-unroller -fvect-cost-model -fvpt @gol
@@ -7169,6 +7170,11 @@ output on to @file{stderr}. If two conflicting dump filenames are
 given for the same pass, then the latter option overrides the earlier
 one.
 
+@item path-split
+@opindex fdump-tree-path-split
+Dump each function after path splitting.  The file name is made by
+appending @file{.path-split} to the source file name.
+
 @item all
 Turn on all options, except @option{raw}, @option{slim}, @option{verbose}
 and @option{lineno}.
@@ -7811,6 +7817,7 @@ also turns on the following optimization flags:
 -ftree-switch-conversion -ftree-tail-merge @gol
 -ftree-pre @gol
 -ftree-vrp @gol
+-ftree-path-split @gol
 -fipa-ra}
 
 Please note the warning under @option{-fgcse} about
@@ -8819,7 +8826,7 @@ currently enabled, but may be enabled by @option{-O2} in the future.
 
 @item -ftree-sink
 @opindex ftree-sink
-Perform forward store motion  on trees.  This flag is
+Perform forward store motion on trees.  This flag is
 enabled by default at @option{-O} and higher.
 
 @item -ftree-bit-ccp
@@ -9125,6 +9132,13 @@ enabled by default at @option{-O2} and higher.  Null pointer check
 elimination is only done if @option{-fdelete-null-pointer-checks} is
 enabled.
 
+@item -ftree-path-split
+@opindex ftree-path-split
+Perform Path Splitting on trees.  When the two execution paths of an
+if-then-else merge at the loop latch node, try to duplicate the
+merge node into two paths.  This is enabled by default at @option{-O2}
+and above.
+
 @item -fsplit-ivs-in-unroller
 @opindex fsplit-ivs-in-unroller
 Enables expression of values of induction variables in later iterations
diff --git a/gcc/opts.c b/gcc/opts.c
index 9a3fbb3..9a0b27c 100644
--- a/gcc/opts.c
+++ b/gcc/opts.c
@@ -509,6 +509,7 @@ static const struct default_options default_options_table[] =
     { OPT_LEVELS_2_PLUS, OPT_fisolate_erroneous_paths_dereference, NULL, 1 },
     { OPT_LEVELS_2_PLUS, OPT_fipa_ra, NULL, 1 },
     { OPT_LEVELS_2_PLUS, OPT_flra_remat, NULL, 1 },
+    { OPT_LEVELS_2_PLUS, OPT_ftree_path_split, NULL, 1 },
 
     /* -O3 optimizations.  */
     { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 },
diff --git a/gcc/passes.def b/gcc/passes.def
index c0ab6b9..4c9ef5f 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -274,6 +274,7 @@ along with GCC; see the file COPYING3.  If not see
       POP_INSERT_PASSES ()
       NEXT_PASS (pass_simduid_cleanup);
       NEXT_PASS (pass_lower_vector_ssa);
+      NEXT_PASS (pass_path_split);
       NEXT_PASS (pass_cse_reciprocals);
       NEXT_PASS (pass_reassoc);
       NEXT_PASS (pass_strength_reduction);
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/path-split-1.c b/gcc/testsuite/gcc.dg/tree-ssa/path-split-1.c
new file mode 100644
index 0000000..c7e9515
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/path-split-1.c
@@ -0,0 +1,67 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fdump-tree-path-split-details " } */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define RGBMAX 255
+
+int
+test()
+{
+  int i, Pels;
+  unsigned char sum = 0;
+  unsigned char xr, xg, xb;
+  unsigned char xc, xm, xy, xk;
+  unsigned char *ReadPtr, *EritePtr;
+
+  ReadPtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
+  EritePtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
+
+  for (i = 0; i < 100;i++)
+     {
+       ReadPtr[i] = 100 - i;
+     }
+
+  for (i = 0; i < 100; i++)
+     {
+       xr = *ReadPtr++;
+       xg = *ReadPtr++;
+       xb = *ReadPtr++;
+
+       xc = (unsigned char) (RGBMAX - xr);
+       xm = (unsigned char) (RGBMAX - xg);
+       xy = (unsigned char) (RGBMAX - xb);
+
+       if (xc < xm)
+         {
+           xk = (unsigned char) (xc < xy ? xc : xy);
+         }
+       else
+        {
+          xk = (unsigned char) (xm < xy ? xm : xy);
+        }
+
+       xc = (unsigned char) (xc - xk);
+       xm = (unsigned char) (xm - xk);
+       xy = (unsigned char) (xy - xk);
+
+       *EritePtr++ = xc;
+       *EritePtr++ = xm;
+       *EritePtr++ = xy;
+       *EritePtr++ = xk;
+       sum += *EritePtr;
+    }
+  return sum;
+}
+
+int
+main()
+{
+  if (test() != 33)
+    abort();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "Duplicating join block" "path-split" } } */
diff --git a/gcc/timevar.def b/gcc/timevar.def
index b429faf..3dba68b 100644
--- a/gcc/timevar.def
+++ b/gcc/timevar.def
@@ -300,3 +300,4 @@ DEFTIMEVAR (TV_LINK		     , "link JIT code")
 DEFTIMEVAR (TV_LOAD		     , "load JIT result")
 DEFTIMEVAR (TV_JIT_ACQUIRING_MUTEX   , "acquiring JIT mutex")
 DEFTIMEVAR (TV_JIT_CLIENT_CODE   , "JIT client code")
+DEFTIMEVAR (TV_TREE_PATH_SPLIT  , "tree path split")
diff --git a/gcc/tracer.c b/gcc/tracer.c
index 941dc20..c7b5150 100644
--- a/gcc/tracer.c
+++ b/gcc/tracer.c
@@ -51,9 +51,9 @@
 #include "tree-inline.h"
 #include "cfgloop.h"
 #include "fibonacci_heap.h"
+#include "tracer.h"
 
 static int count_insns (basic_block);
-static bool ignore_bb_p (const_basic_block);
 static bool better_p (const_edge, const_edge);
 static edge find_best_successor (basic_block);
 static edge find_best_predecessor (basic_block);
@@ -85,7 +85,7 @@ bb_seen_p (basic_block bb)
 }
 
 /* Return true if we should ignore the basic block for purposes of tracing.  */
-static bool
+bool
 ignore_bb_p (const_basic_block bb)
 {
   if (bb->index < NUM_FIXED_BLOCKS)
@@ -226,6 +226,24 @@ find_trace (basic_block bb, basic_block *trace)
   return i;
 }
 
+/* Duplicate block BB2, placing it after BB in the CFG.  Return the
+   newly created block.  */
+basic_block
+transform_duplicate (basic_block bb, basic_block bb2)
+{
+  edge e;
+  basic_block copy;
+
+  e = find_edge (bb, bb2);
+
+  copy = duplicate_block (bb2, e, bb);
+  flush_pending_stmts (e);
+
+  add_phi_args_after_copy (&copy, 1, NULL);
+
+  return (copy);
+}
+
 /* Look for basic blocks in frequency order, construct traces and tail duplicate
    if profitable.  */
 
@@ -321,17 +339,8 @@ tail_duplicate (void)
 		 entries or at least rotate the loop.  */
 	      && bb2->loop_father->header != bb2)
 	    {
-	      edge e;
-	      basic_block copy;
-
-	      nduplicated += counts [bb2->index];
-
-	      e = find_edge (bb, bb2);
-
-	      copy = duplicate_block (bb2, e, bb);
-	      flush_pending_stmts (e);
-
-	      add_phi_args_after_copy (&copy, 1, NULL);
+              nduplicated += counts [bb2->index];
+              basic_block copy = transform_duplicate (bb, bb2);
 
 	      /* Reconsider the original copy of block we've duplicated.
 	         Removing the most common predecessor may make it to be
diff --git a/gcc/tracer.h b/gcc/tracer.h
new file mode 100644
index 0000000..cd1792a
--- /dev/null
+++ b/gcc/tracer.h
@@ -0,0 +1,26 @@
+/* Header file for Tracer.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_TRACER_H
+#define GCC_TRACER_H
+
+extern basic_block transform_duplicate (basic_block bb, basic_block bb2);
+extern bool ignore_bb_p (const_basic_block bb);
+
+#endif /* GCC_TRACER_H */
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 49e22a9..6963acc 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -390,6 +390,7 @@ extern gimple_opt_pass *make_pass_tree_loop_done (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_ch (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_ch_vect (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_ccp (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_path_split (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_phi_only_cprop (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_build_ssa (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_build_alias (gcc::context *ctxt);
diff --git a/gcc/tree-ssa-path-split.c b/gcc/tree-ssa-path-split.c
new file mode 100644
index 0000000..9f61bd4
--- /dev/null
+++ b/gcc/tree-ssa-path-split.c
@@ -0,0 +1,275 @@
+/* Support routines for Path Splitting.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   Contributed by Ajit Kumar Agarwal <ajitkum@xilinx.com>.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "tree.h"
+#include "gimple.h"
+#include "tree-pass.h"
+#include "cfganal.h"
+#include "cfgloop.h"
+#include "gimple-iterator.h"
+#include "tracer.h"
+
+/* Given LATCH, the latch block in a loop, see if the shape of the
+   path reaching LATCH is suitable for path splitting.  If so, return
+   the block that will be duplicated into its predecessor paths.  Else
+   return NULL.  */
+
+static basic_block
+find_block_to_duplicate_for_path_splitting (basic_block latch)
+{
+  /* We should have simple latches at this point.  So the latch should
+     have a single successor.  This implies the predecessor of the latch
+     likely has the loop exit.  And it's that predecessor we're most
+     interested in.  To keep things simple, we're going to require that
+     the latch have a single predecessor too.  */
+  if (single_succ_p (latch) && single_pred_p (latch))
+    {
+      basic_block bb = get_immediate_dominator (CDI_DOMINATORS, latch);
+      gcc_assert (single_pred_edge (latch)->src == bb);
+
+      /* If BB has been marked as not to be duplicated, then honor that
+	 request.  */
+      if (ignore_bb_p (bb))
+	return NULL;
+
+      gimple *last = gsi_stmt (gsi_last_nondebug_bb (bb));
+      /* The immediate dominator of the latch must end in a conditional.  */
+      if (!last || gimple_code (last) != GIMPLE_COND)
+	return NULL;
+
+      /* We're hoping that BB is a join point for an IF-THEN-ELSE diamond
+	 region.  Verify that it is.
+
+	 First, verify that BB has two predecessors (each arm of the
+	 IF-THEN-ELSE) and two successors (the latch and exit).  */
+      if (EDGE_COUNT (bb->preds) == 2 && EDGE_COUNT (bb->succs) == 2)
+	{
+	  /* Now verify that BB's immediate dominator ends in a
+	     conditional as well.  */
+	  basic_block bb_idom = get_immediate_dominator (CDI_DOMINATORS, bb);
+	  gimple *last = gsi_stmt (gsi_last_nondebug_bb (bb_idom));
+	  if (!last || gimple_code (last) != GIMPLE_COND)
+	    return NULL;
+
+	  /* And that BB's immediate dominator's successors are the
+	     predecessors of BB.  */
+	  if (!find_edge (bb_idom, EDGE_PRED (bb, 0)->src)
+	      || !find_edge (bb_idom, EDGE_PRED (bb, 1)->src))
+	    return NULL;
+
+	  /* So at this point we have a simple diamond for an IF-THEN-ELSE
+	     construct starting at BB_IDOM, with a join point at BB.  BB
+	     passes control outside the loop or to the loop latch.
+
+	     We're going to want to create two duplicates of BB, one for
+	     each successor of BB_IDOM.  */
+	  return bb;
+	}
+    }
+  return NULL;
+}
+
+/* Return TRUE if BB is reasonable to duplicate, i.e. it has more than
+   one non-debug statement.  BB is the join block feeding a loop latch.
+
+   Should this use the same tests as we do for jump threading?  */
+
+static bool
+is_feasible_trace (basic_block bb)
+{
+  int num_stmt = 0;
+  gimple_stmt_iterator gsi;
+
+  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gimple *stmt = gsi_stmt (gsi);
+      if (!is_gimple_debug (stmt))
+	num_stmt++;
+    }
+
+  /* Require more than one statement; we may also want an upper limit.  */
+  if (num_stmt > 1)
+    return true;
+
+  return false;
+}
+
+/* If the immediate dominator of the latch of a loop is a join
+   block ending in a conditional branch, then that join block is
+   duplicated into each of its predecessor paths, preserving the
+   SSA semantics.
+
+   CFG before transformation.
+
+              2
+              |
+              |
+        +---->3
+        |    / \
+        |   /   \
+        |  4     5
+        |   \   /
+        |    \ /
+        |     6
+        |    / \
+        |   /   \
+        |  8     7
+        |  |     |
+        ---+     E
+    
+
+
+    Block 8 is the latch.  We're going to make copies of block 6 (9 & 10)
+    and wire things up so they look like this:
+
+              2
+              |
+              |
+        +---->3
+        |    / \
+        |   /   \
+        |  4     5
+        |  |     |
+        |  |     |
+        |  9    10
+        |  |\   /|
+        |  | \ / |
+        |  |  7  |
+        |  |  |  |
+        |  |  E  |
+        |  |     |
+        |   \   /
+        |    \ /
+        +-----8 
+
+
+    Blocks 9 and 10 will get merged into blocks 4 & 5 respectively which
+    enables CSE, DCE and other optimizations to occur on a larger block
+    of code.   */
+
+static bool
+perform_path_splitting ()
+{
+  bool changed = false;
+  loop_p loop;
+
+  loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS);
+  initialize_original_copy_tables ();
+  calculate_dominance_info (CDI_DOMINATORS);
+
+  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
+    {
+      /* See if there is a block that we can duplicate to split the
+	 path to the loop latch.  */
+      basic_block bb = find_block_to_duplicate_for_path_splitting (loop->latch);
+
+      /* BB is the merge point for an IF-THEN-ELSE we want to transform.
+
+	 Essentially we want to create two duplicates of BB and append
+	 a duplicate to the THEN and ELSE clauses.  This will split the
+	 path leading to the latch.  BB will be unreachable and removed.  */
+      if (bb && is_feasible_trace (bb))
+	{
+          if (dump_file && (dump_flags & TDF_DETAILS)) 
+            fprintf (dump_file,
+		     "Duplicating join block %d into predecessor paths\n",
+		     bb->index);
+	  basic_block pred0 = EDGE_PRED (bb, 0)->src;
+	  basic_block pred1 = EDGE_PRED (bb, 1)->src;
+	  transform_duplicate (pred0, bb);
+	  transform_duplicate (pred1, bb);
+	  changed = true;
+	}
+    }
+
+  loop_optimizer_finalize ();
+  free_original_copy_tables ();
+  return changed;
+}
+
+/* Main entry point for path splitting.  Returns TODO_cleanup_cfg if any
+   paths were split, otherwise returns zero.  */
+
+static unsigned int
+execute_path_split (void)
+{
+  /* If we don't have at least 2 real blocks and backedges in the
+     CFG, then there's no point in trying to perform path splitting.  */
+  if (n_basic_blocks_for_fn (cfun) <= NUM_FIXED_BLOCKS + 1
+      || !mark_dfs_back_edges ())
+    return 0;
+
+  bool changed = perform_path_splitting();
+  if (changed)
+    {
+      free_dominance_info (CDI_DOMINATORS);
+      /* If we changed the CFG, schedule loops for fixup by cleanup_cfg.  */
+      if (current_loops)
+	loops_state_set (LOOPS_NEED_FIXUP);
+    }
+
+  return changed ? TODO_cleanup_cfg : 0;
+}
+
+static bool
+gate_path_split (void)
+{
+  return flag_tree_path_split;
+}
+
+namespace {
+
+const pass_data pass_data_path_split =
+{
+  GIMPLE_PASS, /* type */
+  "path-split", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_TREE_PATH_SPLIT, /* tv_id */
+  PROP_ssa, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_update_ssa, /* todo_flags_finish */
+};
+
+class pass_path_split : public gimple_opt_pass
+{
+   public:
+    pass_path_split (gcc::context *ctxt)
+      : gimple_opt_pass (pass_data_path_split, ctxt)
+    {}
+   /* opt_pass methods: */
+   opt_pass * clone () { return new pass_path_split (m_ctxt); }
+   virtual bool gate (function *) { return gate_path_split (); }
+   virtual unsigned int execute (function *) { return execute_path_split (); }
+
+}; // class pass_path_split
+
+} // anon namespace
+
+gimple_opt_pass *
+make_pass_path_split (gcc::context *ctxt)
+{
+  return new pass_path_split (ctxt);
+}

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-11-12 21:58                           ` Jeff Law
@ 2015-11-13 10:13                             ` Richard Biener
  2015-11-13 16:26                               ` Jeff Law
  2015-11-13 13:19                             ` Ajit Kumar Agarwal
  1 sibling, 1 reply; 72+ messages in thread
From: Richard Biener @ 2015-11-13 10:13 UTC (permalink / raw)
  To: Jeff Law
  Cc: Ajit Kumar Agarwal, GCC Patches, Vinod Kathail,
	Shail Aditya Gupta, Vidhumouli Hunsigida, Nagaraju Mekala

On Thu, Nov 12, 2015 at 10:57 PM, Jeff Law <law@redhat.com> wrote:
> On 11/12/2015 11:32 AM, Jeff Law wrote:
>>
>> On 11/12/2015 10:05 AM, Jeff Law wrote:
>>>>
>>>> But IIRC you mentioned it should enable vectorization or so?  In this
>>>> case
>>>> that's obviously too late.
>>>
>>> The opposite.  Path splitting interferes with if-conversion &
>>> vectorization.  Path splitting mucks up the CFG enough that
>>> if-conversion won't fire and as a result vectorization is inhibited.  It
>>> also creates multi-latch loops, which isn't a great situation either.
>>>
>>> It *may* be the case that dropping it that far down in the pipeline and
>>> making the modifications necessary to handle simple latches may in turn
>>> make the path splitting code play better with if-conversion and
>>> vectorization and avoid creation of multi-latch loops.  At least that's
>>> how it looks on paper when I draw out the CFG manipulations.
>>>
>>> I'll do some experiments.
>>
>> It doesn't look too terrible to revamp the recognition code to work
>> later in the pipeline with simple latches.  Sadly that doesn't seem to
>> have fixed the bad interactions with if-conversion.
>>
>> *But* that does open up the possibility of moving the path splitting
>> pass even deeper in the pipeline -- in particular we can move it past
>> the vectorizer.  Which may be a win.
>>
>> So the big question is whether or not we'll still see enough benefits
>> from having it so late in the pipeline.  It's still early enough that we
>> get DOM, VRP, reassoc, forwprop, phiopt, etc.
>>
>> Ajit, I'll pass along an updated patch after doing some more testing.
>
> So here's what I'm working with.  It runs after the vectorizer now.
>
> Ajit, if you could benchmark this it would be greatly appreciated.  I know
> you saw significant improvements on one or more benchmarks in the past.
> It'd be good to know that the updated placement of the pass doesn't
> invalidate the gains you saw.
>
> With the updated pass placement, we don't have to worry about switching the
> pass on/off based on whether or not the vectorizer & if-conversion are
> enabled.  So that hackery is gone.
>
> I think I've beefed up the test to identify the diamond patterns we want so
> that it's stricter in what we accept.  The call to ignore_bb_p is a part of
> that test so that we're actually looking at the right block in a world where
> we're doing this transformation with simple latches.
>
> I've also put a graphical comment before perform_path_splitting which
> hopefully shows the CFG transformation we're making a bit clearer.
>
> This bootstraps and regression tests cleanly on x86_64-linux-gnu.
>
>
>
> diff --git a/gcc/Makefile.in b/gcc/Makefile.in
> index 34d2356..6613e83 100644
> --- a/gcc/Makefile.in
> +++ b/gcc/Makefile.in
> @@ -1474,6 +1474,7 @@ OBJS = \
>         tree-ssa-loop.o \
>         tree-ssa-math-opts.o \
>         tree-ssa-operands.o \
> +       tree-ssa-path-split.o \

gimple-ssa-path-split please.

>         tree-ssa-phionlycprop.o \
>         tree-ssa-phiopt.o \
>         tree-ssa-phiprop.o \
> diff --git a/gcc/common.opt b/gcc/common.opt
> index 757ce85..3e946ca 100644
> --- a/gcc/common.opt
> +++ b/gcc/common.opt
> @@ -2403,6 +2403,10 @@ ftree-vrp
>  Common Report Var(flag_tree_vrp) Init(0) Optimization
>  Perform Value Range Propagation on trees.
>
> +ftree-path-split

fsplit-paths

> +Common Report Var(flag_tree_path_split) Init(0) Optimization
> +Perform Path Splitting on trees for loop backedges.
> +
>  funit-at-a-time
>  Common Report Var(flag_unit_at_a_time) Init(1)
>  Compile whole compilation unit at a time.
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 213a9d0..b1e95da 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -354,6 +354,7 @@ Objective-C and Objective-C++ Dialects}.
>  -fdump-tree-fre@r{[}-@var{n}@r{]} @gol
>  -fdump-tree-vtable-verify @gol
>  -fdump-tree-vrp@r{[}-@var{n}@r{]} @gol
> +-fdump-tree-path-split@r{[}-@var{n}@r{]} @gol
>  -fdump-tree-storeccp@r{[}-@var{n}@r{]} @gol
>  -fdump-final-insns=@var{file} @gol
>  -fcompare-debug@r{[}=@var{opts}@r{]}  -fcompare-debug-second @gol
> @@ -462,7 +463,7 @@ Objective-C and Objective-C++ Dialects}.
>  -ftree-parallelize-loops=@var{n} -ftree-pre -ftree-partial-pre -ftree-pta
> @gol
>  -ftree-reassoc -ftree-sink -ftree-slsr -ftree-sra @gol
>  -ftree-switch-conversion -ftree-tail-merge -ftree-ter @gol
> --ftree-vectorize -ftree-vrp @gol
> +-ftree-vectorize -ftree-vrp @gol -ftree-path-split @gol
>  -funit-at-a-time -funroll-all-loops -funroll-loops @gol
>  -funsafe-loop-optimizations -funsafe-math-optimizations -funswitch-loops
> @gol
>  -fipa-ra -fvariable-expansion-in-unroller -fvect-cost-model -fvpt @gol
> @@ -7169,6 +7170,11 @@ output on to @file{stderr}. If two conflicting dump
> filenames are
>  given for the same pass, then the latter option overrides the earlier
>  one.
>
> +@item path-split
> +@opindex fdump-tree-path-split
> +Dump each function after path splitting.  The file name is made by
> +appending @file{.path-split} to the source file name.
> +
>  @item all
>  Turn on all options, except @option{raw}, @option{slim}, @option{verbose}
>  and @option{lineno}.
> @@ -7811,6 +7817,7 @@ also turns on the following optimization flags:
>  -ftree-switch-conversion -ftree-tail-merge @gol
>  -ftree-pre @gol
>  -ftree-vrp @gol
> +-ftree-path-split @gol
>  -fipa-ra}
>
>  Please note the warning under @option{-fgcse} about
> @@ -8819,7 +8826,7 @@ currently enabled, but may be enabled by @option{-O2}
> in the future.
>
>  @item -ftree-sink
>  @opindex ftree-sink
> -Perform forward store motion  on trees.  This flag is
> +Perform forward store motion on trees.  This flag is
>  enabled by default at @option{-O} and higher.
>
>  @item -ftree-bit-ccp
> @@ -9125,6 +9132,13 @@ enabled by default at @option{-O2} and higher.  Null
> pointer check
>  elimination is only done if @option{-fdelete-null-pointer-checks} is
>  enabled.
>
> +@item -ftree-path-split
> +@opindex ftree-path-split
> +Perform Path Splitting on trees.  When the two execution paths of an
> +if-then-else merge at the loop latch node, try to duplicate the
> +merge node into two paths.  This is enabled by default at @option{-O2}
> +and above.
> +

I think if we go into the detail of the transform we should mention the
effective result (creating a loop nest with disambiguation figuring out
which is the "better" inner loop).

>  @item -fsplit-ivs-in-unroller
>  @opindex fsplit-ivs-in-unroller
>  Enables expression of values of induction variables in later iterations
> diff --git a/gcc/opts.c b/gcc/opts.c
> index 9a3fbb3..9a0b27c 100644
> --- a/gcc/opts.c
> +++ b/gcc/opts.c
> @@ -509,6 +509,7 @@ static const struct default_options
> default_options_table[] =
>      { OPT_LEVELS_2_PLUS, OPT_fisolate_erroneous_paths_dereference, NULL, 1
> },
>      { OPT_LEVELS_2_PLUS, OPT_fipa_ra, NULL, 1 },
>      { OPT_LEVELS_2_PLUS, OPT_flra_remat, NULL, 1 },
> +    { OPT_LEVELS_2_PLUS, OPT_ftree_path_split, NULL, 1 },

Is this transform a good idea for -Os?

>
>      /* -O3 optimizations.  */
>      { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 },
> diff --git a/gcc/passes.def b/gcc/passes.def
> index c0ab6b9..4c9ef5f 100644
> --- a/gcc/passes.def
> +++ b/gcc/passes.def
> @@ -274,6 +274,7 @@ along with GCC; see the file COPYING3.  If not see
>        POP_INSERT_PASSES ()
>        NEXT_PASS (pass_simduid_cleanup);
>        NEXT_PASS (pass_lower_vector_ssa);
> +      NEXT_PASS (pass_path_split);
>        NEXT_PASS (pass_cse_reciprocals);
>        NEXT_PASS (pass_reassoc);
>        NEXT_PASS (pass_strength_reduction);
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/path-split-1.c
> b/gcc/testsuite/gcc.dg/tree-ssa/path-split-1.c
> new file mode 100644
> index 0000000..c7e9515
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/path-split-1.c
> @@ -0,0 +1,67 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fdump-tree-path-split-details " } */
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +
> +#define RGBMAX 255
> +
> +int
> +test()
> +{
> +  int i, Pels;
> +  unsigned char sum = 0;
> +  unsigned char xr, xg, xb;
> +  unsigned char xc, xm, xy, xk;
> +  unsigned char *ReadPtr, *EritePtr;
> +
> +  ReadPtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
> +  EritePtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
> +
> +  for (i = 0; i < 100;i++)
> +     {
> +       ReadPtr[i] = 100 - i;
> +     }
> +
> +  for (i = 0; i < 100; i++)
> +     {
> +       xr = *ReadPtr++;
> +       xg = *ReadPtr++;
> +       xb = *ReadPtr++;
> +
> +       xc = (unsigned char) (RGBMAX - xr);
> +       xm = (unsigned char) (RGBMAX - xg);
> +       xy = (unsigned char) (RGBMAX - xb);
> +
> +       if (xc < xm)
> +         {
> +           xk = (unsigned char) (xc < xy ? xc : xy);
> +         }
> +       else
> +        {
> +          xk = (unsigned char) (xm < xy ? xm : xy);
> +        }
> +
> +       xc = (unsigned char) (xc - xk);
> +       xm = (unsigned char) (xm - xk);
> +       xy = (unsigned char) (xy - xk);
> +
> +       *EritePtr++ = xc;
> +       *EritePtr++ = xm;
> +       *EritePtr++ = xy;
> +       *EritePtr++ = xk;
> +       sum += *EritePtr;
> +    }
> +  return sum;
> +}
> +
> +int
> +main()
> +{
> +  if (test() != 33)
> +    abort();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump "Duplicating join block" "path-split" } } */
> diff --git a/gcc/timevar.def b/gcc/timevar.def
> index b429faf..3dba68b 100644
> --- a/gcc/timevar.def
> +++ b/gcc/timevar.def
> @@ -300,3 +300,4 @@ DEFTIMEVAR (TV_LINK              , "link JIT code")
>  DEFTIMEVAR (TV_LOAD                 , "load JIT result")
>  DEFTIMEVAR (TV_JIT_ACQUIRING_MUTEX   , "acquiring JIT mutex")
>  DEFTIMEVAR (TV_JIT_CLIENT_CODE   , "JIT client code")
> +DEFTIMEVAR (TV_TREE_PATH_SPLIT  , "tree path split")
> diff --git a/gcc/tracer.c b/gcc/tracer.c
> index 941dc20..c7b5150 100644
> --- a/gcc/tracer.c
> +++ b/gcc/tracer.c
> @@ -51,9 +51,9 @@
>  #include "tree-inline.h"
>  #include "cfgloop.h"
>  #include "fibonacci_heap.h"
> +#include "tracer.h"
>
>  static int count_insns (basic_block);
> -static bool ignore_bb_p (const_basic_block);
>  static bool better_p (const_edge, const_edge);
>  static edge find_best_successor (basic_block);
>  static edge find_best_predecessor (basic_block);
> @@ -85,7 +85,7 @@ bb_seen_p (basic_block bb)
>  }
>
>  /* Return true if we should ignore the basic block for purposes of tracing.
> */
> -static bool
> +bool
>  ignore_bb_p (const_basic_block bb)
>  {
>    if (bb->index < NUM_FIXED_BLOCKS)
> @@ -226,6 +226,24 @@ find_trace (basic_block bb, basic_block *trace)
>    return i;
>  }
>
> +/* Duplicate block BB2, placing it after BB in the CFG.  Return the
> +   newly created block.  */
> +basic_block
> +transform_duplicate (basic_block bb, basic_block bb2)
> +{
> +  edge e;
> +  basic_block copy;
> +
> +  e = find_edge (bb, bb2);
> +
> +  copy = duplicate_block (bb2, e, bb);
> +  flush_pending_stmts (e);
> +
> +  add_phi_args_after_copy (&copy, 1, NULL);
> +
> +  return (copy);
> +}
> +
>  /* Look for basic blocks in frequency order, construct traces and tail
> duplicate
>     if profitable.  */
>
> @@ -321,17 +339,8 @@ tail_duplicate (void)
>                  entries or at least rotate the loop.  */
>               && bb2->loop_father->header != bb2)
>             {
> -             edge e;
> -             basic_block copy;
> -
> -             nduplicated += counts [bb2->index];
> -
> -             e = find_edge (bb, bb2);
> -
> -             copy = duplicate_block (bb2, e, bb);
> -             flush_pending_stmts (e);
> -
> -             add_phi_args_after_copy (&copy, 1, NULL);
> +              nduplicated += counts [bb2->index];
> +              basic_block copy = transform_duplicate (bb, bb2);
>
>               /* Reconsider the original copy of block we've duplicated.
>                  Removing the most common predecessor may make it to be
> diff --git a/gcc/tracer.h b/gcc/tracer.h
> new file mode 100644
> index 0000000..cd1792a
> --- /dev/null
> +++ b/gcc/tracer.h
> @@ -0,0 +1,26 @@
> +/* Header file for Tracer.
> +   Copyright (C) 2015 Free Software Foundation, Inc.
> +
> +This file is part of GCC.
> +
> +GCC is free software; you can redistribute it and/or modify it under
> +the terms of the GNU General Public License as published by the Free
> +Software Foundation; either version 3, or (at your option) any later
> +version.
> +
> +GCC is distributed in the hope that it will be useful, but WITHOUT ANY
> +WARRANTY; without even the implied warranty of MERCHANTABILITY or
> +FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
> + for more details.
> +
> +You should have received a copy of the GNU General Public License
> +along with GCC; see the file COPYING3.  If not see
> +<http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_TRACER_H
> +#define GCC_TRACER_H
> +
> +extern basic_block transform_duplicate (basic_block bb, basic_block bb2);
> +extern bool ignore_bb_p (const_basic_block bb);
> +
> +#endif /* GCC_TRACER_H */
> diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
> index 49e22a9..6963acc 100644
> --- a/gcc/tree-pass.h
> +++ b/gcc/tree-pass.h
> @@ -390,6 +390,7 @@ extern gimple_opt_pass *make_pass_tree_loop_done
> (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_ch (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_ch_vect (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_ccp (gcc::context *ctxt);
> +extern gimple_opt_pass *make_pass_path_split (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_phi_only_cprop (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_build_ssa (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_build_alias (gcc::context *ctxt);
> diff --git a/gcc/tree-ssa-path-split.c b/gcc/tree-ssa-path-split.c
> new file mode 100644
> index 0000000..9f61bd4
> --- /dev/null
> +++ b/gcc/tree-ssa-path-split.c
> @@ -0,0 +1,275 @@
> +/* Support routines for Path Splitting.
> +   Copyright (C) 2015 Free Software Foundation, Inc.
> +   Contributed by Ajit Kumar Agarwal <ajitkum@xilinx.com>.
> +
> + This file is part of GCC.
> +
> + GCC is free software; you can redistribute it and/or modify
> + it under the terms of the GNU General Public License as published by
> + the Free Software Foundation; either version 3, or (at your option)
> + any later version.
> +
> +GCC is distributed in the hope that it will be useful,
> +but WITHOUT ANY WARRANTY; without even the implied warranty of
> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +GNU General Public License for more details.
> +
> +You should have received a copy of the GNU General Public License
> +along with GCC; see the file COPYING3.  If not see
> +<http://www.gnu.org/licenses/>.  */
> +
> +#include "config.h"
> +#include "system.h"
> +#include "coretypes.h"
> +#include "backend.h"
> +#include "tree.h"
> +#include "gimple.h"
> +#include "tree-pass.h"
> +#include "cfganal.h"
> +#include "cfgloop.h"
> +#include "gimple-iterator.h"
> +#include "tracer.h"
> +
> +/* Given LATCH, the latch block in a loop, see if the shape of the
> +   path reaching LATCH is suitable for path splitting.  If so, return
> +   the block that will be duplicated into its predecessor paths.  Else
> +   return NULL.  */
> +
> +static basic_block
> +find_block_to_duplicate_for_path_splitting (basic_block latch)
> +{
> +  /* We should have simple latches at this point.  So the latch should
> +     have a single successor.  This implies the predecessor of the latch
> +     likely has the loop exit.  And it's that predecessor we're most
> +     interested in. To keep things simple, we're going to require that
> +     the latch have a single predecessor too.  */
> +  if (single_succ_p (latch) && single_pred_p (latch))
> +    {
> +      basic_block bb = get_immediate_dominator (CDI_DOMINATORS, latch);
> +      gcc_assert (single_pred_edge (latch)->src == bb);
> +
> +      /* If BB has been marked as not to be duplicated, then honor that
> +        request.  */
> +      if (ignore_bb_p (bb))
> +       return NULL;
> +
> +      gimple *last = gsi_stmt (gsi_last_nondebug_bb (bb));
> +      /* The immediate dominator of the latch must end in a conditional.
> */
> +      if (!last || gimple_code (last) != GIMPLE_COND)
> +       return NULL;
> +
> +      /* We're hoping that BB is a join point for an IF-THEN-ELSE diamond
> +        region.  Verify that it is.
> +
> +        First, verify that BB has two predecessors (each arm of the
> +        IF-THEN-ELSE) and two successors (the latch and exit).  */
> +      if (EDGE_COUNT (bb->preds) == 2 && EDGE_COUNT (bb->succs) == 2)
> +       {
> +         /* Now verify that BB's immediate dominator ends in a
> +            conditional as well.  */
> +         basic_block bb_idom = get_immediate_dominator (CDI_DOMINATORS,
> bb);
> +         gimple *last = gsi_stmt (gsi_last_nondebug_bb (bb_idom));
> +         if (!last || gimple_code (last) != GIMPLE_COND)
> +           return NULL;
> +
> +         /* And that BB's immediate dominator's successors are the
> +            the predecessors of BB.  */
> +         if (!find_edge (bb_idom, EDGE_PRED (bb, 0)->src)
> +             || !find_edge (bb_idom, EDGE_PRED (bb, 1)->src))
> +           return NULL;
> +
> +         /* So at this point we have a simple diamond for an IF-THEN-ELSE
> +            construct starting at BB_IDOM, with a join point at BB.  BB
> +            pass control outside the loop or to the loop latch.
> +
> +            We're going to want to create two duplicates of BB, one for
> +            each successor of BB_IDOM.  */
> +         return bb;
> +       }
> +    }
> +  return NULL;
> +}
> +
> +/* Return TRUE if BB is a reasonable block to duplicate by examining
> +   its size, false otherwise.  BB will always be a loop latch block.
> +
> +   Should this use the same tests as we do for jump threading?  */
> +
> +static bool
> +is_feasible_trace (basic_block bb)
> +{
> +  int num_stmt = 0;
> +  gimple_stmt_iterator gsi;
> +
> +  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> +    {
> +      gimple *stmt = gsi_stmt (gsi);
> +      if (!is_gimple_debug (stmt))
> +       num_stmt++;
> +    }
> +
> +  /* We may want to limit how many statements we copy.  */
> +  if (num_stmt > 1)
> +    return true;
> +
> +  return false;
> +}
> +
> +/* If the immediate dominator of the latch of the loop is
> +   block with conditional branch, then the loop latch  is
> +   duplicated to its predecessors path preserving the SSA
> +   semantics.
> +
> +   CFG before transformation.
> +
> +              2
> +              |
> +              |
> +        +---->3
> +        |    / \
> +        |   /   \
> +        |  4     5
> +        |   \   /
> +        |    \ /
> +        |     6
> +        |    / \
> +        |   /   \
> +        |  8     7
> +        |  |     |
> +        ---+     E
> +
> +
> +
> +    Block 8 is the latch.  We're going to make copies of block 6 (9 & 10)
> +    and wire things up so they look like this:
> +
> +              2
> +              |
> +              |
> +        +---->3
> +        |    / \
> +        |   /   \
> +        |  4     5
> +        |  |     |
> +        |  |     |
> +        |  9    10
> +        |  |\   /|
> +        |  | \ / |
> +        |  |  7  |
> +        |  |  |  |
> +        |  |  E  |
> +        |  |     |
> +        |   \   /
> +        |    \ /
> +        +-----8
> +
> +
> +    Blocks 9 and 10 will get merged into blocks 4 & 5 respectively which
> +    enables CSE, DCE and other optimizations to occur on a larger block
> +    of code.   */
> +
> +static bool
> +perform_path_splitting ()
> +{
> +  bool changed = false;
> +  loop_p loop;
> +
> +  loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS);
> +  initialize_original_copy_tables ();
> +  calculate_dominance_info (CDI_DOMINATORS);
> +
> +  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
> +    {
> +      /* See if there is a block that we can duplicate to split the
> +        path to the loop latch.  */
> +      basic_block bb = find_block_to_duplicate_for_path_splitting
> (loop->latch);
> +
> +      /* BB is the merge point for an IF-THEN-ELSE we want to transform.
> +
> +        Essentially we want to create two duplicates of BB and append
> +        a duplicate to the THEN and ELSE clauses.  This will split the
> +        path leading to the latch.  BB will be unreachable and removed.  */
> +      if (bb && is_feasible_trace (bb))
> +       {
> +          if (dump_file && (dump_flags & TDF_DETAILS))
> +            fprintf (dump_file,
> +                    "Duplicating join block %d into predecessor paths\n",
> +                    bb->index);
> +         basic_block pred0 = EDGE_PRED (bb, 0)->src;
> +         basic_block pred1 = EDGE_PRED (bb, 1)->src;
> +         transform_duplicate (pred0, bb);
> +         transform_duplicate (pred1, bb);
> +         changed = true;
> +       }
> +    }
> +
> +  loop_optimizer_finalize ();
> +  free_original_copy_tables ();
> +  return changed;
> +}
> +
> +/* Main entry point for path splitting.  Returns TODO_cleanup_cfg if any
> +   paths where split, otherwise return zero.  */
> +
> +static unsigned int
> +execute_path_split (void)
> +{
> +  /* If we don't have at least 2 real blocks and backedges in the
> +     CFG, then there's no point in trying to perform path splitting.  */
> +  if (n_basic_blocks_for_fn (cfun) <= NUM_FIXED_BLOCKS + 1
> +      || !mark_dfs_back_edges ())
> +    return 0;
> +
> +  bool changed = perform_path_splitting();
> +  if (changed)
> +    {
> +      free_dominance_info (CDI_DOMINATORS);
> +      /* If we changed the CFG schedule loops for fixup by cleanup_cfg.  */
> +      if (current_loops)
> +       loops_state_set (LOOPS_NEED_FIXUP);
> +    }
> +
> +  return changed ? TODO_cleanup_cfg : 0;
> +}
> +
> +static bool
> +gate_path_split (void)
> +{
> +  return flag_tree_path_split;
> +}
> +
> +namespace {
> +
> +const pass_data pass_data_path_split =
> +{
> +  GIMPLE_PASS, /* type */
> +  "path-split", /* name */
> +  OPTGROUP_NONE, /* optinfo_flags */
> +  TV_TREE_PATH_SPLIT, /* tv_id */
> +  PROP_ssa, /* properties_required */
> +  0, /* properties_provided */
> +  0, /* properties_destroyed */
> +  0, /* todo_flags_start */
> +  TODO_update_ssa, /* todo_flags_finish */
> +};
> +
> +class pass_path_split : public gimple_opt_pass
> +{
> +   public:
> +    pass_path_split (gcc::context *ctxt)
> +      : gimple_opt_pass (pass_data_path_split, ctxt)
> +    {}
> +   /* opt_pass methods: */
> +   opt_pass * clone () { return new pass_path_split (m_ctxt); }
> +   virtual bool gate (function *) { return gate_path_split (); }
> +   virtual unsigned int execute (function *) { return execute_path_split
> (); }
> +
> +}; // class pass_path_split
> +
> +} // anon namespace
> +
> +gimple_opt_pass *
> +make_pass_path_split (gcc::context *ctxt)
> +{
> +  return new pass_path_split (ctxt);
> +}
>

^ permalink raw reply	[flat|nested] 72+ messages in thread

* RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-11-12 21:58                           ` Jeff Law
  2015-11-13 10:13                             ` Richard Biener
@ 2015-11-13 13:19                             ` Ajit Kumar Agarwal
  1 sibling, 0 replies; 72+ messages in thread
From: Ajit Kumar Agarwal @ 2015-11-13 13:19 UTC (permalink / raw)
  To: Jeff Law, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala



-----Original Message-----
From: Jeff Law [mailto:law@redhat.com] 
Sent: Friday, November 13, 2015 3:28 AM
To: Richard Biener
Cc: Ajit Kumar Agarwal; GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation

On 11/12/2015 11:32 AM, Jeff Law wrote:
> On 11/12/2015 10:05 AM, Jeff Law wrote:
>>> But IIRC you mentioned it should enable vectorization or so?  In 
>>> this case that's obviously too late.
>> The opposite.  Path splitting interferes with if-conversion & 
>> vectorization.  Path splitting mucks up the CFG enough that 
>> if-conversion won't fire and as a result vectorization is inhibited.  
>> It also creates multi-latch loops, which isn't a great situation either.
>>
>> It *may* be the case that dropping it that far down in the pipeline 
>> and making the modifications necessary to handle simple latches may 
>> in turn make the path splitting code play better with if-conversion 
>> and vectorization and avoid creation of multi-latch loops.  At least 
>> that's how it looks on paper when I draw out the CFG manipulations.
>>
>> I'll do some experiments.
> It doesn't look too terrible to revamp the recognition code to work 
> later in the pipeline with simple latches.  Sadly that doesn't seem to 
> have fixed the bad interactions with if-conversion.
>
> *But* that does open up the possibility of moving the path splitting 
> pass even deeper in the pipeline -- in particular we can move it past 
> the vectorizer.  Which may be a win.
>
> So the big question is whether or not we'll still see enough benefits 
> from having it so late in the pipeline.  It's still early enough that 
> we get DOM, VRP, reassoc, forwprop, phiopt, etc.
>
> Ajit, I'll pass along an updated patch after doing some more testing.

Hello Jeff:
>>So here's what I'm working with.  It runs after the vectorizer now.

>>Ajit, if you could benchmark this it would be greatly appreciated.  I know you saw significant improvements on one or more benchmarks in the past.  It'd be good to know that the updated placement of the pass doesn't invalidate the gains you saw.

>>With the updated pass placement, we don't have to worry about switching the pass on/off based on whether or not the vectorizer & if-conversion are enabled.  So that hackery is gone.

>>I think I've beefed up the test to identify the diamond patterns we want so that it's stricter in what we accept.  The call to ignore_bb_p is a part of that test so that we're actually looking at the right block in a world where we're doing this transformation with simple latches.

>>I've also put a graphical comment before perform_path_splitting which hopefully shows the CFG transformation we're making a bit clearer.

>>This bootstraps and regression tests cleanly on x86_64-linux-gnu.

Thank you for the inputs. I will build the compiler, run the SPEC CPU 2000 benchmarks for the x86 target, and respond as soon as the run is done.
I will also run the EEMBC/Mibench benchmarks for the Microblaze target.
 
I will let you know the results at the earliest.

Thanks & Regards
Ajit


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-11-13 10:13                             ` Richard Biener
@ 2015-11-13 16:26                               ` Jeff Law
  2015-11-13 18:09                                 ` Richard Biener
  0 siblings, 1 reply; 72+ messages in thread
From: Jeff Law @ 2015-11-13 16:26 UTC (permalink / raw)
  To: Richard Biener
  Cc: Ajit Kumar Agarwal, GCC Patches, Vinod Kathail,
	Shail Aditya Gupta, Vidhumouli Hunsigida, Nagaraju Mekala

On 11/13/2015 03:13 AM, Richard Biener wrote:

>> diff --git a/gcc/Makefile.in b/gcc/Makefile.in
>> index 34d2356..6613e83 100644
>> --- a/gcc/Makefile.in
>> +++ b/gcc/Makefile.in
>> @@ -1474,6 +1474,7 @@ OBJS = \
>>          tree-ssa-loop.o \
>>          tree-ssa-math-opts.o \
>>          tree-ssa-operands.o \
>> +       tree-ssa-path-split.o \
>
> gimple-ssa-path-split please.
Agreed.   I'll make that change for Ajit.


>
>>          tree-ssa-phionlycprop.o \
>>          tree-ssa-phiopt.o \
>>          tree-ssa-phiprop.o \
>> diff --git a/gcc/common.opt b/gcc/common.opt
>> index 757ce85..3e946ca 100644
>> --- a/gcc/common.opt
>> +++ b/gcc/common.opt
>> @@ -2403,6 +2403,10 @@ ftree-vrp
>>   Common Report Var(flag_tree_vrp) Init(0) Optimization
>>   Perform Value Range Propagation on trees.
>>
>> +ftree-path-split
>
> fsplit-paths
And this plus related variable name fixes and such.


>>
>> +@item -ftree-path-split
>> +@opindex ftree-path-split
>> +Perform Path Splitting on trees.  When the two execution paths of a
>> +if-then-else merge at the loop latch node, try to duplicate the
>> +merge node into two paths. This is enabled by default at @option{-O2}
>> +and above.
>> +
>
> I think if we go into the detail of the transform we should mention the
> effective result (creating a loop nest with disambiguation figuring out
> which is the "better" inner loop).
It no longer creates a loop nest.  The overall shape of the CFG is 
maintained.  ie, we still have a single simple latch for the loop.  The 
blocks we create are internal to the loop.

I always struggle with the right level at which to document these 
options.   I'll take a look at this for Ajit.

BTW Do we have an API for indicating that new blocks have been added to 
a loop?  If so, then we can likely drop the LOOPS_NEED_FIXUP.


>
>>   @item -fsplit-ivs-in-unroller
>>   @opindex fsplit-ivs-in-unroller
>>   Enables expression of values of induction variables in later iterations
>> diff --git a/gcc/opts.c b/gcc/opts.c
>> index 9a3fbb3..9a0b27c 100644
>> --- a/gcc/opts.c
>> +++ b/gcc/opts.c
>> @@ -509,6 +509,7 @@ static const struct default_options
>> default_options_table[] =
>>       { OPT_LEVELS_2_PLUS, OPT_fisolate_erroneous_paths_dereference, NULL, 1
>> },
>>       { OPT_LEVELS_2_PLUS, OPT_fipa_ra, NULL, 1 },
>>       { OPT_LEVELS_2_PLUS, OPT_flra_remat, NULL, 1 },
>> +    { OPT_LEVELS_2_PLUS, OPT_ftree_path_split, NULL, 1 },
>
> Is this transform a good idea for -Os?
In general, no because of the block duplication.

jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-11-13 16:26                               ` Jeff Law
@ 2015-11-13 18:09                                 ` Richard Biener
  2015-11-13 20:23                                   ` Jeff Law
  0 siblings, 1 reply; 72+ messages in thread
From: Richard Biener @ 2015-11-13 18:09 UTC (permalink / raw)
  To: Jeff Law
  Cc: Ajit Kumar Agarwal, GCC Patches, Vinod Kathail,
	Shail Aditya Gupta, Vidhumouli Hunsigida, Nagaraju Mekala

On November 13, 2015 5:26:01 PM GMT+01:00, Jeff Law <law@redhat.com> wrote:
>On 11/13/2015 03:13 AM, Richard Biener wrote:
>
>>> diff --git a/gcc/Makefile.in b/gcc/Makefile.in
>>> index 34d2356..6613e83 100644
>>> --- a/gcc/Makefile.in
>>> +++ b/gcc/Makefile.in
>>> @@ -1474,6 +1474,7 @@ OBJS = \
>>>          tree-ssa-loop.o \
>>>          tree-ssa-math-opts.o \
>>>          tree-ssa-operands.o \
>>> +       tree-ssa-path-split.o \
>>
>> gimple-ssa-path-split please.
>Agreed.   I'll make that change for Ajit.
>
>
>>
>>>          tree-ssa-phionlycprop.o \
>>>          tree-ssa-phiopt.o \
>>>          tree-ssa-phiprop.o \
>>> diff --git a/gcc/common.opt b/gcc/common.opt
>>> index 757ce85..3e946ca 100644
>>> --- a/gcc/common.opt
>>> +++ b/gcc/common.opt
>>> @@ -2403,6 +2403,10 @@ ftree-vrp
>>>   Common Report Var(flag_tree_vrp) Init(0) Optimization
>>>   Perform Value Range Propagation on trees.
>>>
>>> +ftree-path-split
>>
>> fsplit-paths
>And this plus related variable name fixes and such.
>
>
>>>
>>> +@item -ftree-path-split
>>> +@opindex ftree-path-split
>>> +Perform Path Splitting on trees.  When the two execution paths of a
>>> +if-then-else merge at the loop latch node, try to duplicate the
>>> +merge node into two paths. This is enabled by default at
>@option{-O2}
>>> +and above.
>>> +
>>
>> I think if we go into the detail of the transform we should mention
>the
>> effective result (creating a loop nest with disambiguation figuring
>out
>> which is the "better" inner loop).
>It no longer creates a loop nest.  The overall shape of the CFG is 
>maintained.  ie, we still have a single simple latch for the loop.  The
>
>blocks we create are internal to the loop.
>
>I always struggle with the right level at which to document these 
>options.   I'll take a look at this for Ajit.
>
>BTW Do we have an API for indicating that new blocks have been added to
>
>a loop?  If so, then we can likely drop the LOOPS_NEED_FIXUP.

Please. It's called add_to_loop or so.

Richard.

>
>>
>>>   @item -fsplit-ivs-in-unroller
>>>   @opindex fsplit-ivs-in-unroller
>>>   Enables expression of values of induction variables in later
>iterations
>>> diff --git a/gcc/opts.c b/gcc/opts.c
>>> index 9a3fbb3..9a0b27c 100644
>>> --- a/gcc/opts.c
>>> +++ b/gcc/opts.c
>>> @@ -509,6 +509,7 @@ static const struct default_options
>>> default_options_table[] =
>>>       { OPT_LEVELS_2_PLUS, OPT_fisolate_erroneous_paths_dereference,
>NULL, 1
>>> },
>>>       { OPT_LEVELS_2_PLUS, OPT_fipa_ra, NULL, 1 },
>>>       { OPT_LEVELS_2_PLUS, OPT_flra_remat, NULL, 1 },
>>> +    { OPT_LEVELS_2_PLUS, OPT_ftree_path_split, NULL, 1 },
>>
>> Is this transform a good idea for -Os?
>In general, no because of the block duplication.
>
>jeff


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-11-13 18:09                                 ` Richard Biener
@ 2015-11-13 20:23                                   ` Jeff Law
  2015-11-13 23:36                                     ` Jeff Law
  0 siblings, 1 reply; 72+ messages in thread
From: Jeff Law @ 2015-11-13 20:23 UTC (permalink / raw)
  To: Richard Biener
  Cc: Ajit Kumar Agarwal, GCC Patches, Vinod Kathail,
	Shail Aditya Gupta, Vidhumouli Hunsigida, Nagaraju Mekala

On 11/13/2015 11:09 AM, Richard Biener wrote:

>>
>> BTW Do we have an API for indicating that new blocks have been added to
>>
>> a loop?  If so, then we can likely drop the LOOPS_NEED_FIXUP.
>
> Please. It's called add_to_loop or so.
Haha, the block duplication code was handling this already.  So in 
theory I can just drop the LOOPS_NEED_FIXUP completely.  Testing now.

jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-11-13 20:23                                   ` Jeff Law
@ 2015-11-13 23:36                                     ` Jeff Law
  2015-11-18  7:44                                       ` Tom de Vries
  2015-12-03 14:38                                       ` Richard Biener
  0 siblings, 2 replies; 72+ messages in thread
From: Jeff Law @ 2015-11-13 23:36 UTC (permalink / raw)
  To: Richard Biener
  Cc: Ajit Kumar Agarwal, GCC Patches, Vinod Kathail,
	Shail Aditya Gupta, Vidhumouli Hunsigida, Nagaraju Mekala

[-- Attachment #1: Type: text/plain, Size: 1087 bytes --]

On 11/13/2015 01:23 PM, Jeff Law wrote:
> On 11/13/2015 11:09 AM, Richard Biener wrote:
>
>>>
>>> BTW Do we have an API for indicating that new blocks have been added to
>>>
>>> a loop?  If so, then we can likely drop the LOOPS_NEED_FIXUP.
>>
>> Please. It's called add_to_loop or so.
> Haha, the block duplication code was handling this already.  So in
> theory I can just drop the LOOPS_NEED_FIXUP completely.  Testing now.
>
> jeff
>
Attached is the committed patch for path splitting.  As noted above, we 
didn't need the LOOPS_NEED_FIXUP in the final version, so that wart is 
gone :-)

I do find myself wondering if this can/should be generalized beyond just 
paths heading to loop backedges.  However to do so I think we'd need to 
be able to undo this transformation reliably and we'd need some 
heuristics when to duplicate to expose the redundancy vs rely on PRE 
techniques and jump threading.  I vaguely remember a paper which touched 
on these topics, but I can't seem to find it.

Anyway, bootstrapped and regression tested on x86_64-linux-gnu. 
Installed on the trunk.




[-- Attachment #2: patch --]
[-- Type: text/plain, Size: 20818 bytes --]

commit c1891376e5dcc99ad8be2d22f9551c03f9bb2729
Author: Jeff Law <law@redhat.com>
Date:   Fri Nov 13 16:29:34 2015 -0700

    [Patch,tree-optimization]: Add new path Splitting pass on tree ssa
    representation
    
    	* Makefile.in (OBJS): Add gimple-ssa-split-paths.o
    	* common.opt (-fsplit-paths): New flag controlling path splitting.
    	* doc/invoke.texi (fsplit-paths): Document.
    	* opts.c (default_options_table): Add -fsplit-paths to -O2.
    	* passes.def: Add split_paths pass.
    	* timevar.def (TV_SPLIT_PATHS): New timevar.
    	* tracer.c: Include "tracer.h"
    	(ignore_bb_p): No longer static.
    	(transform_duplicate): New function, broken out of tail_duplicate.
    	(tail_duplicate): Use transform_duplicate.
    	* tracer.h (ignore_bb_p): Declare
    	(transform_duplicate): Likewise.
    	* tree-pass.h (make_pass_split_paths): Declare.
    	* gimple-ssa-split-paths.c: New file.
    
    	* gcc.dg/tree-ssa/split-path-1.c: New test.

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index dde2695..a7abe37 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,21 @@
+2015-11-13  Ajit Agarwal  <ajitkum@xilinx.com>
+	    Jeff Law  <law@redhat.com>
+
+	* Makefile.in (OBJS): Add gimple-ssa-split-paths.o
+	* common.opt (-fsplit-paths): New flag controlling path splitting.
+	* doc/invoke.texi (fsplit-paths): Document.
+	* opts.c (default_options_table): Add -fsplit-paths to -O2.
+	* passes.def: Add split_paths pass.
+	* timevar.def (TV_SPLIT_PATHS): New timevar.
+	* tracer.c: Include "tracer.h"
+	(ignore_bb_p): No longer static.
+	(transform_duplicate): New function, broken out of tail_duplicate.
+	(tail_duplicate): Use transform_duplicate.
+	* tracer.h (ignore_bb_p): Declare
+	(transform_duplicate): Likewise.
+	* tree-pass.h (make_pass_split_paths): Declare.
+	* gimple-ssa-split-paths.c: New file.
+
 2015-11-13  Kai Tietz  <ktietz70@googlemail.com>
 	    Marek Polacek  <polacek@redhat.com>
 	    Jason Merrill  <jason@redhat.com>
diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index d3fd5e9..5c294df 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1277,6 +1277,7 @@ OBJS = \
 	gimple-pretty-print.o \
 	gimple-ssa-backprop.o \
 	gimple-ssa-isolate-paths.o \
+	gimple-ssa-split-paths.o \
 	gimple-ssa-strength-reduction.o \
 	gimple-streamer-in.o \
 	gimple-streamer-out.o \
diff --git a/gcc/common.opt b/gcc/common.opt
index 757ce85..3eb520e 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -2403,6 +2403,10 @@ ftree-vrp
 Common Report Var(flag_tree_vrp) Init(0) Optimization
 Perform Value Range Propagation on trees.
 
+fsplit-paths
+Common Report Var(flag_split_paths) Init(0) Optimization
+Split paths leading to loop backedges.
+
 funit-at-a-time
 Common Report Var(flag_unit_at_a_time) Init(1)
 Compile whole compilation unit at a time.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index c18df98..eeb79e6 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -354,6 +354,7 @@ Objective-C and Objective-C++ Dialects}.
 -fdump-tree-fre@r{[}-@var{n}@r{]} @gol
 -fdump-tree-vtable-verify @gol
 -fdump-tree-vrp@r{[}-@var{n}@r{]} @gol
+-fdump-tree-split-paths@r{[}-@var{n}@r{]} @gol
 -fdump-tree-storeccp@r{[}-@var{n}@r{]} @gol
 -fdump-final-insns=@var{file} @gol
 -fcompare-debug@r{[}=@var{opts}@r{]}  -fcompare-debug-second @gol
@@ -448,6 +449,7 @@ Objective-C and Objective-C++ Dialects}.
 -fsel-sched-pipelining -fsel-sched-pipelining-outer-loops @gol
 -fsemantic-interposition -fshrink-wrap -fsignaling-nans @gol
 -fsingle-precision-constant -fsplit-ivs-in-unroller @gol
+-fsplit-paths @gol
 -fsplit-wide-types -fssa-backprop -fssa-phiopt @gol
 -fstack-protector -fstack-protector-all -fstack-protector-strong @gol
 -fstack-protector-explicit -fstdarg-opt -fstrict-aliasing @gol
@@ -7171,6 +7173,11 @@ output on to @file{stderr}. If two conflicting dump filenames are
 given for the same pass, then the latter option overrides the earlier
 one.
 
+@item split-paths
+@opindex fdump-tree-split-paths
+Dump each function after splitting paths to loop backedges.  The file
+name is made by appending @file{.split-paths} to the source file name.
+
 @item all
 Turn on all options, except @option{raw}, @option{slim}, @option{verbose}
 and @option{lineno}.
@@ -7808,6 +7815,7 @@ also turns on the following optimization flags:
 -frerun-cse-after-loop  @gol
 -fsched-interblock  -fsched-spec @gol
 -fschedule-insns  -fschedule-insns2 @gol
+-fsplit-paths @gol
 -fstrict-aliasing -fstrict-overflow @gol
 -ftree-builtin-call-dce @gol
 -ftree-switch-conversion -ftree-tail-merge @gol
@@ -8821,7 +8829,7 @@ currently enabled, but may be enabled by @option{-O2} in the future.
 
 @item -ftree-sink
 @opindex ftree-sink
-Perform forward store motion  on trees.  This flag is
+Perform forward store motion on trees.  This flag is
 enabled by default at @option{-O} and higher.
 
 @item -ftree-bit-ccp
@@ -9127,6 +9135,12 @@ enabled by default at @option{-O2} and higher.  Null pointer check
 elimination is only done if @option{-fdelete-null-pointer-checks} is
 enabled.
 
+@item -fsplit-paths
+@opindex fsplit-paths
+Split paths leading to loop backedges.  This can improve dead code
+elimination and common subexpression elimination.  This is enabled by
+default at @option{-O2} and above.
+
 @item -fsplit-ivs-in-unroller
 @opindex fsplit-ivs-in-unroller
 Enables expression of values of induction variables in later iterations
diff --git a/gcc/gimple-ssa-split-paths.c b/gcc/gimple-ssa-split-paths.c
new file mode 100644
index 0000000..602e916
--- /dev/null
+++ b/gcc/gimple-ssa-split-paths.c
@@ -0,0 +1,270 @@
+/* Support routines for Splitting Paths to loop backedges
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   Contributed by Ajit Kumar Agarwal <ajitkum@xilinx.com>.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "tree.h"
+#include "gimple.h"
+#include "tree-pass.h"
+#include "cfganal.h"
+#include "cfgloop.h"
+#include "gimple-iterator.h"
+#include "tracer.h"
+
+/* Given LATCH, the latch block in a loop, see if the shape of the
+   path reaching LATCH is suitable for being split by duplication.
+   If so, return the block that will be duplicated into its predecessor
+   paths.  Else return NULL.  */
+
+static basic_block
+find_block_to_duplicate_for_splitting_paths (basic_block latch)
+{
+  /* We should have simple latches at this point.  So the latch should
+     have a single successor.  This implies the predecessor of the latch
+     likely has the loop exit.  And it's that predecessor we're most
+     interested in. To keep things simple, we're going to require that
+     the latch have a single predecessor too.  */
+  if (single_succ_p (latch) && single_pred_p (latch))
+    {
+      basic_block bb = get_immediate_dominator (CDI_DOMINATORS, latch);
+      gcc_assert (single_pred_edge (latch)->src == bb);
+
+      /* If BB has been marked as not to be duplicated, then honor that
+	 request.  */
+      if (ignore_bb_p (bb))
+	return NULL;
+
+      gimple *last = gsi_stmt (gsi_last_nondebug_bb (bb));
+      /* The immediate dominator of the latch must end in a conditional.  */
+      if (!last || gimple_code (last) != GIMPLE_COND)
+	return NULL;
+
+      /* We're hoping that BB is a join point for an IF-THEN-ELSE diamond
+	 region.  Verify that it is.
+
+	 First, verify that BB has two predecessors (each arm of the
+	 IF-THEN-ELSE) and two successors (the latch and exit).  */
+      if (EDGE_COUNT (bb->preds) == 2 && EDGE_COUNT (bb->succs) == 2)
+	{
+	  /* Now verify that BB's immediate dominator ends in a
+	     conditional as well.  */
+	  basic_block bb_idom = get_immediate_dominator (CDI_DOMINATORS, bb);
+	  gimple *last = gsi_stmt (gsi_last_nondebug_bb (bb_idom));
+	  if (!last || gimple_code (last) != GIMPLE_COND)
+	    return NULL;
+
+	  /* And that BB's immediate dominator's successors are the
+	     predecessors of BB.  */
+	  if (!find_edge (bb_idom, EDGE_PRED (bb, 0)->src)
+	      || !find_edge (bb_idom, EDGE_PRED (bb, 1)->src))
+	    return NULL;
+
+	  /* So at this point we have a simple diamond for an IF-THEN-ELSE
+	     construct starting at BB_IDOM, with a join point at BB.  BB
+	     passes control outside the loop or to the loop latch.
+
+	     We're going to want to create two duplicates of BB, one for
+	     each successor of BB_IDOM.  */
+	  return bb;
+	}
+    }
+  return NULL;
+}
+
+/* Return TRUE if BB is a reasonable block to duplicate by examining
+   its size, false otherwise.  BB is always the loop latch's predecessor.
+
+   Should this use the same tests as we do for jump threading?  */
+
+static bool
+is_feasible_trace (basic_block bb)
+{
+  int num_stmt = 0;
+  gimple_stmt_iterator gsi;
+
+  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gimple *stmt = gsi_stmt (gsi);
+      if (!is_gimple_debug (stmt))
+	num_stmt++;
+    }
+
+  /* We may want to limit how many statements we copy.  */
+  if (num_stmt > 1)
+    return true;
+
+  return false;
+}
+
+/* If the immediate dominator of the latch of the loop is a
+   block with a conditional branch, then that block is
+   duplicated into its predecessors' paths, preserving the SSA
+   semantics.
+
+   CFG before transformation.
+
+              2
+              |
+              |
+        +---->3
+        |    / \
+        |   /   \
+        |  4     5
+        |   \   /
+        |    \ /
+        |     6
+        |    / \
+        |   /   \
+        |  8     7
+        |  |     |
+        ---+     E
+
+
+
+    Block 8 is the latch.  We're going to make copies of block 6 (9 & 10)
+    and wire things up so they look like this:
+
+              2
+              |
+              |
+        +---->3
+        |    / \
+        |   /   \
+        |  4     5
+        |  |     |
+        |  |     |
+        |  9    10
+        |  |\   /|
+        |  | \ / |
+        |  |  7  |
+        |  |  |  |
+        |  |  E  |
+        |  |     |
+        |   \   /
+        |    \ /
+        +-----8
+
+
+    Blocks 9 and 10 will get merged into blocks 4 & 5 respectively which
+    enables CSE, DCE and other optimizations to occur on a larger block
+    of code.   */
+
+static bool
+split_paths ()
+{
+  bool changed = false;
+  loop_p loop;
+
+  loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS);
+  initialize_original_copy_tables ();
+  calculate_dominance_info (CDI_DOMINATORS);
+
+  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
+    {
+      /* See if there is a block that we can duplicate to split the
+	 path to the loop latch.  */
+      basic_block bb = find_block_to_duplicate_for_splitting_paths (loop->latch);
+
+      /* BB is the merge point for an IF-THEN-ELSE we want to transform.
+
+	 Essentially we want to create two duplicates of BB and append
+	 a duplicate to the THEN and ELSE clauses.  This will split the
+	 path leading to the latch.  BB will be unreachable and removed.  */
+      if (bb && is_feasible_trace (bb))
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file,
+		     "Duplicating join block %d into predecessor paths\n",
+		     bb->index);
+	  basic_block pred0 = EDGE_PRED (bb, 0)->src;
+	  basic_block pred1 = EDGE_PRED (bb, 1)->src;
+	  transform_duplicate (pred0, bb);
+	  transform_duplicate (pred1, bb);
+	  changed = true;
+	}
+    }
+
+  loop_optimizer_finalize ();
+  free_original_copy_tables ();
+  return changed;
+}
+
+/* Main entry point for splitting paths.  Returns TODO_cleanup_cfg if any
+   paths were split, otherwise return zero.  */
+
+static unsigned int
+execute_split_paths ()
+{
+  /* If we don't have at least 2 real blocks and backedges in the
+     CFG, then there's no point in trying to perform path splitting.  */
+  if (n_basic_blocks_for_fn (cfun) <= NUM_FIXED_BLOCKS + 1
+      || !mark_dfs_back_edges ())
+    return 0;
+
+  bool changed = split_paths();
+  if (changed)
+    free_dominance_info (CDI_DOMINATORS);
+
+  return changed ? TODO_cleanup_cfg : 0;
+}
+
+static bool
+gate_split_paths ()
+{
+  return flag_split_paths;
+}
+
+namespace {
+
+const pass_data pass_data_split_paths =
+{
+  GIMPLE_PASS, /* type */
+  "split-paths", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_SPLIT_PATHS, /* tv_id */
+  PROP_ssa, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_update_ssa, /* todo_flags_finish */
+};
+
+class pass_split_paths : public gimple_opt_pass
+{
+   public:
+    pass_split_paths (gcc::context *ctxt)
+      : gimple_opt_pass (pass_data_split_paths, ctxt)
+    {}
+   /* opt_pass methods: */
+   opt_pass * clone () { return new pass_split_paths (m_ctxt); }
+   virtual bool gate (function *) { return gate_split_paths (); }
+   virtual unsigned int execute (function *) { return execute_split_paths (); }
+
+}; // class pass_split_paths
+
+} // anon namespace
+
+gimple_opt_pass *
+make_pass_split_paths (gcc::context *ctxt)
+{
+  return new pass_split_paths (ctxt);
+}
diff --git a/gcc/opts.c b/gcc/opts.c
index 930ae43..be04cf5 100644
--- a/gcc/opts.c
+++ b/gcc/opts.c
@@ -523,6 +523,7 @@ static const struct default_options default_options_table[] =
     { OPT_LEVELS_2_PLUS, OPT_fisolate_erroneous_paths_dereference, NULL, 1 },
     { OPT_LEVELS_2_PLUS, OPT_fipa_ra, NULL, 1 },
     { OPT_LEVELS_2_PLUS, OPT_flra_remat, NULL, 1 },
+    { OPT_LEVELS_2_PLUS, OPT_fsplit_paths, NULL, 1 },
 
     /* -O3 optimizations.  */
     { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 },
diff --git a/gcc/passes.def b/gcc/passes.def
index c0ab6b9..db822d3 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -274,6 +274,7 @@ along with GCC; see the file COPYING3.  If not see
       POP_INSERT_PASSES ()
       NEXT_PASS (pass_simduid_cleanup);
       NEXT_PASS (pass_lower_vector_ssa);
+      NEXT_PASS (pass_split_paths);
       NEXT_PASS (pass_cse_reciprocals);
       NEXT_PASS (pass_reassoc);
       NEXT_PASS (pass_strength_reduction);
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 3301130..ee92aaf 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,8 @@
+2015-11-13  Ajit Agarwal  <ajitkum@xilinx.com>
+            Jeff Law  <law@redhat.com>
+
+	* gcc.dg/tree-ssa/split-path-1.c: New test.
+
 2015-11-13  Nathan Sidwell  <nathan@codesourcery.com>
 
 	* c-c++-common/goacc/loop-auto-1.c: New.
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/split-path-1.c b/gcc/testsuite/gcc.dg/tree-ssa/split-path-1.c
new file mode 100644
index 0000000..1239892
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/split-path-1.c
@@ -0,0 +1,67 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fdump-tree-split-paths-details " } */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define RGBMAX 255
+
+/* CMYK-style kernel: convert 24 RGB pixels and return the sum of the K
+   bytes reduced modulo 256 (SUM is an unsigned char).  */
+int
+test()
+{
+  int i, Pels;
+  unsigned char sum = 0;
+  unsigned char xr, xg, xb, xc, xm, xy, xk;
+  unsigned char *ReadPtr, *EritePtr;
+
+  ReadPtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
+  EritePtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
+
+  for (i = 0; i < 100;i++)
+    ReadPtr[i] = 100 - i;
+
+  /* 24 pixels read 72 bytes and write 96: within both 100-byte buffers.  */
+  for (i = 0; i < 24; i++)
+     {
+       xr = *ReadPtr++;
+       xg = *ReadPtr++;
+       xb = *ReadPtr++;
+
+       xc = (unsigned char) (RGBMAX - xr);
+       xm = (unsigned char) (RGBMAX - xg);
+       xy = (unsigned char) (RGBMAX - xb);
+
+       if (xc < xm)
+         {
+           xk = (unsigned char) (xc < xy ? xc : xy);
+         }
+       else
+        {
+          xk = (unsigned char) (xm < xy ? xm : xy);
+        }
+
+       xc = (unsigned char) (xc - xk);
+       xm = (unsigned char) (xm - xk);
+       xy = (unsigned char) (xy - xk);
+
+       *EritePtr++ = xc;
+       *EritePtr++ = xm;
+       *EritePtr++ = xy;
+       *EritePtr++ = xk;
+       sum += EritePtr[-1];  /* The K byte just stored.  */
+    }
+  return sum;
+}
+
+int
+main()
+{
+  if (test() != 196)  /* Sum of 155 + 3k, k = 0..23, is 4548; mod 256.  */
+    abort();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "Duplicating join block" "split-paths" } } */
diff --git a/gcc/timevar.def b/gcc/timevar.def
index b429faf..45e3b70 100644
--- a/gcc/timevar.def
+++ b/gcc/timevar.def
@@ -252,6 +252,7 @@ DEFTIMEVAR (TV_GCSE_AFTER_RELOAD     , "load CSE after reload")
 DEFTIMEVAR (TV_REE		     , "ree")
 DEFTIMEVAR (TV_THREAD_PROLOGUE_AND_EPILOGUE, "thread pro- & epilogue")
 DEFTIMEVAR (TV_IFCVT2		     , "if-conversion 2")
+DEFTIMEVAR (TV_SPLIT_PATHS	     , "split paths")
 DEFTIMEVAR (TV_COMBINE_STACK_ADJUST  , "combine stack adjustments")
 DEFTIMEVAR (TV_PEEPHOLE2             , "peephole 2")
 DEFTIMEVAR (TV_RENAME_REGISTERS      , "rename registers")
diff --git a/gcc/tracer.c b/gcc/tracer.c
index 941dc20..c2dba4c 100644
--- a/gcc/tracer.c
+++ b/gcc/tracer.c
@@ -51,9 +51,9 @@
 #include "tree-inline.h"
 #include "cfgloop.h"
 #include "fibonacci_heap.h"
+#include "tracer.h"
 
 static int count_insns (basic_block);
-static bool ignore_bb_p (const_basic_block);
 static bool better_p (const_edge, const_edge);
 static edge find_best_successor (basic_block);
 static edge find_best_predecessor (basic_block);
@@ -85,7 +85,7 @@ bb_seen_p (basic_block bb)
 }
 
 /* Return true if we should ignore the basic block for purposes of tracing.  */
-static bool
+bool
 ignore_bb_p (const_basic_block bb)
 {
   if (bb->index < NUM_FIXED_BLOCKS)
@@ -226,6 +226,24 @@ find_trace (basic_block bb, basic_block *trace)
   return i;
 }
 
+/* Duplicate block BB2 for the edge from BB to BB2, placing the copy
+   after BB in the CFG.  Return the newly created block.  */
+basic_block
+transform_duplicate (basic_block bb, basic_block bb2)
+{
+  edge e;
+  basic_block copy;
+
+  e = find_edge (bb, bb2);
+
+  copy = duplicate_block (bb2, e, bb);
+  flush_pending_stmts (e);
+
+  add_phi_args_after_copy (&copy, 1, NULL);
+
+  return (copy);
+}
+
 /* Look for basic blocks in frequency order, construct traces and tail duplicate
    if profitable.  */
 
@@ -321,17 +339,8 @@ tail_duplicate (void)
 		 entries or at least rotate the loop.  */
 	      && bb2->loop_father->header != bb2)
 	    {
-	      edge e;
-	      basic_block copy;
-
 	      nduplicated += counts [bb2->index];
-
-	      e = find_edge (bb, bb2);
-
-	      copy = duplicate_block (bb2, e, bb);
-	      flush_pending_stmts (e);
-
-	      add_phi_args_after_copy (&copy, 1, NULL);
+	      basic_block copy = transform_duplicate (bb, bb2);
 
 	      /* Reconsider the original copy of block we've duplicated.
 	         Removing the most common predecessor may make it to be
diff --git a/gcc/tracer.h b/gcc/tracer.h
new file mode 100644
index 0000000..cd1792a
--- /dev/null
+++ b/gcc/tracer.h
@@ -0,0 +1,26 @@
+/* Header file for Tracer.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_TRACER_H
+#define GCC_TRACER_H
+
+extern basic_block transform_duplicate (basic_block bb, basic_block bb2);
+extern bool ignore_bb_p (const_basic_block bb);
+
+#endif /* GCC_TRACER_H */
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 49e22a9..da67761 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -390,6 +390,7 @@ extern gimple_opt_pass *make_pass_tree_loop_done (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_ch (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_ch_vect (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_ccp (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_split_paths (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_phi_only_cprop (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_build_ssa (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_build_alias (gcc::context *ctxt);

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-11-13 23:36                                     ` Jeff Law
@ 2015-11-18  7:44                                       ` Tom de Vries
  2015-11-18 14:24                                         ` Ajit Kumar Agarwal
  2015-12-03 14:38                                       ` Richard Biener
  1 sibling, 1 reply; 72+ messages in thread
From: Tom de Vries @ 2015-11-18  7:44 UTC (permalink / raw)
  To: Jeff Law, Richard Biener
  Cc: Ajit Kumar Agarwal, GCC Patches, Vinod Kathail,
	Shail Aditya Gupta, Vidhumouli Hunsigida, Nagaraju Mekala

On 14/11/15 00:35, Jeff Law wrote:
> Anyway, bootstrapped and regression tested on x86_64-linux-gnu.
> Installed on the trunk.

>      [Patch,tree-optimization]: Add new path Splitting pass on tree ssa
>      representation
>
>      	* Makefile.in (OBJS): Add gimple-ssa-split-paths.o
>      	* common.opt (-fsplit-paths): New flag controlling path splitting.
>      	* doc/invoke.texi (fsplit-paths): Document.
>      	* opts.c (default_options_table): Add -fsplit-paths to -O2.
>      	* passes.def: Add split_paths pass.
>      	* timevar.def (TV_SPLIT_PATHS): New timevar.
>      	* tracer.c: Include "tracer.h"
>      	(ignore_bb_p): No longer static.
>      	(transform_duplicate): New function, broken out of tail_duplicate.
>      	(tail_duplicate): Use transform_duplicate.
>      	* tracer.h (ignore_bb_p): Declare
>      	(transform_duplicate): Likewise.
>      	* tree-pass.h (make_pass_split_paths): Declare.
>      	* gimple-ssa-split-paths.c: New file.
>
>      	* gcc.dg/tree-ssa/split-path-1.c: New test.

I've filed PR68402 - FAIL: gcc.dg/tree-ssa/split-path-1.c execution test 
with -m32.

Thanks,
- Tom

^ permalink raw reply	[flat|nested] 72+ messages in thread

* RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-11-18  7:44                                       ` Tom de Vries
@ 2015-11-18 14:24                                         ` Ajit Kumar Agarwal
  0 siblings, 0 replies; 72+ messages in thread
From: Ajit Kumar Agarwal @ 2015-11-18 14:24 UTC (permalink / raw)
  To: Tom de Vries, Jeff Law, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala



-----Original Message-----
From: Tom de Vries [mailto:Tom_deVries@mentor.com] 
Sent: Wednesday, November 18, 2015 1:14 PM
To: Jeff Law; Richard Biener
Cc: Ajit Kumar Agarwal; GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation

On 14/11/15 00:35, Jeff Law wrote:
> Anyway, bootstrapped and regression tested on x86_64-linux-gnu.
> Installed on the trunk.

>      [Patch,tree-optimization]: Add new path Splitting pass on tree ssa
>      representation
>
>      	* Makefile.in (OBJS): Add gimple-ssa-split-paths.o
>      	* common.opt (-fsplit-paths): New flag controlling path splitting.
>      	* doc/invoke.texi (fsplit-paths): Document.
>      	* opts.c (default_options_table): Add -fsplit-paths to -O2.
>      	* passes.def: Add split_paths pass.
>      	* timevar.def (TV_SPLIT_PATHS): New timevar.
>      	* tracer.c: Include "tracer.h"
>      	(ignore_bb_p): No longer static.
>      	(transform_duplicate): New function, broken out of tail_duplicate.
>      	(tail_duplicate): Use transform_duplicate.
>      	* tracer.h (ignore_bb_p): Declare
>      	(transform_duplicate): Likewise.
>      	* tree-pass.h (make_pass_split_paths): Declare.
>      	* gimple-ssa-split-paths.c: New file.
>
>      	* gcc.dg/tree-ssa/split-path-1.c: New test.

>>I've filed PR68402 - FAIL: gcc.dg/tree-ssa/split-path-1.c execution test with -m32.

I have fixed the above PR and the patch is submitted.

https://gcc.gnu.org/ml/gcc-patches/2015-11/msg02217.html

Thanks & Regards
Ajit

Thanks,
- Tom


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-11-13 23:36                                     ` Jeff Law
  2015-11-18  7:44                                       ` Tom de Vries
@ 2015-12-03 14:38                                       ` Richard Biener
  2015-12-03 14:45                                         ` Richard Biener
                                                           ` (2 more replies)
  1 sibling, 3 replies; 72+ messages in thread
From: Richard Biener @ 2015-12-03 14:38 UTC (permalink / raw)
  To: Jeff Law
  Cc: Ajit Kumar Agarwal, GCC Patches, Vinod Kathail,
	Shail Aditya Gupta, Vidhumouli Hunsigida, Nagaraju Mekala

On Sat, Nov 14, 2015 at 12:35 AM, Jeff Law <law@redhat.com> wrote:
> On 11/13/2015 01:23 PM, Jeff Law wrote:
>>
>> On 11/13/2015 11:09 AM, Richard Biener wrote:
>>
>>>>
>>>> BTW Do we have an API for indicating that new blocks have been added to
>>>>
>>>> a loop?  If so, then we can likely drop the LOOPS_NEED_FIXUP.
>>>
>>>
>>> Please. It's called add_to_loop or so.
>>
>> Haha, the block duplication code was handling this already.  So in
>> theory I can just drop the LOOPS_NEED_FIXUP completely.  Testing now.
>>
>> jeff
>>
> Attached is the committed patch for path splitting.  As noted above, we
> didn't need the LOOPS_NEED_FIXUP in the final version, so that wart is gone
> :-)
>
> I do find myself wondering if this can/should be generalized beyond just
> paths heading to loop backedges.  However to do so I think we'd need to be
> able to undo this transformation reliably and we'd need some heuristics when
> to duplicate to expose the redundancy vs rely on PRE techniques and jump
> threading.  I vaguely remember a paper which touched on these topics, but I
> can't seem to find it.
>
> Anyway, bootstrapped and regression tested on x86_64-linux-gnu. Installed on
> the trunk.

This pass is now enabled by default with -Os but has no limits on the amount of
stmts it copies.  It also will make all loops with this shape have at least two
exits (if the resulting loop will be disambiguated the inner loop will
have two exits).
Having more than one exit will disable almost all loop optimizations after it.

The pass itself documents the transform it does but does zero to motivate it.

What's the benefit of this pass (apart from disrupting further optimizations)?

I can see a _single_ case where duplicating the latch will allow threading
one of the paths through the loop header to eliminate the original exit.  Then
disambiguation may create a nice nested loop out of this.  Of course that
is only profitable again if you know the remaining single exit of the inner
loop (exiting to the outer one) is executed infrequently (thus the inner loop
actually loops).

But no checks other than on the CFG shape exist (oh, it checks it will
at _least_ copy two stmts!).

Given the profitability constraints above (well, correct me if I am
wrong on these)
it looks like the whole transform should be done within the FSM threading
code which might be able to compute whether there will be an inner loop
with a single exit only.

I'm inclined to request the pass to be removed again or at least disabled by
default.

What closed source benchmark was this transform invented for?

Richard.

>
>
>
> commit c1891376e5dcc99ad8be2d22f9551c03f9bb2729
> Author: Jeff Law <law@redhat.com>
> Date:   Fri Nov 13 16:29:34 2015 -0700
>
>     [Patch,tree-optimization]: Add new path Splitting pass on tree ssa
>     representation
>
>         * Makefile.in (OBJS): Add gimple-ssa-split-paths.o
>         * common.opt (-fsplit-paths): New flag controlling path splitting.
>         * doc/invoke.texi (fsplit-paths): Document.
>         * opts.c (default_options_table): Add -fsplit-paths to -O2.
>         * passes.def: Add split_paths pass.
>         * timevar.def (TV_SPLIT_PATHS): New timevar.
>         * tracer.c: Include "tracer.h"
>         (ignore_bb_p): No longer static.
>         (transform_duplicate): New function, broken out of tail_duplicate.
>         (tail_duplicate): Use transform_duplicate.
>         * tracer.h (ignore_bb_p): Declare
>         (transform_duplicate): Likewise.
>         * tree-pass.h (make_pass_split_paths): Declare.
>         * gimple-ssa-split-paths.c: New file.
>
>         * gcc.dg/tree-ssa/split-path-1.c: New test.
>
> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index dde2695..a7abe37 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,21 @@
> +2015-11-13  Ajit Agarwal  <ajitkum@xilinx.com>
> +           Jeff Law  <law@redhat.com>
> +
> +       * Makefile.in (OBJS): Add gimple-ssa-split-paths.o
> +       * common.opt (-fsplit-paths): New flag controlling path splitting.
> +       * doc/invoke.texi (fsplit-paths): Document.
> +       * opts.c (default_options_table): Add -fsplit-paths to -O2.
> +       * passes.def: Add split_paths pass.
> +       * timevar.def (TV_SPLIT_PATHS): New timevar.
> +       * tracer.c: Include "tracer.h"
> +       (ignore_bb_p): No longer static.
> +       (transform_duplicate): New function, broken out of tail_duplicate.
> +       (tail_duplicate): Use transform_duplicate.
> +       * tracer.h (ignore_bb_p): Declare
> +       (transform_duplicate): Likewise.
> +       * tree-pass.h (make_pass_split_paths): Declare.
> +       * gimple-ssa-split-paths.c: New file.
> +
>  2015-11-13  Kai Tietz  <ktietz70@googlemail.com>
>             Marek Polacek  <polacek@redhat.com>
>             Jason Merrill  <jason@redhat.com>
> diff --git a/gcc/Makefile.in b/gcc/Makefile.in
> index d3fd5e9..5c294df 100644
> --- a/gcc/Makefile.in
> +++ b/gcc/Makefile.in
> @@ -1277,6 +1277,7 @@ OBJS = \
>         gimple-pretty-print.o \
>         gimple-ssa-backprop.o \
>         gimple-ssa-isolate-paths.o \
> +       gimple-ssa-split-paths.o \
>         gimple-ssa-strength-reduction.o \
>         gimple-streamer-in.o \
>         gimple-streamer-out.o \
> diff --git a/gcc/common.opt b/gcc/common.opt
> index 757ce85..3eb520e 100644
> --- a/gcc/common.opt
> +++ b/gcc/common.opt
> @@ -2403,6 +2403,10 @@ ftree-vrp
>  Common Report Var(flag_tree_vrp) Init(0) Optimization
>  Perform Value Range Propagation on trees.
>
> +fsplit-paths
> +Common Report Var(flag_split_paths) Init(0) Optimization
> +Split paths leading to loop backedges.
> +
>  funit-at-a-time
>  Common Report Var(flag_unit_at_a_time) Init(1)
>  Compile whole compilation unit at a time.
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index c18df98..eeb79e6 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -354,6 +354,7 @@ Objective-C and Objective-C++ Dialects}.
>  -fdump-tree-fre@r{[}-@var{n}@r{]} @gol
>  -fdump-tree-vtable-verify @gol
>  -fdump-tree-vrp@r{[}-@var{n}@r{]} @gol
> +-fdump-tree-split-paths@r{[}-@var{n}@r{]} @gol
>  -fdump-tree-storeccp@r{[}-@var{n}@r{]} @gol
>  -fdump-final-insns=@var{file} @gol
>  -fcompare-debug@r{[}=@var{opts}@r{]}  -fcompare-debug-second @gol
> @@ -448,6 +449,7 @@ Objective-C and Objective-C++ Dialects}.
>  -fsel-sched-pipelining -fsel-sched-pipelining-outer-loops @gol
>  -fsemantic-interposition -fshrink-wrap -fsignaling-nans @gol
>  -fsingle-precision-constant -fsplit-ivs-in-unroller @gol
> +-fsplit-paths @gol
>  -fsplit-wide-types -fssa-backprop -fssa-phiopt @gol
>  -fstack-protector -fstack-protector-all -fstack-protector-strong @gol
>  -fstack-protector-explicit -fstdarg-opt -fstrict-aliasing @gol
> @@ -7171,6 +7173,11 @@ output on to @file{stderr}. If two conflicting dump
> filenames are
>  given for the same pass, then the latter option overrides the earlier
>  one.
>
> +@item split-paths
> +@opindex fdump-tree-split-paths
> +Dump each function after splitting paths to loop backedges.  The file
> +name is made by appending @file{.split-paths} to the source file name.
> +
>  @item all
>  Turn on all options, except @option{raw}, @option{slim}, @option{verbose}
>  and @option{lineno}.
> @@ -7808,6 +7815,7 @@ also turns on the following optimization flags:
>  -frerun-cse-after-loop  @gol
>  -fsched-interblock  -fsched-spec @gol
>  -fschedule-insns  -fschedule-insns2 @gol
> +-fsplit-paths @gol
>  -fstrict-aliasing -fstrict-overflow @gol
>  -ftree-builtin-call-dce @gol
>  -ftree-switch-conversion -ftree-tail-merge @gol
> @@ -8821,7 +8829,7 @@ currently enabled, but may be enabled by @option{-O2}
> in the future.
>
>  @item -ftree-sink
>  @opindex ftree-sink
> -Perform forward store motion  on trees.  This flag is
> +Perform forward store motion on trees.  This flag is
>  enabled by default at @option{-O} and higher.
>
>  @item -ftree-bit-ccp
> @@ -9127,6 +9135,12 @@ enabled by default at @option{-O2} and higher.  Null
> pointer check
>  elimination is only done if @option{-fdelete-null-pointer-checks} is
>  enabled.
>
> +@item -fsplit-paths
> +@opindex fsplit-paths
> +Split paths leading to loop backedges.  This can improve dead code
> +elimination and common subexpression elimination.  This is enabled by
> +default at @option{-O2} and above.
> +
>  @item -fsplit-ivs-in-unroller
>  @opindex fsplit-ivs-in-unroller
>  Enables expression of values of induction variables in later iterations
> diff --git a/gcc/gimple-ssa-split-paths.c b/gcc/gimple-ssa-split-paths.c
> new file mode 100644
> index 0000000..602e916
> --- /dev/null
> +++ b/gcc/gimple-ssa-split-paths.c
> @@ -0,0 +1,270 @@
> +/* Support routines for Splitting Paths to loop backedges
> +   Copyright (C) 2015 Free Software Foundation, Inc.
> +   Contributed by Ajit Kumar Agarwal <ajitkum@xilinx.com>.
> +
> + This file is part of GCC.
> +
> + GCC is free software; you can redistribute it and/or modify
> + it under the terms of the GNU General Public License as published by
> + the Free Software Foundation; either version 3, or (at your option)
> + any later version.
> +
> +GCC is distributed in the hope that it will be useful,
> +but WITHOUT ANY WARRANTY; without even the implied warranty of
> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +GNU General Public License for more details.
> +
> +You should have received a copy of the GNU General Public License
> +along with GCC; see the file COPYING3.  If not see
> +<http://www.gnu.org/licenses/>.  */
> +
> +#include "config.h"
> +#include "system.h"
> +#include "coretypes.h"
> +#include "backend.h"
> +#include "tree.h"
> +#include "gimple.h"
> +#include "tree-pass.h"
> +#include "cfganal.h"
> +#include "cfgloop.h"
> +#include "gimple-iterator.h"
> +#include "tracer.h"
> +
> +/* Given LATCH, the latch block in a loop, see if the shape of the
> +   path reaching LATCH is suitable for being split by duplication.
> +   If so, return the block that will be duplicated into its predecessor
> +   paths.  Else return NULL.  */
> +
> +static basic_block
> +find_block_to_duplicate_for_splitting_paths (basic_block latch)
> +{
> +  /* We should have simple latches at this point.  So the latch should
> +     have a single successor.  This implies the predecessor of the latch
> +     likely has the loop exit.  And it's that predecessor we're most
> +     interested in. To keep things simple, we're going to require that
> +     the latch have a single predecessor too.  */
> +  if (single_succ_p (latch) && single_pred_p (latch))
> +    {
> +      basic_block bb = get_immediate_dominator (CDI_DOMINATORS, latch);
> +      gcc_assert (single_pred_edge (latch)->src == bb);
> +
> +      /* If BB has been marked as not to be duplicated, then honor that
> +        request.  */
> +      if (ignore_bb_p (bb))
> +       return NULL;
> +
> +      gimple *last = gsi_stmt (gsi_last_nondebug_bb (bb));
> +      /* The immediate dominator of the latch must end in a conditional.
> */
> +      if (!last || gimple_code (last) != GIMPLE_COND)
> +       return NULL;
> +
> +      /* We're hoping that BB is a join point for an IF-THEN-ELSE diamond
> +        region.  Verify that it is.
> +
> +        First, verify that BB has two predecessors (each arm of the
> +        IF-THEN-ELSE) and two successors (the latch and exit).  */
> +      if (EDGE_COUNT (bb->preds) == 2 && EDGE_COUNT (bb->succs) == 2)
> +       {
> +         /* Now verify that BB's immediate dominator ends in a
> +            conditional as well.  */
> +         basic_block bb_idom = get_immediate_dominator (CDI_DOMINATORS,
> bb);
> +         gimple *last = gsi_stmt (gsi_last_nondebug_bb (bb_idom));
> +         if (!last || gimple_code (last) != GIMPLE_COND)
> +           return NULL;
> +
> +         /* And that BB's immediate dominator's successors are the
> +            the predecessors of BB.  */
> +         if (!find_edge (bb_idom, EDGE_PRED (bb, 0)->src)
> +             || !find_edge (bb_idom, EDGE_PRED (bb, 1)->src))
> +           return NULL;
> +
> +         /* So at this point we have a simple diamond for an IF-THEN-ELSE
> +            construct starting at BB_IDOM, with a join point at BB.  BB
> +            pass control outside the loop or to the loop latch.
> +
> +            We're going to want to create two duplicates of BB, one for
> +            each successor of BB_IDOM.  */
> +         return bb;
> +       }
> +    }
> +  return NULL;
> +}
> +
> +/* Return TRUE if BB is a reasonable block to duplicate by examining
> +   its size, false otherwise.  BB will always be a loop latch block.
> +
> +   Should this use the same tests as we do for jump threading?  */
> +
> +static bool
> +is_feasible_trace (basic_block bb)
> +{
> +  int num_stmt = 0;
> +  gimple_stmt_iterator gsi;
> +
> +  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> +    {
> +      gimple *stmt = gsi_stmt (gsi);
> +      if (!is_gimple_debug (stmt))
> +       num_stmt++;
> +    }
> +
> +  /* We may want to limit how many statements we copy.  */
> +  if (num_stmt > 1)
> +    return true;
> +
> +  return false;
> +}
> +
> +/* If the immediate dominator of the latch of the loop is
> +   block with conditional branch, then the loop latch  is
> +   duplicated to its predecessors path preserving the SSA
> +   semantics.
> +
> +   CFG before transformation.
> +
> +              2
> +              |
> +              |
> +        +---->3
> +        |    / \
> +        |   /   \
> +        |  4     5
> +        |   \   /
> +        |    \ /
> +        |     6
> +        |    / \
> +        |   /   \
> +        |  8     7
> +        |  |     |
> +        ---+     E
> +
> +
> +
> +    Block 8 is the latch.  We're going to make copies of block 6 (9 & 10)
> +    and wire things up so they look like this:
> +
> +              2
> +              |
> +              |
> +        +---->3
> +        |    / \
> +        |   /   \
> +        |  4     5
> +        |  |     |
> +        |  |     |
> +        |  9    10
> +        |  |\   /|
> +        |  | \ / |
> +        |  |  7  |
> +        |  |  |  |
> +        |  |  E  |
> +        |  |     |
> +        |   \   /
> +        |    \ /
> +        +-----8
> +
> +
> +    Blocks 9 and 10 will get merged into blocks 4 & 5 respectively which
> +    enables CSE, DCE and other optimizations to occur on a larger block
> +    of code.   */
> +
> +static bool
> +split_paths ()
> +{
> +  bool changed = false;
> +  loop_p loop;
> +
> +  loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS);
> +  initialize_original_copy_tables ();
> +  calculate_dominance_info (CDI_DOMINATORS);
> +
> +  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
> +    {
> +      /* See if there is a block that we can duplicate to split the
> +        path to the loop latch.  */
> +      basic_block bb = find_block_to_duplicate_for_splitting_paths
> (loop->latch);
> +
> +      /* BB is the merge point for an IF-THEN-ELSE we want to transform.
> +
> +        Essentially we want to create two duplicates of BB and append
> +        a duplicate to the THEN and ELSE clauses.  This will split the
> +        path leading to the latch.  BB will be unreachable and removed.  */
> +      if (bb && is_feasible_trace (bb))
> +       {
> +         if (dump_file && (dump_flags & TDF_DETAILS))
> +           fprintf (dump_file,
> +                    "Duplicating join block %d into predecessor paths\n",
> +                    bb->index);
> +         basic_block pred0 = EDGE_PRED (bb, 0)->src;
> +         basic_block pred1 = EDGE_PRED (bb, 1)->src;
> +         transform_duplicate (pred0, bb);
> +         transform_duplicate (pred1, bb);
> +         changed = true;
> +       }
> +    }
> +
> +  loop_optimizer_finalize ();
> +  free_original_copy_tables ();
> +  return changed;
> +}
> +
> +/* Main entry point for splitting paths.  Returns TODO_cleanup_cfg if any
> +   paths where split, otherwise return zero.  */
> +
> +static unsigned int
> +execute_split_paths ()
> +{
> +  /* If we don't have at least 2 real blocks and backedges in the
> +     CFG, then there's no point in trying to perform path splitting.  */
> +  if (n_basic_blocks_for_fn (cfun) <= NUM_FIXED_BLOCKS + 1
> +      || !mark_dfs_back_edges ())
> +    return 0;
> +
> +  bool changed = split_paths();
> +  if (changed)
> +    free_dominance_info (CDI_DOMINATORS);
> +
> +  return changed ? TODO_cleanup_cfg : 0;
> +}
> +
> +static bool
> +gate_split_paths ()
> +{
> +  return flag_split_paths;
> +}
> +
> +namespace {
> +
> +const pass_data pass_data_split_paths =
> +{
> +  GIMPLE_PASS, /* type */
> +  "split-paths", /* name */
> +  OPTGROUP_NONE, /* optinfo_flags */
> +  TV_SPLIT_PATHS, /* tv_id */
> +  PROP_ssa, /* properties_required */
> +  0, /* properties_provided */
> +  0, /* properties_destroyed */
> +  0, /* todo_flags_start */
> +  TODO_update_ssa, /* todo_flags_finish */
> +};
> +
> +class pass_split_paths : public gimple_opt_pass
> +{
> +   public:
> +    pass_split_paths (gcc::context *ctxt)
> +      : gimple_opt_pass (pass_data_split_paths, ctxt)
> +    {}
> +   /* opt_pass methods: */
> +   opt_pass * clone () { return new pass_split_paths (m_ctxt); }
> +   virtual bool gate (function *) { return gate_split_paths (); }
> +   virtual unsigned int execute (function *) { return execute_split_paths
> (); }
> +
> +}; // class pass_split_paths
> +
> +} // anon namespace
> +
> +gimple_opt_pass *
> +make_pass_split_paths (gcc::context *ctxt)
> +{
> +  return new pass_split_paths (ctxt);
> +}
> diff --git a/gcc/opts.c b/gcc/opts.c
> index 930ae43..be04cf5 100644
> --- a/gcc/opts.c
> +++ b/gcc/opts.c
> @@ -523,6 +523,7 @@ static const struct default_options
> default_options_table[] =
>      { OPT_LEVELS_2_PLUS, OPT_fisolate_erroneous_paths_dereference, NULL, 1
> },
>      { OPT_LEVELS_2_PLUS, OPT_fipa_ra, NULL, 1 },
>      { OPT_LEVELS_2_PLUS, OPT_flra_remat, NULL, 1 },
> +    { OPT_LEVELS_2_PLUS, OPT_fsplit_paths, NULL, 1 },
>
>      /* -O3 optimizations.  */
>      { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 },
> diff --git a/gcc/passes.def b/gcc/passes.def
> index c0ab6b9..db822d3 100644
> --- a/gcc/passes.def
> +++ b/gcc/passes.def
> @@ -274,6 +274,7 @@ along with GCC; see the file COPYING3.  If not see
>        POP_INSERT_PASSES ()
>        NEXT_PASS (pass_simduid_cleanup);
>        NEXT_PASS (pass_lower_vector_ssa);
> +      NEXT_PASS (pass_split_paths);
>        NEXT_PASS (pass_cse_reciprocals);
>        NEXT_PASS (pass_reassoc);
>        NEXT_PASS (pass_strength_reduction);
> diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
> index 3301130..ee92aaf 100644
> --- a/gcc/testsuite/ChangeLog
> +++ b/gcc/testsuite/ChangeLog
> @@ -1,3 +1,8 @@
> +2015-11-13  Ajit Agarwal  <ajitkum@xilinx.com>
> +            Jeff Law  <law@redhat.com>
> +
> +       * gcc.dg/tree-ssa/split-path-1.c: New test.
> +
>  2015-11-13  Nathan Sidwell  <nathan@codesourcery.com>
>
>         * c-c++-common/goacc/loop-auto-1.c: New.
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/split-path-1.c
> b/gcc/testsuite/gcc.dg/tree-ssa/split-path-1.c
> new file mode 100644
> index 0000000..1239892
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/split-path-1.c
> @@ -0,0 +1,67 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fdump-tree-split-paths-details " } */
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +
> +#define RGBMAX 255
> +
> +int
> +test()
> +{
> +  int i, Pels;
> +  unsigned char sum = 0;
> +  unsigned char xr, xg, xb;
> +  unsigned char xc, xm, xy, xk;
> +  unsigned char *ReadPtr, *EritePtr;
> +
> +  ReadPtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
> +  EritePtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
> +
> +  for (i = 0; i < 100;i++)
> +     {
> +       ReadPtr[i] = 100 - i;
> +     }
> +
> +  for (i = 0; i < 100; i++)
> +     {
> +       xr = *ReadPtr++;
> +       xg = *ReadPtr++;
> +       xb = *ReadPtr++;
> +
> +       xc = (unsigned char) (RGBMAX - xr);
> +       xm = (unsigned char) (RGBMAX - xg);
> +       xy = (unsigned char) (RGBMAX - xb);
> +
> +       if (xc < xm)
> +         {
> +           xk = (unsigned char) (xc < xy ? xc : xy);
> +         }
> +       else
> +        {
> +          xk = (unsigned char) (xm < xy ? xm : xy);
> +        }
> +
> +       xc = (unsigned char) (xc - xk);
> +       xm = (unsigned char) (xm - xk);
> +       xy = (unsigned char) (xy - xk);
> +
> +       *EritePtr++ = xc;
> +       *EritePtr++ = xm;
> +       *EritePtr++ = xy;
> +       *EritePtr++ = xk;
> +       sum += *EritePtr;
> +    }
> +  return sum;
> +}
> +
> +int
> +main()
> +{
> +  if (test() != 33)
> +    abort();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump "Duplicating join block" "split-paths" } }
> */
> diff --git a/gcc/timevar.def b/gcc/timevar.def
> index b429faf..45e3b70 100644
> --- a/gcc/timevar.def
> +++ b/gcc/timevar.def
> @@ -252,6 +252,7 @@ DEFTIMEVAR (TV_GCSE_AFTER_RELOAD     , "load CSE after
> reload")
>  DEFTIMEVAR (TV_REE                  , "ree")
>  DEFTIMEVAR (TV_THREAD_PROLOGUE_AND_EPILOGUE, "thread pro- & epilogue")
>  DEFTIMEVAR (TV_IFCVT2               , "if-conversion 2")
> +DEFTIMEVAR (TV_SPLIT_PATHS          , "split paths")
>  DEFTIMEVAR (TV_COMBINE_STACK_ADJUST  , "combine stack adjustments")
>  DEFTIMEVAR (TV_PEEPHOLE2             , "peephole 2")
>  DEFTIMEVAR (TV_RENAME_REGISTERS      , "rename registers")
> diff --git a/gcc/tracer.c b/gcc/tracer.c
> index 941dc20..c2dba4c 100644
> --- a/gcc/tracer.c
> +++ b/gcc/tracer.c
> @@ -51,9 +51,9 @@
>  #include "tree-inline.h"
>  #include "cfgloop.h"
>  #include "fibonacci_heap.h"
> +#include "tracer.h"
>
>  static int count_insns (basic_block);
> -static bool ignore_bb_p (const_basic_block);
>  static bool better_p (const_edge, const_edge);
>  static edge find_best_successor (basic_block);
>  static edge find_best_predecessor (basic_block);
> @@ -85,7 +85,7 @@ bb_seen_p (basic_block bb)
>  }
>
>  /* Return true if we should ignore the basic block for purposes of tracing.
> */
> -static bool
> +bool
>  ignore_bb_p (const_basic_block bb)
>  {
>    if (bb->index < NUM_FIXED_BLOCKS)
> @@ -226,6 +226,24 @@ find_trace (basic_block bb, basic_block *trace)
>    return i;
>  }
>
> +/* Duplicate block BB2, placing it after BB in the CFG.  Return the
> +   newly created block.  */
> +basic_block
> +transform_duplicate (basic_block bb, basic_block bb2)
> +{
> +  edge e;
> +  basic_block copy;
> +
> +  e = find_edge (bb, bb2);
> +
> +  copy = duplicate_block (bb2, e, bb);
> +  flush_pending_stmts (e);
> +
> +  add_phi_args_after_copy (&copy, 1, NULL);
> +
> +  return (copy);
> +}
> +
>  /* Look for basic blocks in frequency order, construct traces and tail
> duplicate
>     if profitable.  */
>
> @@ -321,17 +339,8 @@ tail_duplicate (void)
>                  entries or at least rotate the loop.  */
>               && bb2->loop_father->header != bb2)
>             {
> -             edge e;
> -             basic_block copy;
> -
>               nduplicated += counts [bb2->index];
> -
> -             e = find_edge (bb, bb2);
> -
> -             copy = duplicate_block (bb2, e, bb);
> -             flush_pending_stmts (e);
> -
> -             add_phi_args_after_copy (&copy, 1, NULL);
> +             basic_block copy = transform_duplicate (bb, bb2);
>
>               /* Reconsider the original copy of block we've duplicated.
>                  Removing the most common predecessor may make it to be
> diff --git a/gcc/tracer.h b/gcc/tracer.h
> new file mode 100644
> index 0000000..cd1792a
> --- /dev/null
> +++ b/gcc/tracer.h
> @@ -0,0 +1,26 @@
> +/* Header file for Tracer.
> +   Copyright (C) 2015 Free Software Foundation, Inc.
> +
> +This file is part of GCC.
> +
> +GCC is free software; you can redistribute it and/or modify it under
> +the terms of the GNU General Public License as published by the Free
> +Software Foundation; either version 3, or (at your option) any later
> +version.
> +
> +GCC is distributed in the hope that it will be useful, but WITHOUT ANY
> +WARRANTY; without even the implied warranty of MERCHANTABILITY or
> +FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
> + for more details.
> +
> +You should have received a copy of the GNU General Public License
> +along with GCC; see the file COPYING3.  If not see
> +<http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_TRACER_H
> +#define GCC_TRACER_H
> +
> +extern basic_block transform_duplicate (basic_block bb, basic_block bb2);
> +extern bool ignore_bb_p (const_basic_block bb);
> +
> +#endif /* GCC_TRACER_H */
> diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
> index 49e22a9..da67761 100644
> --- a/gcc/tree-pass.h
> +++ b/gcc/tree-pass.h
> @@ -390,6 +390,7 @@ extern gimple_opt_pass *make_pass_tree_loop_done
> (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_ch (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_ch_vect (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_ccp (gcc::context *ctxt);
> +extern gimple_opt_pass *make_pass_split_paths (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_phi_only_cprop (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_build_ssa (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_build_alias (gcc::context *ctxt);
>

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-12-03 14:38                                       ` Richard Biener
@ 2015-12-03 14:45                                         ` Richard Biener
  2015-12-10 20:12                                           ` Jeff Law
  2015-12-03 15:46                                         ` Jeff Law
  2015-12-10 20:08                                         ` Jeff Law
  2 siblings, 1 reply; 72+ messages in thread
From: Richard Biener @ 2015-12-03 14:45 UTC (permalink / raw)
  To: Jeff Law
  Cc: Ajit Kumar Agarwal, GCC Patches, Vinod Kathail,
	Shail Aditya Gupta, Vidhumouli Hunsigida, Nagaraju Mekala

On Thu, Dec 3, 2015 at 3:38 PM, Richard Biener
<richard.guenther@gmail.com> wrote:
> On Sat, Nov 14, 2015 at 12:35 AM, Jeff Law <law@redhat.com> wrote:
>> On 11/13/2015 01:23 PM, Jeff Law wrote:
>>>
>>> On 11/13/2015 11:09 AM, Richard Biener wrote:
>>>
>>>>>
>>>>> BTW Do we have an API for indicating that new blocks have been added to
>>>>>
>>>>> a loop?  If so, then we can likely drop the LOOPS_NEED_FIXUP.
>>>>
>>>>
>>>> Please. It's called add_to_loop or so.
>>>
>>> Haha, the block duplication code was handling this already.  So in
>>> theory I can just drop the LOOPS_NEED_FIXUP completely.  Testing now.
>>>
>>> jeff
>>>
>> Attached is the committed patch for path splitting.  As noted above, we
>> didn't need the LOOPS_NEED_FIXUP in the final version, so that wart is gone
>> :-)
>>
>> I do find myself wondering if this can/should be generalized beyond just
>> paths heading to loop backedges.  However to do so I think we'd need to be
>> able to undo this transformation reliably and we'd need some heuristics when
>> to duplicate to expose the redundancy vs rely on PRE techniques and jump
>> threading.  I vaguely remember a paper which touched on these topics, but I
>> can't seem to find it.
>>
>> Anyway, bootstrapped and regression tested on x86_64-linux-gnu. Installed on
>> the trunk.
>
> This pass is now enabled by default with -Os but has no limits on the amount of
> stmts it copies.  It also will make all loops with this shape have at least two
> exits (if the resulting loop will be disambiguated the inner loop will
> have two exits).
> Having more than one exit will disable almost all loop optimizations after it.
>
> The pass itself documents the transform it does but does zero to motivate it.
>
> What's the benefit of this pass (apart from disrupting further optimizations)?
>
> I can see a _single_ case where duplicating the latch will allow threading
> one of the paths through the loop header to eliminate the original exit.  Then
> disambiguation may create a nice nested loop out of this.  Of course that
> is only profitable again if you know the remaining single exit of the inner
> loop (exiting to the outer one) is executed infrequently (thus the inner loop
> actually loops).
>
> But no checks other than on the CFG shape exist (oh, it checks it will
> at _least_ copy two stmts!).
>
> Given the profitability constraints above (well, correct me if I am
> wrong on these)
> it looks like the whole transform should be done within the FSM threading
> code which might be able to compute whether there will be an inner loop
> with a single exit only.
>
> I'm inclined to request the pass to be removed again or at least disabled by
> default.
>
> What closed source benchmark was this transform invented for?

Ah, some EEMBC one.

Btw, the testcase that was added shows

       if (xc < xm)
         {
           xk = (unsigned char) (xc < xy ? xc : xy);
         }
       else
        {
          xk = (unsigned char) (xm < xy ? xm : xy);
        }

which might be better handled by having phiopt transform it into

xk = MIN (xc, MIN (xm, xy))

What phiopt1 actually sees is (hooray for GENERIC folding)

  xc_26 = ~xr_21;
  xm_27 = ~xg_23;
  xy_28 = ~xb_25;
  if (xr_21 > xg_23)
    goto <bb 5>;
  else
    goto <bb 6>;

  <bb 5>:
  xk_29 = MIN_EXPR <xc_26, xy_28>;
  goto <bb 7>;

  <bb 6>:
  xk_30 = MIN_EXPR <xm_27, xy_28>;

  <bb 7>:
  # xk_4 = PHI <xk_29(5), xk_30(6)>

Btw, see PR67438 for a similar testcase that exhibits the above pattern.

Richard.

> Richard.
>
>>
>>
>>
>> commit c1891376e5dcc99ad8be2d22f9551c03f9bb2729
>> Author: Jeff Law <law@redhat.com>
>> Date:   Fri Nov 13 16:29:34 2015 -0700
>>
>>     [Patch,tree-optimization]: Add new path Splitting pass on tree ssa
>>     representation
>>
>>         * Makefile.in (OBJS): Add gimple-ssa-split-paths.o
>>         * common.opt (-fsplit-paths): New flag controlling path splitting.
>>         * doc/invoke.texi (fsplit-paths): Document.
>>         * opts.c (default_options_table): Add -fsplit-paths to -O2.
>>         * passes.def: Add split_paths pass.
>>         * timevar.def (TV_SPLIT_PATHS): New timevar.
>>         * tracer.c: Include "tracer.h"
>>         (ignore_bb_p): No longer static.
>>         (transform_duplicate): New function, broken out of tail_duplicate.
>>         (tail_duplicate): Use transform_duplicate.
>>         * tracer.h (ignore_bb_p): Declare
>>         (transform_duplicate): Likewise.
>>         * tree-pass.h (make_pass_split_paths): Declare.
>>         * gimple-ssa-split-paths.c: New file.
>>
>>         * gcc.dg/tree-ssa/split-path-1.c: New test.
>>
>> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
>> index dde2695..a7abe37 100644
>> --- a/gcc/ChangeLog
>> +++ b/gcc/ChangeLog
>> @@ -1,3 +1,21 @@
>> +2015-11-13  Ajit Agarwal  <ajitkum@xilinx.com>
>> +           Jeff Law  <law@redhat.com>
>> +
>> +       * Makefile.in (OBJS): Add gimple-ssa-split-paths.o
>> +       * common.opt (-fsplit-paths): New flag controlling path splitting.
>> +       * doc/invoke.texi (fsplit-paths): Document.
>> +       * opts.c (default_options_table): Add -fsplit-paths to -O2.
>> +       * passes.def: Add split_paths pass.
>> +       * timevar.def (TV_SPLIT_PATHS): New timevar.
>> +       * tracer.c: Include "tracer.h"
>> +       (ignore_bb_p): No longer static.
>> +       (transform_duplicate): New function, broken out of tail_duplicate.
>> +       (tail_duplicate): Use transform_duplicate.
>> +       * tracer.h (ignore_bb_p): Declare
>> +       (transform_duplicate): Likewise.
>> +       * tree-pass.h (make_pass_split_paths): Declare.
>> +       * gimple-ssa-split-paths.c: New file.
>> +
>>  2015-11-13  Kai Tietz  <ktietz70@googlemail.com>
>>             Marek Polacek  <polacek@redhat.com>
>>             Jason Merrill  <jason@redhat.com>
>> diff --git a/gcc/Makefile.in b/gcc/Makefile.in
>> index d3fd5e9..5c294df 100644
>> --- a/gcc/Makefile.in
>> +++ b/gcc/Makefile.in
>> @@ -1277,6 +1277,7 @@ OBJS = \
>>         gimple-pretty-print.o \
>>         gimple-ssa-backprop.o \
>>         gimple-ssa-isolate-paths.o \
>> +       gimple-ssa-split-paths.o \
>>         gimple-ssa-strength-reduction.o \
>>         gimple-streamer-in.o \
>>         gimple-streamer-out.o \
>> diff --git a/gcc/common.opt b/gcc/common.opt
>> index 757ce85..3eb520e 100644
>> --- a/gcc/common.opt
>> +++ b/gcc/common.opt
>> @@ -2403,6 +2403,10 @@ ftree-vrp
>>  Common Report Var(flag_tree_vrp) Init(0) Optimization
>>  Perform Value Range Propagation on trees.
>>
>> +fsplit-paths
>> +Common Report Var(flag_split_paths) Init(0) Optimization
>> +Split paths leading to loop backedges.
>> +
>>  funit-at-a-time
>>  Common Report Var(flag_unit_at_a_time) Init(1)
>>  Compile whole compilation unit at a time.
>> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
>> index c18df98..eeb79e6 100644
>> --- a/gcc/doc/invoke.texi
>> +++ b/gcc/doc/invoke.texi
>> @@ -354,6 +354,7 @@ Objective-C and Objective-C++ Dialects}.
>>  -fdump-tree-fre@r{[}-@var{n}@r{]} @gol
>>  -fdump-tree-vtable-verify @gol
>>  -fdump-tree-vrp@r{[}-@var{n}@r{]} @gol
>> +-fdump-tree-split-paths@r{[}-@var{n}@r{]} @gol
>>  -fdump-tree-storeccp@r{[}-@var{n}@r{]} @gol
>>  -fdump-final-insns=@var{file} @gol
>>  -fcompare-debug@r{[}=@var{opts}@r{]}  -fcompare-debug-second @gol
>> @@ -448,6 +449,7 @@ Objective-C and Objective-C++ Dialects}.
>>  -fsel-sched-pipelining -fsel-sched-pipelining-outer-loops @gol
>>  -fsemantic-interposition -fshrink-wrap -fsignaling-nans @gol
>>  -fsingle-precision-constant -fsplit-ivs-in-unroller @gol
>> +-fsplit-paths @gol
>>  -fsplit-wide-types -fssa-backprop -fssa-phiopt @gol
>>  -fstack-protector -fstack-protector-all -fstack-protector-strong @gol
>>  -fstack-protector-explicit -fstdarg-opt -fstrict-aliasing @gol
>> @@ -7171,6 +7173,11 @@ output on to @file{stderr}. If two conflicting dump
>> filenames are
>>  given for the same pass, then the latter option overrides the earlier
>>  one.
>>
>> +@item split-paths
>> +@opindex fdump-tree-split-paths
>> +Dump each function after splitting paths to loop backedges.  The file
>> +name is made by appending @file{.split-paths} to the source file name.
>> +
>>  @item all
>>  Turn on all options, except @option{raw}, @option{slim}, @option{verbose}
>>  and @option{lineno}.
>> @@ -7808,6 +7815,7 @@ also turns on the following optimization flags:
>>  -frerun-cse-after-loop  @gol
>>  -fsched-interblock  -fsched-spec @gol
>>  -fschedule-insns  -fschedule-insns2 @gol
>> +-fsplit-paths @gol
>>  -fstrict-aliasing -fstrict-overflow @gol
>>  -ftree-builtin-call-dce @gol
>>  -ftree-switch-conversion -ftree-tail-merge @gol
>> @@ -8821,7 +8829,7 @@ currently enabled, but may be enabled by @option{-O2}
>> in the future.
>>
>>  @item -ftree-sink
>>  @opindex ftree-sink
>> -Perform forward store motion  on trees.  This flag is
>> +Perform forward store motion on trees.  This flag is
>>  enabled by default at @option{-O} and higher.
>>
>>  @item -ftree-bit-ccp
>> @@ -9127,6 +9135,12 @@ enabled by default at @option{-O2} and higher.  Null
>> pointer check
>>  elimination is only done if @option{-fdelete-null-pointer-checks} is
>>  enabled.
>>
>> +@item -fsplit-paths
>> +@opindex fsplit-paths
>> +Split paths leading to loop backedges.  This can improve dead code
>> +elimination and common subexpression elimination.  This is enabled by
>> +default at @option{-O2} and above.
>> +
>>  @item -fsplit-ivs-in-unroller
>>  @opindex fsplit-ivs-in-unroller
>>  Enables expression of values of induction variables in later iterations
>> diff --git a/gcc/gimple-ssa-split-paths.c b/gcc/gimple-ssa-split-paths.c
>> new file mode 100644
>> index 0000000..602e916
>> --- /dev/null
>> +++ b/gcc/gimple-ssa-split-paths.c
>> @@ -0,0 +1,270 @@
>> +/* Support routines for Splitting Paths to loop backedges
>> +   Copyright (C) 2015 Free Software Foundation, Inc.
>> +   Contributed by Ajit Kumar Agarwal <ajitkum@xilinx.com>.
>> +
>> + This file is part of GCC.
>> +
>> + GCC is free software; you can redistribute it and/or modify
>> + it under the terms of the GNU General Public License as published by
>> + the Free Software Foundation; either version 3, or (at your option)
>> + any later version.
>> +
>> +GCC is distributed in the hope that it will be useful,
>> +but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> +GNU General Public License for more details.
>> +
>> +You should have received a copy of the GNU General Public License
>> +along with GCC; see the file COPYING3.  If not see
>> +<http://www.gnu.org/licenses/>.  */
>> +
>> +#include "config.h"
>> +#include "system.h"
>> +#include "coretypes.h"
>> +#include "backend.h"
>> +#include "tree.h"
>> +#include "gimple.h"
>> +#include "tree-pass.h"
>> +#include "cfganal.h"
>> +#include "cfgloop.h"
>> +#include "gimple-iterator.h"
>> +#include "tracer.h"
>> +
>> +/* Given LATCH, the latch block in a loop, see if the shape of the
>> +   path reaching LATCH is suitable for being split by duplication.
>> +   If so, return the block that will be duplicated into its predecessor
>> +   paths.  Else return NULL.  */
>> +
>> +static basic_block
>> +find_block_to_duplicate_for_splitting_paths (basic_block latch)
>> +{
>> +  /* We should have simple latches at this point.  So the latch should
>> +     have a single successor.  This implies the predecessor of the latch
>> +     likely has the loop exit.  And it's that predecessor we're most
>> +     interested in. To keep things simple, we're going to require that
>> +     the latch have a single predecessor too.  */
>> +  if (single_succ_p (latch) && single_pred_p (latch))
>> +    {
>> +      basic_block bb = get_immediate_dominator (CDI_DOMINATORS, latch);
>> +      gcc_assert (single_pred_edge (latch)->src == bb);
>> +
>> +      /* If BB has been marked as not to be duplicated, then honor that
>> +        request.  */
>> +      if (ignore_bb_p (bb))
>> +       return NULL;
>> +
>> +      gimple *last = gsi_stmt (gsi_last_nondebug_bb (bb));
>> +      /* The immediate dominator of the latch must end in a conditional.
>> */
>> +      if (!last || gimple_code (last) != GIMPLE_COND)
>> +       return NULL;
>> +
>> +      /* We're hoping that BB is a join point for an IF-THEN-ELSE diamond
>> +        region.  Verify that it is.
>> +
>> +        First, verify that BB has two predecessors (each arm of the
>> +        IF-THEN-ELSE) and two successors (the latch and exit).  */
>> +      if (EDGE_COUNT (bb->preds) == 2 && EDGE_COUNT (bb->succs) == 2)
>> +       {
>> +         /* Now verify that BB's immediate dominator ends in a
>> +            conditional as well.  */
>> +         basic_block bb_idom = get_immediate_dominator (CDI_DOMINATORS,
>> bb);
>> +         gimple *last = gsi_stmt (gsi_last_nondebug_bb (bb_idom));
>> +         if (!last || gimple_code (last) != GIMPLE_COND)
>> +           return NULL;
>> +
>> +         /* And that BB's immediate dominator's successors are the
>> +            the predecessors of BB.  */
>> +         if (!find_edge (bb_idom, EDGE_PRED (bb, 0)->src)
>> +             || !find_edge (bb_idom, EDGE_PRED (bb, 1)->src))
>> +           return NULL;
>> +
>> +         /* So at this point we have a simple diamond for an IF-THEN-ELSE
>> +            construct starting at BB_IDOM, with a join point at BB.  BB
>> +            pass control outside the loop or to the loop latch.
>> +
>> +            We're going to want to create two duplicates of BB, one for
>> +            each successor of BB_IDOM.  */
>> +         return bb;
>> +       }
>> +    }
>> +  return NULL;
>> +}
>> +
>> +/* Return TRUE if BB is a reasonable block to duplicate by examining
>> +   its size, false otherwise.  BB will always be a loop latch block.
>> +
>> +   Should this use the same tests as we do for jump threading?  */
>> +
>> +static bool
>> +is_feasible_trace (basic_block bb)
>> +{
>> +  int num_stmt = 0;
>> +  gimple_stmt_iterator gsi;
>> +
>> +  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
>> +    {
>> +      gimple *stmt = gsi_stmt (gsi);
>> +      if (!is_gimple_debug (stmt))
>> +       num_stmt++;
>> +    }
>> +
>> +  /* We may want to limit how many statements we copy.  */
>> +  if (num_stmt > 1)
>> +    return true;
>> +
>> +  return false;
>> +}
>> +
>> +/* If the immediate dominator of the latch of the loop is
>> +   block with conditional branch, then the loop latch  is
>> +   duplicated to its predecessors path preserving the SSA
>> +   semantics.
>> +
>> +   CFG before transformation.
>> +
>> +              2
>> +              |
>> +              |
>> +        +---->3
>> +        |    / \
>> +        |   /   \
>> +        |  4     5
>> +        |   \   /
>> +        |    \ /
>> +        |     6
>> +        |    / \
>> +        |   /   \
>> +        |  8     7
>> +        |  |     |
>> +        ---+     E
>> +
>> +
>> +
>> +    Block 8 is the latch.  We're going to make copies of block 6 (9 & 10)
>> +    and wire things up so they look like this:
>> +
>> +              2
>> +              |
>> +              |
>> +        +---->3
>> +        |    / \
>> +        |   /   \
>> +        |  4     5
>> +        |  |     |
>> +        |  |     |
>> +        |  9    10
>> +        |  |\   /|
>> +        |  | \ / |
>> +        |  |  7  |
>> +        |  |  |  |
>> +        |  |  E  |
>> +        |  |     |
>> +        |   \   /
>> +        |    \ /
>> +        +-----8
>> +
>> +
>> +    Blocks 9 and 10 will get merged into blocks 4 & 5 respectively which
>> +    enables CSE, DCE and other optimizations to occur on a larger block
>> +    of code.   */
>> +
>> +static bool
>> +split_paths ()
>> +{
>> +  bool changed = false;
>> +  loop_p loop;
>> +
>> +  loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS);
>> +  initialize_original_copy_tables ();
>> +  calculate_dominance_info (CDI_DOMINATORS);
>> +
>> +  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
>> +    {
>> +      /* See if there is a block that we can duplicate to split the
>> +        path to the loop latch.  */
>> +      basic_block bb = find_block_to_duplicate_for_splitting_paths
>> (loop->latch);
>> +
>> +      /* BB is the merge point for an IF-THEN-ELSE we want to transform.
>> +
>> +        Essentially we want to create two duplicates of BB and append
>> +        a duplicate to the THEN and ELSE clauses.  This will split the
>> +        path leading to the latch.  BB will be unreachable and removed.  */
>> +      if (bb && is_feasible_trace (bb))
>> +       {
>> +         if (dump_file && (dump_flags & TDF_DETAILS))
>> +           fprintf (dump_file,
>> +                    "Duplicating join block %d into predecessor paths\n",
>> +                    bb->index);
>> +         basic_block pred0 = EDGE_PRED (bb, 0)->src;
>> +         basic_block pred1 = EDGE_PRED (bb, 1)->src;
>> +         transform_duplicate (pred0, bb);
>> +         transform_duplicate (pred1, bb);
>> +         changed = true;
>> +       }
>> +    }
>> +
>> +  loop_optimizer_finalize ();
>> +  free_original_copy_tables ();
>> +  return changed;
>> +}
>> +
>> +/* Main entry point for splitting paths.  Returns TODO_cleanup_cfg if any
>> +   paths where split, otherwise return zero.  */
>> +
>> +static unsigned int
>> +execute_split_paths ()
>> +{
>> +  /* If we don't have at least 2 real blocks and backedges in the
>> +     CFG, then there's no point in trying to perform path splitting.  */
>> +  if (n_basic_blocks_for_fn (cfun) <= NUM_FIXED_BLOCKS + 1
>> +      || !mark_dfs_back_edges ())
>> +    return 0;
>> +
>> +  bool changed = split_paths();
>> +  if (changed)
>> +    free_dominance_info (CDI_DOMINATORS);
>> +
>> +  return changed ? TODO_cleanup_cfg : 0;
>> +}
>> +
>> +static bool
>> +gate_split_paths ()
>> +{
>> +  return flag_split_paths;
>> +}
>> +
>> +namespace {
>> +
>> +const pass_data pass_data_split_paths =
>> +{
>> +  GIMPLE_PASS, /* type */
>> +  "split-paths", /* name */
>> +  OPTGROUP_NONE, /* optinfo_flags */
>> +  TV_SPLIT_PATHS, /* tv_id */
>> +  PROP_ssa, /* properties_required */
>> +  0, /* properties_provided */
>> +  0, /* properties_destroyed */
>> +  0, /* todo_flags_start */
>> +  TODO_update_ssa, /* todo_flags_finish */
>> +};
>> +
>> +class pass_split_paths : public gimple_opt_pass
>> +{
>> +   public:
>> +    pass_split_paths (gcc::context *ctxt)
>> +      : gimple_opt_pass (pass_data_split_paths, ctxt)
>> +    {}
>> +   /* opt_pass methods: */
>> +   opt_pass * clone () { return new pass_split_paths (m_ctxt); }
>> +   virtual bool gate (function *) { return gate_split_paths (); }
>> +   virtual unsigned int execute (function *) { return execute_split_paths
>> (); }
>> +
>> +}; // class pass_split_paths
>> +
>> +} // anon namespace
>> +
>> +gimple_opt_pass *
>> +make_pass_split_paths (gcc::context *ctxt)
>> +{
>> +  return new pass_split_paths (ctxt);
>> +}
>> diff --git a/gcc/opts.c b/gcc/opts.c
>> index 930ae43..be04cf5 100644
>> --- a/gcc/opts.c
>> +++ b/gcc/opts.c
>> @@ -523,6 +523,7 @@ static const struct default_options
>> default_options_table[] =
>>      { OPT_LEVELS_2_PLUS, OPT_fisolate_erroneous_paths_dereference, NULL, 1
>> },
>>      { OPT_LEVELS_2_PLUS, OPT_fipa_ra, NULL, 1 },
>>      { OPT_LEVELS_2_PLUS, OPT_flra_remat, NULL, 1 },
>> +    { OPT_LEVELS_2_PLUS, OPT_fsplit_paths, NULL, 1 },
>>
>>      /* -O3 optimizations.  */
>>      { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 },
>> diff --git a/gcc/passes.def b/gcc/passes.def
>> index c0ab6b9..db822d3 100644
>> --- a/gcc/passes.def
>> +++ b/gcc/passes.def
>> @@ -274,6 +274,7 @@ along with GCC; see the file COPYING3.  If not see
>>        POP_INSERT_PASSES ()
>>        NEXT_PASS (pass_simduid_cleanup);
>>        NEXT_PASS (pass_lower_vector_ssa);
>> +      NEXT_PASS (pass_split_paths);
>>        NEXT_PASS (pass_cse_reciprocals);
>>        NEXT_PASS (pass_reassoc);
>>        NEXT_PASS (pass_strength_reduction);
>> diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
>> index 3301130..ee92aaf 100644
>> --- a/gcc/testsuite/ChangeLog
>> +++ b/gcc/testsuite/ChangeLog
>> @@ -1,3 +1,8 @@
>> +2015-11-13  Ajit Agarwal  <ajitkum@xilinx.com>
>> +            Jeff Law  <law@redhat.com>
>> +
>> +       * gcc.dg/tree-ssa/split-path-1.c: New test.
>> +
>>  2015-11-13  Nathan Sidwell  <nathan@codesourcery.com>
>>
>>         * c-c++-common/goacc/loop-auto-1.c: New.
>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/split-path-1.c
>> b/gcc/testsuite/gcc.dg/tree-ssa/split-path-1.c
>> new file mode 100644
>> index 0000000..1239892
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/split-path-1.c
>> @@ -0,0 +1,67 @@
>> +/* { dg-do run } */
>> +/* { dg-options "-O2 -fdump-tree-split-paths-details " } */
>> +
>> +#include <stdio.h>
>> +#include <stdlib.h>
>> +
>> +#define RGBMAX 255
>> +
>> +int
>> +test()
>> +{
>> +  int i, Pels;
>> +  unsigned char sum = 0;
>> +  unsigned char xr, xg, xb;
>> +  unsigned char xc, xm, xy, xk;
>> +  unsigned char *ReadPtr, *EritePtr;
>> +
>> +  ReadPtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
>> +  EritePtr = ( unsigned char *) malloc (sizeof (unsigned char) * 100);
>> +
>> +  for (i = 0; i < 100;i++)
>> +     {
>> +       ReadPtr[i] = 100 - i;
>> +     }
>> +
>> +  for (i = 0; i < 100; i++)
>> +     {
>> +       xr = *ReadPtr++;
>> +       xg = *ReadPtr++;
>> +       xb = *ReadPtr++;
>> +
>> +       xc = (unsigned char) (RGBMAX - xr);
>> +       xm = (unsigned char) (RGBMAX - xg);
>> +       xy = (unsigned char) (RGBMAX - xb);
>> +
>> +       if (xc < xm)
>> +         {
>> +           xk = (unsigned char) (xc < xy ? xc : xy);
>> +         }
>> +       else
>> +        {
>> +          xk = (unsigned char) (xm < xy ? xm : xy);
>> +        }
>> +
>> +       xc = (unsigned char) (xc - xk);
>> +       xm = (unsigned char) (xm - xk);
>> +       xy = (unsigned char) (xy - xk);
>> +
>> +       *EritePtr++ = xc;
>> +       *EritePtr++ = xm;
>> +       *EritePtr++ = xy;
>> +       *EritePtr++ = xk;
>> +       sum += *EritePtr;
>> +    }
>> +  return sum;
>> +}
>> +
>> +int
>> +main()
>> +{
>> +  if (test() != 33)
>> +    abort();
>> +
>> +  return 0;
>> +}
>> +
>> +/* { dg-final { scan-tree-dump "Duplicating join block" "split-paths" } }
>> */
>> diff --git a/gcc/timevar.def b/gcc/timevar.def
>> index b429faf..45e3b70 100644
>> --- a/gcc/timevar.def
>> +++ b/gcc/timevar.def
>> @@ -252,6 +252,7 @@ DEFTIMEVAR (TV_GCSE_AFTER_RELOAD     , "load CSE after
>> reload")
>>  DEFTIMEVAR (TV_REE                  , "ree")
>>  DEFTIMEVAR (TV_THREAD_PROLOGUE_AND_EPILOGUE, "thread pro- & epilogue")
>>  DEFTIMEVAR (TV_IFCVT2               , "if-conversion 2")
>> +DEFTIMEVAR (TV_SPLIT_PATHS          , "split paths")
>>  DEFTIMEVAR (TV_COMBINE_STACK_ADJUST  , "combine stack adjustments")
>>  DEFTIMEVAR (TV_PEEPHOLE2             , "peephole 2")
>>  DEFTIMEVAR (TV_RENAME_REGISTERS      , "rename registers")
>> diff --git a/gcc/tracer.c b/gcc/tracer.c
>> index 941dc20..c2dba4c 100644
>> --- a/gcc/tracer.c
>> +++ b/gcc/tracer.c
>> @@ -51,9 +51,9 @@
>>  #include "tree-inline.h"
>>  #include "cfgloop.h"
>>  #include "fibonacci_heap.h"
>> +#include "tracer.h"
>>
>>  static int count_insns (basic_block);
>> -static bool ignore_bb_p (const_basic_block);
>>  static bool better_p (const_edge, const_edge);
>>  static edge find_best_successor (basic_block);
>>  static edge find_best_predecessor (basic_block);
>> @@ -85,7 +85,7 @@ bb_seen_p (basic_block bb)
>>  }
>>
>>  /* Return true if we should ignore the basic block for purposes of tracing.
>> */
>> -static bool
>> +bool
>>  ignore_bb_p (const_basic_block bb)
>>  {
>>    if (bb->index < NUM_FIXED_BLOCKS)
>> @@ -226,6 +226,24 @@ find_trace (basic_block bb, basic_block *trace)
>>    return i;
>>  }
>>
>> +/* Duplicate block BB2, placing it after BB in the CFG.  Return the
>> +   newly created block.  */
>> +basic_block
>> +transform_duplicate (basic_block bb, basic_block bb2)
>> +{
>> +  edge e;
>> +  basic_block copy;
>> +
>> +  e = find_edge (bb, bb2);
>> +
>> +  copy = duplicate_block (bb2, e, bb);
>> +  flush_pending_stmts (e);
>> +
>> +  add_phi_args_after_copy (&copy, 1, NULL);
>> +
>> +  return (copy);
>> +}
>> +
>>  /* Look for basic blocks in frequency order, construct traces and tail
>> duplicate
>>     if profitable.  */
>>
>> @@ -321,17 +339,8 @@ tail_duplicate (void)
>>                  entries or at least rotate the loop.  */
>>               && bb2->loop_father->header != bb2)
>>             {
>> -             edge e;
>> -             basic_block copy;
>> -
>>               nduplicated += counts [bb2->index];
>> -
>> -             e = find_edge (bb, bb2);
>> -
>> -             copy = duplicate_block (bb2, e, bb);
>> -             flush_pending_stmts (e);
>> -
>> -             add_phi_args_after_copy (&copy, 1, NULL);
>> +             basic_block copy = transform_duplicate (bb, bb2);
>>
>>               /* Reconsider the original copy of block we've duplicated.
>>                  Removing the most common predecessor may make it to be
>> diff --git a/gcc/tracer.h b/gcc/tracer.h
>> new file mode 100644
>> index 0000000..cd1792a
>> --- /dev/null
>> +++ b/gcc/tracer.h
>> @@ -0,0 +1,26 @@
>> +/* Header file for Tracer.
>> +   Copyright (C) 2015 Free Software Foundation, Inc.
>> +
>> +This file is part of GCC.
>> +
>> +GCC is free software; you can redistribute it and/or modify it under
>> +the terms of the GNU General Public License as published by the Free
>> +Software Foundation; either version 3, or (at your option) any later
>> +version.
>> +
>> +GCC is distributed in the hope that it will be useful, but WITHOUT ANY
>> +WARRANTY; without even the implied warranty of MERCHANTABILITY or
>> +FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
>> + for more details.
>> +
>> +You should have received a copy of the GNU General Public License
>> +along with GCC; see the file COPYING3.  If not see
>> +<http://www.gnu.org/licenses/>.  */
>> +
>> +#ifndef GCC_TRACER_H
>> +#define GCC_TRACER_H
>> +
>> +extern basic_block transform_duplicate (basic_block bb, basic_block bb2);
>> +extern bool ignore_bb_p (const_basic_block bb);
>> +
>> +#endif /* GCC_TRACER_H */
>> diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
>> index 49e22a9..da67761 100644
>> --- a/gcc/tree-pass.h
>> +++ b/gcc/tree-pass.h
>> @@ -390,6 +390,7 @@ extern gimple_opt_pass *make_pass_tree_loop_done
>> (gcc::context *ctxt);
>>  extern gimple_opt_pass *make_pass_ch (gcc::context *ctxt);
>>  extern gimple_opt_pass *make_pass_ch_vect (gcc::context *ctxt);
>>  extern gimple_opt_pass *make_pass_ccp (gcc::context *ctxt);
>> +extern gimple_opt_pass *make_pass_split_paths (gcc::context *ctxt);
>>  extern gimple_opt_pass *make_pass_phi_only_cprop (gcc::context *ctxt);
>>  extern gimple_opt_pass *make_pass_build_ssa (gcc::context *ctxt);
>>  extern gimple_opt_pass *make_pass_build_alias (gcc::context *ctxt);
>>

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-12-03 14:38                                       ` Richard Biener
  2015-12-03 14:45                                         ` Richard Biener
@ 2015-12-03 15:46                                         ` Jeff Law
  2015-12-10 20:08                                         ` Jeff Law
  2 siblings, 0 replies; 72+ messages in thread
From: Jeff Law @ 2015-12-03 15:46 UTC (permalink / raw)
  To: Richard Biener
  Cc: Ajit Kumar Agarwal, GCC Patches, Vinod Kathail,
	Shail Aditya Gupta, Vidhumouli Hunsigida, Nagaraju Mekala

On 12/03/2015 07:38 AM, Richard Biener wrote:
>
> This pass is now enabled by default with -Os but has no limits on the amount of
> stmts it copies.  It also will make all loops with this shape have at least two
> exits (if the resulting loop will be disambiguated the inner loop will
> have two exits).
> Having more than one exit will disable almost all loop optimizations after it.
[ ... ]
split-paths in the queue -- behind addressing a couple of correctness 
issues that are on my plate (not split-paths related).  I'll respond 
fully.  FWIW, I wouldn't lose much sleep if this were disabled by 
default -- without the "sink-common-code-past-phi" stuff we've discussed 
in the past it's fairly hard to justify path-splitting this aggressively.

jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-12-03 14:38                                       ` Richard Biener
  2015-12-03 14:45                                         ` Richard Biener
  2015-12-03 15:46                                         ` Jeff Law
@ 2015-12-10 20:08                                         ` Jeff Law
  2015-12-11  9:11                                           ` Ajit Kumar Agarwal
  2015-12-11 10:06                                           ` Richard Biener
  2 siblings, 2 replies; 72+ messages in thread
From: Jeff Law @ 2015-12-10 20:08 UTC (permalink / raw)
  To: Richard Biener
  Cc: Ajit Kumar Agarwal, GCC Patches, Vinod Kathail,
	Shail Aditya Gupta, Vidhumouli Hunsigida, Nagaraju Mekala

On 12/03/2015 07:38 AM, Richard Biener wrote:
> This pass is now enabled by default with -Os but has no limits on the amount of
> stmts it copies.
The more statements it copies, the more likely it is that the path 
splitting will turn out to be useful!  It's counter-intuitive.

The primary benefit AFAICT with path splitting is that it exposes 
additional CSE, DCE, etc opportunities.

IIRC  Ajit posited that it could help with live/conflict analysis, I 
never saw that, and with the changes to push splitting deeper into the 
pipeline I'd further doubt it helps live/conflict analysis since that work 
also involved preserving the single latch property.



> It also will make all loops with this shape have at least two
> exits (if the resulting loop will be disambiguated the inner loop will
> have two exits).
> Having more than one exit will disable almost all loop optimizations after it.
Hmmm, the updated code keeps the single latch property, but I'm pretty 
sure it won't keep a single exit policy.

To keep a single exit policy would require keeping an additional block 
around.  Each of the split paths would unconditionally transfer to this 
new block.  The new block would then either transfer to the latch block 
or out of the loop.


>
> The pass itself documents the transform it does but does zero to motivate it.
>
> What's the benefit of this pass (apart from disrupting further optimizations)?
It's essentially building superblocks in a special case to enable 
additional CSE, DCE and the like.

Unfortunately what is missing is heuristics and de-duplication.  The 
former to drive cases when it's not useful and the latter to reduce 
codesize for any statements that did not participate in optimizations 
when they were duplicated.

The de-duplication is the "sink-statements-through-phi" problems, cross 
jumping, tail merging and the like class of problems.

It was only after I approved this code after twiddling it for Ajit that 
I came across Honza's tracer implementation, which may in fact be 
retargettable to these loops and do a better job.  I haven't 
experimented with that.



>
> I can see a _single_ case where duplicating the latch will allow threading
> one of the paths through the loop header to eliminate the original exit.  Then
> disambiguation may create a nice nested loop out of this.  Of course that
> is only profitable again if you know the remaining single exit of the inner
> loop (exiting to the outer one) is executed infrequently (thus the inner loop
> actually loops).
It wasn't ever about threading.

>
> But no checks other than on the CFG shape exist (oh, it checks it will
> at _least_ copy two stmts!).
Again, the more statements it copies the more likely it is to be 
profitable.  Think superblocks to expose CSE, DCE and the like.

>
> Given the profitability constraints above (well, correct me if I am
> wrong on these)
> it looks like the whole transform should be done within the FSM threading
> code which might be able to compute whether there will be an inner loop
> with a single exit only.
While it shares some concepts with jump threading, I don't think the 
transformation belongs in jump threading.

>
> I'm inclined to request the pass to be removed again or at least disabled by
> default.
I wouldn't lose any sleep if we disabled by default or removed, 
particularly if we can repurpose Honza's code.  In fact, I might 
strongly support the former until we hear back from Ajit on performance 
data.

I also keep coming back to Click's paper on code motion -- in that 
context, copying statements would be a way to break dependencies and 
give the global code motion algorithm more freedom.  The advantage of 
doing it in a framework like Click's is it's got a built-in sinking step.


>
> What closed source benchmark was this transform invented for?
I think it was EEMBC or Coremark.  Ajit should know for sure.  I was 
actually still hoping to see benchmark results from Ajit to confirm the 
new placement in the pipeline didn't negate all the benefits he saw.

jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-12-03 14:45                                         ` Richard Biener
@ 2015-12-10 20:12                                           ` Jeff Law
  0 siblings, 0 replies; 72+ messages in thread
From: Jeff Law @ 2015-12-10 20:12 UTC (permalink / raw)
  To: Richard Biener
  Cc: Ajit Kumar Agarwal, GCC Patches, Vinod Kathail,
	Shail Aditya Gupta, Vidhumouli Hunsigida, Nagaraju Mekala

On 12/03/2015 07:45 AM, Richard Biener wrote:

>
> Ah, some EEMBC one.
>
> Btw, the testcase that was added shows
>
>         if (xc < xm)
>           {
>             xk = (unsigned char) (xc < xy ? xc : xy);
>           }
>         else
>          {
>            xk = (unsigned char) (xm < xy ? xm : xy);
>          }
>
> which might be better handled by phiopt transforming it into
I don't think the included testcase is a particularly good one for this 
transformation -- I didn't see that the transformation made any 
significant difference on x86_64.  That's why I asked Ajit for more 
data on the benchmarking.


>
> xk = MIN (xc, MIN (xm, xy))
>
> phiopt1 sees (hooray to GENERIC folding)
>
>    xc_26 = ~xr_21;
>    xm_27 = ~xg_23;
>    xy_28 = ~xb_25;
>    if (xr_21 > xg_23)
>      goto <bb 5>;
>    else
>      goto <bb 6>;
>
>    <bb 5>:
>    xk_29 = MIN_EXPR <xc_26, xy_28>;
>    goto <bb 7>;
>
>    <bb 6>:
>    xk_30 = MIN_EXPR <xm_27, xy_28>;
>
>    <bb 7>:
>    # xk_4 = PHI <xk_29(5), xk_30(6)>
>
> btw, see PR67438 for a similar testcase and the above pattern.
That may be elsewhere in BZ database as well.  I've seen stuff that 
looks awful close to that when going through the bug lists in prior 
releases.

jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-12-10 20:08                                         ` Jeff Law
@ 2015-12-11  9:11                                           ` Ajit Kumar Agarwal
  2015-12-23  6:36                                             ` Jeff Law
  2015-12-11 10:06                                           ` Richard Biener
  1 sibling, 1 reply; 72+ messages in thread
From: Ajit Kumar Agarwal @ 2015-12-11  9:11 UTC (permalink / raw)
  To: Jeff Law, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

Hello Jeff:

Sorry for the delay in sending the benchmarks run with Split-Path change.

Here is the Summary of the results.

SPEC CPU 2000 INT benchmarks ( Target i386)
( Geomean Score without Split-Paths changes vs Geomean Score with Split-Path changes  =  3740.789 vs 3745.193).

SPEC CPU 2000 FP benchmarks. ( Target i386)
( Geomean Score without Split-Paths changes vs Geomean Score with Split-Path changes  =  4721.655 vs 4741.825).

Mibench/EEMBC benchmarks (Target Microblaze)

Automotive_qsort1(4.03%), Office_ispell(4.29%), Office_stringsearch1(3.5%), Telecom_adpcm_d(1.37%), ospfv2_lite(1.35%).

We are seeing minor negative gains (less than 0.5%) that are mainly noise.

Thanks & Regards
Ajit
-----Original Message-----
From: Jeff Law [mailto:law@redhat.com] 
Sent: Friday, December 11, 2015 1:39 AM
To: Richard Biener
Cc: Ajit Kumar Agarwal; GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation

On 12/03/2015 07:38 AM, Richard Biener wrote:
> This pass is now enabled by default with -Os but has no limits on the 
> amount of stmts it copies.
The more statements it copies, the more likely it is that the path spitting will turn out to be useful!  It's counter-intuitive.

The primary benefit AFAICT with path splitting is that it exposes additional CSE, DCE, etc opportunities.

IIRC  Ajit posited that it could help with live/conflict analysis, I never saw that, and with the changes to push splitting deeper into the pipeline I'd further life/conflict analysis since that work also involved preserving the single latch property.



  It also will make all loops with this shape have at least two
> exits (if the resulting loop will be disambiguated the inner loop will 
> have two exits).
> Having more than one exit will disable almost all loop optimizations after it.
Hmmm, the updated code keeps the single latch property, but I'm pretty sure it won't keep a single exit policy.

To keep a single exit policy would require keeping an additional block around.  Each of the split paths would unconditionally transfer to this new block.  The new block would then either transfer to the latch block or out of the loop.


>
> The pass itself documents the transform it does but does zero to motivate it.
>
> What's the benefit of this pass (apart from disrupting further optimizations)?
It's essentially building superblocks in a special case to enable additional CSE, DCE and the like.

Unfortunately what is is missing is heuristics and de-duplication.  The former to drive cases when it's not useful and the latter to reduce codesize for any statements that did not participate in optimizations when they were duplicated.

The de-duplication is the "sink-statements-through-phi" problems, cross jumping, tail merging and the like class of problems.

It was only after I approved this code after twiddling it for Ajit that I came across Honza's tracer implementation, which may in fact be retargettable to these loops and do a better job.  I haven't experimented with that.



>
> I can see a _single_ case where duplicating the latch will allow 
> threading one of the paths through the loop header to eliminate the 
> original exit.  Then disambiguation may create a nice nested loop out 
> of this.  Of course that is only profitable again if you know the 
> remaining single exit of the inner loop (exiting to the outer one) is 
> executed infrequently (thus the inner loop actually loops).
It wasn't ever about threading.

>
> But no checks other than on the CFG shape exist (oh, it checks it will 
> at _least_ copy two stmts!).
Again, the more statements it copies the more likely it is to be profitable.  Think superblocks to expose CSE, DCE and the like.

>
> Given the profitability constraints above (well, correct me if I am 
> wrong on these) it looks like the whole transform should be done 
> within the FSM threading code which might be able to compute whether 
> there will be an inner loop with a single exit only.
While it shares some concepts with jump threading, I don't think the transformation belongs in jump threading.

>
> I'm inclined to request the pass to be removed again or at least 
> disabled by default.
I wouldn't lose any sleep if we disabled by default or removed, particularly if we can repurpose Honza's code.  In fact, I might strongly support the former until we hear back from Ajit on performance data.

I also keep coming back to Click's paper on code motion -- in that context, copying statements would be a way to break dependencies and give the global code motion algorithm more freedom.  The advantage of doing it in a framework like Click's is it's got a built-in sinking step.


>
> What closed source benchmark was this transform invented for?
I think it was EEMBC or Coremark.  Ajit should know for sure.  I was actually still hoping to see benchmark results from Ajit to confirm the new placement in the pipeline didn't negate all the benefits he saw.

jeff


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-12-10 20:08                                         ` Jeff Law
  2015-12-11  9:11                                           ` Ajit Kumar Agarwal
@ 2015-12-11 10:06                                           ` Richard Biener
  2015-12-15 23:50                                             ` Jeff Law
  2015-12-17 23:41                                             ` Jeff Law
  1 sibling, 2 replies; 72+ messages in thread
From: Richard Biener @ 2015-12-11 10:06 UTC (permalink / raw)
  To: Jeff Law
  Cc: Ajit Kumar Agarwal, GCC Patches, Vinod Kathail,
	Shail Aditya Gupta, Vidhumouli Hunsigida, Nagaraju Mekala

On Thu, Dec 10, 2015 at 9:08 PM, Jeff Law <law@redhat.com> wrote:
> On 12/03/2015 07:38 AM, Richard Biener wrote:
>>
>> This pass is now enabled by default with -Os but has no limits on the
>> amount of
>> stmts it copies.
>
> The more statements it copies, the more likely it is that the path spitting
> will turn out to be useful!  It's counter-intuitive.

Well, it's still not appropriate for -Os (nor -O2 I think).  -ftracer is enabled
with -fprofile-use (but it is also properly driven to only trace hot paths)
and otherwise not by default at any optimization level.

> The primary benefit AFAICT with path splitting is that it exposes additional
> CSE, DCE, etc opportunities.
>
> IIRC  Ajit posited that it could help with live/conflict analysis, I never
> saw that, and with the changes to push splitting deeper into the pipeline
> I'd further life/conflict analysis since that work also involved preserving
> the single latch property.
>
>
>
>  It also will make all loops with this shape have at least two
>>
>> exits (if the resulting loop will be disambiguated the inner loop will
>> have two exits).
>> Having more than one exit will disable almost all loop optimizations after
>> it.
>
> Hmmm, the updated code keeps the single latch property, but I'm pretty sure
> it won't keep a single exit policy.
>
> To keep a single exit policy would require keeping an additional block
> around.  Each of the split paths would unconditionally transfer to this new
> block.  The new block would then either transfer to the latch block or out
> of the loop.

Don't see how this would work for the CFG pattern it operates on unless you
duplicate the exit condition into that new block creating an even more
obfuscated
CFG.

>
>>
>> The pass itself documents the transform it does but does zero to motivate
>> it.
>>
>> What's the benefit of this pass (apart from disrupting further
>> optimizations)?
>
> It's essentially building superblocks in a special case to enable additional
> CSE, DCE and the like.
>
> Unfortunately what is is missing is heuristics and de-duplication.  The
> former to drive cases when it's not useful and the latter to reduce codesize
> for any statements that did not participate in optimizations when they were
> duplicated.
>
> The de-duplication is the "sink-statements-through-phi" problems, cross
> jumping, tail merging and the like class of problems.
>
> It was only after I approved this code after twiddling it for Ajit that I
> came across Honza's tracer implementation, which may in fact be
> retargettable to these loops and do a better job.  I haven't experimented
> with that.

Well, I originally suggested to merge this with the tracer pass...

>> I can see a _single_ case where duplicating the latch will allow threading
>> one of the paths through the loop header to eliminate the original exit.
>> Then
>> disambiguation may create a nice nested loop out of this.  Of course that
>> is only profitable again if you know the remaining single exit of the
>> inner
>> loop (exiting to the outer one) is executed infrequently (thus the inner
>> loop
>> actually loops).
>
> It wasn't ever about threading.

I see.

>>
>> But no checks other than on the CFG shape exist (oh, it checks it will
>> at _least_ copy two stmts!).
>
> Again, the more statements it copies the more likely it is to be profitable.
> Think superblocks to expose CSE, DCE and the like.

Ok, so similar to tracer (where I think the main benefit is actually increasing
scheduling opportunities for architectures where it matters).

Note that both passes are placed quite late and thus won't see much
of the GIMPLE optimizations (DOM mainly).  I wonder why they were
not placed adjacent to each other.

>>
>> Given the profitability constraints above (well, correct me if I am
>> wrong on these)
>> it looks like the whole transform should be done within the FSM threading
>> code which might be able to compute whether there will be an inner loop
>> with a single exit only.
>
> While it shares some concepts with jump threading, I don't think the
> transformation belongs in jump threading.
>
>>
>> I'm inclined to request the pass to be removed again or at least disabled
>> by
>> default.
>
> I wouldn't lose any sleep if we disabled by default or removed, particularly
> if we can repurpose Honza's code.  In fact, I might strongly support the
> former until we hear back from Ajit on performance data.

See above for what we do with -ftracer.  path-splitting should at _least_
restrict itself to operate on optimize_loop_for_speed_p () loops.

It should also (even if counter-intuitive) limit the amount of stmt copying
it does - after all there is something like an instruction cache size which exceeding
for loops will never be a good idea (and even smaller special loop caches on
some archs).

Note that a better heuristic than "at least more than one stmt" would be
to have at least one PHI in the merger block.  Otherwise I don't see how
CSE opportunities could exist we don't see without the duplication.
And yes, more PHIs -> more possible CSE.  I wouldn't say so for
the number of stmts.  So please limit the number of stmt copies!
(after all we do limit the number of stmts we copy during jump threading!)

Richard.

> I also keep coming back to Click's paper on code motion -- in that context,
> copying statements would be a way to break dependencies and give the global
> code motion algorithm more freedom.  The advantage of doing it in a
> framework like Click's is it's got a built-in sinking step.
>
>
>>
>> What closed source benchmark was this transform invented for?
>
> I think it was EEMBC or Coremark.  Ajit should know for sure.  I was
> actually still hoping to see benchmark results from Ajit to confirm the new
> placement in the pipeline didn't negate all the benefits he saw.
>
> jeff
>

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-12-11 10:06                                           ` Richard Biener
@ 2015-12-15 23:50                                             ` Jeff Law
  2015-12-16  7:44                                               ` Ajit Kumar Agarwal
  2015-12-17 23:41                                             ` Jeff Law
  1 sibling, 1 reply; 72+ messages in thread
From: Jeff Law @ 2015-12-15 23:50 UTC (permalink / raw)
  To: Richard Biener
  Cc: Ajit Kumar Agarwal, GCC Patches, Vinod Kathail,
	Shail Aditya Gupta, Vidhumouli Hunsigida, Nagaraju Mekala

On 12/11/2015 03:05 AM, Richard Biener wrote:
> On Thu, Dec 10, 2015 at 9:08 PM, Jeff Law <law@redhat.com> wrote:
>> On 12/03/2015 07:38 AM, Richard Biener wrote:
>>>
>>> This pass is now enabled by default with -Os but has no limits on the
>>> amount of
>>> stmts it copies.
>>
>> The more statements it copies, the more likely it is that the path spitting
>> will turn out to be useful!  It's counter-intuitive.
>
> Well, it's still not appropriate for -Os (nor -O2 I think).  -ftracer is enabled
> with -fprofile-use (but it is also properly driven to only trace hot paths)
> and otherwise not by default at any optimization level.
Definitely not appropriate for -Os.  But as I mentioned, I really want 
to look at the tracer code as it may totally subsume path splitting.

>
> Don't see how this would work for the CFG pattern it operates on unless you
> duplicate the exit condition into that new block creating an even more
> obfuscated CFG.
Agreed, I don't see any way to fix the multiple exit problem.  Then 
again, this all runs after the tree loop optimizer, so I'm not sure how 
big of an issue it is in practice.


>> It was only after I approved this code after twiddling it for Ajit that I
>> came across Honza's tracer implementation, which may in fact be
>> retargettable to these loops and do a better job.  I haven't experimented
>> with that.
>
> Well, I originally suggested to merge this with the tracer pass...
I missed that, or it didn't sink into my brain.

>> Again, the more statements it copies the more likely it is to be profitable.
>> Think superblocks to expose CSE, DCE and the like.
>
> Ok, so similar to tracer (where I think the main benefit is actually increasing
> scheduling opportunities for architectures where it matters).
Right.  They're both building superblocks, which has the effect of 
larger windows for scheduling, DCE, CSE, etc.


>
> Note that both passes are placed quite late and thus won't see much
> of the GIMPLE optimizations (DOM mainly).  I wonder why they were
> not placed adjacent to each other.
Ajit had it fairly early, but that didn't play well with if-conversion. 
  I just pushed it past if-conversion and vectorization, but before the 
last DOM pass.  That turns out to be where tracer lives too as you noted.

>>
>> I wouldn't lose any sleep if we disabled by default or removed, particularly
>> if we can repurpose Honza's code.  In fact, I might strongly support the
>> former until we hear back from Ajit on performance data.
>
> See above for what we do with -ftracer.  path-splitting should at _least_
> restrict itself to operate on optimize_loop_for_speed_p () loops.
I think we need to decide if we want the code at all, particularly given 
the multiple-exit problem.

The difficulty is I think Ajit posted some recent data that shows it's 
helping.  So maybe the thing to do is ask Ajit to try the tracer 
independent of path splitting and take the obvious actions based on 
Ajit's data.


>
> It should also (even if counter-intuitive) limit the amount of stmt copying
> it does - after all there is sth like an instruction cache size which exceeeding
> for loops will never be a good idea (and even smaller special loop caches on
> some archs).
Yup.

>
> Note that a better heuristic than "at least more than one stmt" would be
> to have at least one PHI in the merger block.  Otherwise I don't see how
> CSE opportunities could exist we don't see without the duplication.
> And yes, more PHIs -> more possible CSE.  I wouldn't say so for
> the number of stmts.  So please limit the number of stmt copies!
> (after all we do limit the number of stmts we copy during jump threading!)
Let's get some more data before we try to tune path splitting.  In an 
ideal world, the tracer can handle this for us and we just remove path 
splitting completely.

Jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-12-15 23:50                                             ` Jeff Law
@ 2015-12-16  7:44                                               ` Ajit Kumar Agarwal
  2015-12-16  9:57                                                 ` Richard Biener
  0 siblings, 1 reply; 72+ messages in thread
From: Ajit Kumar Agarwal @ 2015-12-16  7:44 UTC (permalink / raw)
  To: Jeff Law, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

Hello Jeff:

Here is more of the data you asked for.

SPEC FP benchmarks.
a) No Path Splitting + tracer enabled 
    Geomean Score =  4749.726.
b) Path Splitting enabled + tracer enabled.
    Geomean Score =  4781.655.

Conclusion: With both Path Splitting and tracer enabled we got maximum gains. I think we need to have Path Splitting pass.

SPEC INT benchmarks.
a) Path Splitting enabled + tracer not enabled.
    Geomean Score =  3745.193.
b) No Path Splitting + tracer enabled.
    Geomean Score = 3738.558.
c) Path Splitting enabled + tracer enabled.
    Geomean Score = 3742.833.

Conclusion: We are getting more gains with Path Splitting as compared to tracer. With both Path Splitting and tracer enabled we are also getting  gains.
I think we should have Path Splitting pass.

One more observation: Richard's concern is that splitting paths creates multiple loop exits through duplication. My observation is that the tracer pass also
creates multiple exits through duplication. I don't think that is an issue in practice, considering the gains we are getting from splitting paths through
more PRE, CSE and DCE opportunities.

Thanks & Regards
Ajit 




-----Original Message-----
From: Jeff Law [mailto:law@redhat.com] 
Sent: Wednesday, December 16, 2015 5:20 AM
To: Richard Biener
Cc: Ajit Kumar Agarwal; GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation

On 12/11/2015 03:05 AM, Richard Biener wrote:
> On Thu, Dec 10, 2015 at 9:08 PM, Jeff Law <law@redhat.com> wrote:
>> On 12/03/2015 07:38 AM, Richard Biener wrote:
>>>
>>> This pass is now enabled by default with -Os but has no limits on 
>>> the amount of stmts it copies.
>>
>> The more statements it copies, the more likely it is that the path 
>> spitting will turn out to be useful!  It's counter-intuitive.
>
> Well, it's still not appropriate for -Os (nor -O2 I think).  -ftracer 
> is enabled with -fprofile-use (but it is also properly driven to only 
> trace hot paths) and otherwise not by default at any optimization level.
Definitely not appropriate for -Os.  But as I mentioned, I really want to look at the tracer code as it may totally subsume path splitting.

>
> Don't see how this would work for the CFG pattern it operates on 
> unless you duplicate the exit condition into that new block creating 
> an even more obfuscated CFG.
Agreed, I don't see any way to fix the multiple exit problem.  Then again, this all runs after the tree loop optimizer, so I'm not sure how big of an issue it is in practice.


>> It was only after I approved this code after twiddling it for Ajit 
>> that I came across Honza's tracer implementation, which may in fact 
>> be retargettable to these loops and do a better job.  I haven't 
>> experimented with that.
>
> Well, I originally suggested to merge this with the tracer pass...
I missed that, or it didn't sink into my brain.

>> Again, the more statements it copies the more likely it is to be profitable.
>> Think superblocks to expose CSE, DCE and the like.
>
> Ok, so similar to tracer (where I think the main benefit is actually 
> increasing scheduling opportunities for architectures where it matters).
Right.  They're both building superblocks, which has the effect of larger windows for scheduling, DCE, CSE, etc.


>
> Note that both passes are placed quite late and thus won't see much
> of the GIMPLE optimizations (DOM mainly).  I wonder why they were
> not placed adjacent to each other.
Ajit had it fairly early, but that didn't play well with if-conversion. 
  I just pushed it past if-conversion and vectorization, but before the 
last DOM pass.  That turns out to be where tracer lives too as you noted.

>>
>> I wouldn't lose any sleep if we disabled by default or removed, particularly
>> if we can repurpose Honza's code.  In fact, I might strongly support the
>> former until we hear back from Ajit on performance data.
>
> See above for what we do with -ftracer.  path-splitting should at _least_
> restrict itself to operate on optimize_loop_for_speed_p () loops.
I think we need to decide if we want the code at all, particularly given 
the multiple-exit problem.

The difficulty is I think Ajit posted some recent data that shows it's 
helping.  So maybe the thing to do is ask Ajit to try the tracer 
independent of path splitting and take the obvious actions based on 
Ajit's data.


>
> It should also (even if counter-intuitive) limit the amount of stmt copying
> it does - after all there is sth like an instruction cache size which exceeeding
> for loops will never be a good idea (and even smaller special loop caches on
> some archs).
Yup.

>
> Note that a better heuristic than "at least more than one stmt" would be
> to have at least one PHI in the merger block.  Otherwise I don't see how
> CSE opportunities could exist we don't see without the duplication.
> And yes, more PHIs -> more possible CSE.  I wouldn't say so for
> the number of stmts.  So please limit the number of stmt copies!
> (after all we do limit the number of stmts we copy during jump threading!)
Let's get some more data before we try to tune path splitting.  In an 
ideal world, the tracer can handle this for us and we just remove path 
splitting completely.

Jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-12-16  7:44                                               ` Ajit Kumar Agarwal
@ 2015-12-16  9:57                                                 ` Richard Biener
  2015-12-16 10:13                                                   ` Ajit Kumar Agarwal
  0 siblings, 1 reply; 72+ messages in thread
From: Richard Biener @ 2015-12-16  9:57 UTC (permalink / raw)
  To: Ajit Kumar Agarwal
  Cc: Jeff Law, GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

On Wed, Dec 16, 2015 at 8:43 AM, Ajit Kumar Agarwal
<ajit.kumar.agarwal@xilinx.com> wrote:
> Hello Jeff:
>
> Here is more of the data you asked for.
>
> SPEC FP benchmarks.
> a) No Path Splitting + tracer enabled
>     Geomean Score =  4749.726.
> b) Path Splitting enabled + tracer enabled.
>     Geomean Score =  4781.655.
>
> Conclusion: With both Path Splitting and tracer enabled we got the maximum gains. I think we need to have the Path Splitting pass.
>
> SPEC INT benchmarks.
> a) Path Splitting enabled + tracer not enabled.
>     Geomean Score =  3745.193.
> b) No Path Splitting + tracer enabled.
>     Geomean Score = 3738.558.
> c) Path Splitting enabled + tracer enabled.
>     Geomean Score = 3742.833.

I suppose with SPEC you mean SPEC CPU 2006?

Can you disclose the architecture you did the measurements on and the
compile flags you used otherwise?

Note that tracer does a very good job only when paired with FDO so can
you re-run SPEC with FDO and
compare with path-splitting enabled on top of that?

Thanks,
Richard.

> Conclusion: We are getting more gains with Path Splitting as compared to tracer. With both Path Splitting and tracer enabled we are also getting gains.
> I think we should have the Path Splitting pass.
>
> One more observation: Richard's concern is the creation of multiple exits when Splitting paths through duplication. My observation is that the tracer pass also
> creates multiple exits through duplication. I don’t think that’s an issue in practice, considering the gains we are getting from Splitting paths through
> more PRE, CSE and DCE.
>
> Thanks & Regards
> Ajit
>
>
>
>
> -----Original Message-----
> From: Jeff Law [mailto:law@redhat.com]
> Sent: Wednesday, December 16, 2015 5:20 AM
> To: Richard Biener
> Cc: Ajit Kumar Agarwal; GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
> Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
>
> On 12/11/2015 03:05 AM, Richard Biener wrote:
>> On Thu, Dec 10, 2015 at 9:08 PM, Jeff Law <law@redhat.com> wrote:
>>> On 12/03/2015 07:38 AM, Richard Biener wrote:
>>>>
>>>> This pass is now enabled by default with -Os but has no limits on
>>>> the amount of stmts it copies.
>>>
>>> The more statements it copies, the more likely it is that the path
>>> splitting will turn out to be useful!  It's counter-intuitive.
>>
>> Well, it's still not appropriate for -Os (nor -O2 I think).  -ftracer
>> is enabled with -fprofile-use (but it is also properly driven to only
>> trace hot paths) and otherwise not by default at any optimization level.
> Definitely not appropriate for -Os.  But as I mentioned, I really want to look at the tracer code as it may totally subsume path splitting.
>
>>
>> Don't see how this would work for the CFG pattern it operates on
>> unless you duplicate the exit condition into that new block creating
>> an even more obfuscated CFG.
> Agreed, I don't see any way to fix the multiple exit problem.  Then again, this all runs after the tree loop optimizer, so I'm not sure how big of an issue it is in practice.
>
>
>>> It was only after I approved this code after twiddling it for Ajit
>>> that I came across Honza's tracer implementation, which may in fact
>>> be retargettable to these loops and do a better job.  I haven't
>>> experimented with that.
>>
>> Well, I originally suggested to merge this with the tracer pass...
> I missed that, or it didn't sink into my brain.
>
>>> Again, the more statements it copies the more likely it is to be profitable.
>>> Think superblocks to expose CSE, DCE and the like.
>>
>> Ok, so similar to tracer (where I think the main benefit is actually
>> increasing scheduling opportunities for architectures where it matters).
> Right.  They're both building superblocks, which has the effect of larger windows for scheduling, DCE, CSE, etc.
>
>
>>
>> Note that both passes are placed quite late and thus won't see much
>> of the GIMPLE optimizations (DOM mainly).  I wonder why they were
>> not placed adjacent to each other.
> Ajit had it fairly early, but that didn't play well with if-conversion.
>   I just pushed it past if-conversion and vectorization, but before the
> last DOM pass.  That turns out to be where tracer lives too as you noted.
>
>>>
>>> I wouldn't lose any sleep if we disabled by default or removed, particularly
>>> if we can repurpose Honza's code.  In fact, I might strongly support the
>>> former until we hear back from Ajit on performance data.
>>
>> See above for what we do with -ftracer.  path-splitting should at _least_
>> restrict itself to operate on optimize_loop_for_speed_p () loops.
> I think we need to decide if we want the code at all, particularly given
> the multiple-exit problem.
>
> The difficulty is I think Ajit posted some recent data that shows it's
> helping.  So maybe the thing to do is ask Ajit to try the tracer
> independent of path splitting and take the obvious actions based on
> Ajit's data.
>
>
>>
>> It should also (even if counter-intuitive) limit the amount of stmt copying
>> it does - after all there is sth like an instruction cache size which exceeding
>> for loops will never be a good idea (and even smaller special loop caches on
>> some archs).
> Yup.
>
>>
>> Note that a better heuristic than "at least more than one stmt" would be
>> to have at least one PHI in the merger block.  Otherwise I don't see how
>> CSE opportunities could exist we don't see without the duplication.
>> And yes, more PHIs -> more possible CSE.  I wouldn't say so for
>> the number of stmts.  So please limit the number of stmt copies!
>> (after all we do limit the number of stmts we copy during jump threading!)
> Let's get some more data before we try to tune path splitting.  In an
> ideal world, the tracer can handle this for us and we just remove path
> splitting completely.
>
> Jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-12-16  9:57                                                 ` Richard Biener
@ 2015-12-16 10:13                                                   ` Ajit Kumar Agarwal
  2015-12-17 10:38                                                     ` Ajit Kumar Agarwal
  0 siblings, 1 reply; 72+ messages in thread
From: Ajit Kumar Agarwal @ 2015-12-16 10:13 UTC (permalink / raw)
  To: Richard Biener
  Cc: Jeff Law, GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala



-----Original Message-----
From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-owner@gcc.gnu.org] On Behalf Of Richard Biener
Sent: Wednesday, December 16, 2015 3:27 PM
To: Ajit Kumar Agarwal
Cc: Jeff Law; GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation

On Wed, Dec 16, 2015 at 8:43 AM, Ajit Kumar Agarwal <ajit.kumar.agarwal@xilinx.com> wrote:
> Hello Jeff:
>
> Here is more of a data you have asked for.
>
> SPEC FP benchmarks.
> a) No Path Splitting + tracer enabled
>     Geomean Score =  4749.726.
> b) Path Splitting enabled + tracer enabled.
>     Geomean Score =  4781.655.
>
> Conclusion: With both Path Splitting and tracer enabled we got maximum gains. I think we need to have Path Splitting pass.
>
> SPEC INT benchmarks.
> a) Path Splitting enabled + tracer not enabled.
>     Geomean Score =  3745.193.
> b) No Path Splitting + tracer enabled.
>     Geomean Score = 3738.558.
> c) Path Splitting enabled + tracer enabled.
>     Geomean Score = 3742.833.

>>I suppose with SPEC you mean SPEC CPU 2006?

The performance data is with respect to SPEC CPU 2000 benchmarks.

>>Can you disclose the architecture you did the measurements on and the compile flags you used otherwise?

Intel(R) Xeon(R) CPU E5-2687W v3 @ 3.10GHz 
cpu cores       : 10
cache size      : 25600 KB

I used -O3 and enabled the tracer with -ftracer.

Thanks & Regards
Ajit
>>Note that tracer does a very good job only when paired with FDO so can you re-run SPEC with FDO and compare with path-splitting enabled on top of that?


Thanks,
Richard.

> Conclusion: We are getting more gains with Path Splitting as compared to tracer. With both Path Splitting and tracer enabled we are also getting  gains.
> I think we should have Path Splitting pass.
>
> One more observation: Richard's concern is the creation of multiple 
> exits with Splitting paths through duplication. My observation is,  in 
> tracer pass also there is a creation of multiple exits through duplication. I don’t think that’s an issue with the practicality considering the gains we are getting with Splitting paths with more PRE, CSE and DCE.
>
> Thanks & Regards
> Ajit
>
>
>
>
> -----Original Message-----
> From: Jeff Law [mailto:law@redhat.com]
> Sent: Wednesday, December 16, 2015 5:20 AM
> To: Richard Biener
> Cc: Ajit Kumar Agarwal; GCC Patches; Vinod Kathail; Shail Aditya 
> Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
> Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on 
> tree ssa representation
>
> On 12/11/2015 03:05 AM, Richard Biener wrote:
>> On Thu, Dec 10, 2015 at 9:08 PM, Jeff Law <law@redhat.com> wrote:
>>> On 12/03/2015 07:38 AM, Richard Biener wrote:
>>>>
>>>> This pass is now enabled by default with -Os but has no limits on 
>>>> the amount of stmts it copies.
>>>
>>> The more statements it copies, the more likely it is that the path 
>>> splitting will turn out to be useful!  It's counter-intuitive.
>>
>> Well, it's still not appropriate for -Os (nor -O2 I think).  -ftracer 
>> is enabled with -fprofile-use (but it is also properly driven to only 
>> trace hot paths) and otherwise not by default at any optimization level.
> Definitely not appropriate for -Os.  But as I mentioned, I really want to look at the tracer code as it may totally subsume path splitting.
>
>>
>> Don't see how this would work for the CFG pattern it operates on 
>> unless you duplicate the exit condition into that new block creating 
>> an even more obfuscated CFG.
> Agreed, I don't see any way to fix the multiple exit problem.  Then again, this all runs after the tree loop optimizer, so I'm not sure how big of an issue it is in practice.
>
>
>>> It was only after I approved this code after twiddling it for Ajit 
>>> that I came across Honza's tracer implementation, which may in fact 
>>> be retargettable to these loops and do a better job.  I haven't 
>>> experimented with that.
>>
>> Well, I originally suggested to merge this with the tracer pass...
> I missed that, or it didn't sink into my brain.
>
>>> Again, the more statements it copies the more likely it is to be profitable.
>>> Think superblocks to expose CSE, DCE and the like.
>>
>> Ok, so similar to tracer (where I think the main benefit is actually 
>> increasing scheduling opportunities for architectures where it matters).
> Right.  They're both building superblocks, which has the effect of larger windows for scheduling, DCE, CSE, etc.
>
>
>>
>> Note that both passes are placed quite late and thus won't see much 
>> of the GIMPLE optimizations (DOM mainly).  I wonder why they were not 
>> placed adjacent to each other.
> Ajit had it fairly early, but that didn't play well with if-conversion.
>   I just pushed it past if-conversion and vectorization, but before 
> the last DOM pass.  That turns out to be where tracer lives too as you noted.
>
>>>
>>> I wouldn't lose any sleep if we disabled by default or removed, 
>>> particularly if we can repurpose Honza's code.  In fact, I might 
>>> strongly support the former until we hear back from Ajit on performance data.
>>
>> See above for what we do with -ftracer.  path-splitting should at 
>> _least_ restrict itself to operate on optimize_loop_for_speed_p () loops.
> I think we need to decide if we want the code at all, particularly 
> given the multiple-exit problem.
>
> The difficulty is I think Ajit posted some recent data that shows it's 
> helping.  So maybe the thing to do is ask Ajit to try the tracer 
> independent of path splitting and take the obvious actions based on 
> Ajit's data.
>
>
>>
>> It should also (even if counter-intuitive) limit the amount of stmt 
>> copying it does - after all there is sth like an instruction cache 
>> size which exceeding for loops will never be a good idea (and even 
>> smaller special loop caches on some archs).
> Yup.
>
>>
>> Note that a better heuristic than "at least more than one stmt" would 
>> be to have at least one PHI in the merger block.  Otherwise I don't 
>> see how CSE opportunities could exist we don't see without the duplication.
>> And yes, more PHIs -> more possible CSE.  I wouldn't say so for the 
>> number of stmts.  So please limit the number of stmt copies!
>> (after all we do limit the number of stmts we copy during jump 
>> threading!)
> Let's get some more data before we try to tune path splitting.  In an 
> ideal world, the tracer can handle this for us and we just remove path 
> splitting completely.
>
> Jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-12-16 10:13                                                   ` Ajit Kumar Agarwal
@ 2015-12-17 10:38                                                     ` Ajit Kumar Agarwal
  0 siblings, 0 replies; 72+ messages in thread
From: Ajit Kumar Agarwal @ 2015-12-17 10:38 UTC (permalink / raw)
  To: Jeff Law, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

Hello Jeff and Richard:

Here is the summary of the FDO (Feedback Directed Optimization) performance results.

SPEC CPU2000 INT benchmarks.
a) FDO + Splitting Paths enabled + tracer enabled
     Geomean Score = 3907.751673.
b) FDO + No Splitting Paths + tracer enabled
     Geomean Score = 3895.191536.

SPEC CPU2000 FP benchmarks.
a) FDO + Splitting Paths enabled + tracer enabled
     Geomean Score = 4793.321963
b) FDO + No Splitting Paths + tracer enabled
     Geomean Score = 4770.855467

The gains are largest with Split Paths enabled + tracer pass enabled, as compared to No Split Paths + tracer enabled. The
Split Paths pass is very much needed.

Thanks & Regards
Ajit

-----Original Message-----
From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-owner@gcc.gnu.org] On Behalf Of Ajit Kumar Agarwal
Sent: Wednesday, December 16, 2015 3:44 PM
To: Richard Biener
Cc: Jeff Law; GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
Subject: RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation



-----Original Message-----
From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-owner@gcc.gnu.org] On Behalf Of Richard Biener
Sent: Wednesday, December 16, 2015 3:27 PM
To: Ajit Kumar Agarwal
Cc: Jeff Law; GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation

On Wed, Dec 16, 2015 at 8:43 AM, Ajit Kumar Agarwal <ajit.kumar.agarwal@xilinx.com> wrote:
> Hello Jeff:
>
> Here is more of a data you have asked for.
>
> SPEC FP benchmarks.
> a) No Path Splitting + tracer enabled
>     Geomean Score =  4749.726.
> b) Path Splitting enabled + tracer enabled.
>     Geomean Score =  4781.655.
>
> Conclusion: With both Path Splitting and tracer enabled we got maximum gains. I think we need to have Path Splitting pass.
>
> SPEC INT benchmarks.
> a) Path Splitting enabled + tracer not enabled.
>     Geomean Score =  3745.193.
> b) No Path Splitting + tracer enabled.
>     Geomean Score = 3738.558.
> c) Path Splitting enabled + tracer enabled.
>     Geomean Score = 3742.833.

>>I suppose with SPEC you mean SPEC CPU 2006?

The performance data is with respect to SPEC CPU 2000 benchmarks.

>>Can you disclose the architecture you did the measurements on and the compile flags you used otherwise?

Intel(R) Xeon(R) CPU E5-2687W v3 @ 3.10GHz 
cpu cores       : 10
cache size      : 25600 KB

I have used -O3 and enable the tracer with  -ftracer .

Thanks & Regards
Ajit
>>Note that tracer does a very good job only when paired with FDO so can you re-run SPEC with FDO and compare with path-splitting enabled on top of that?


Thanks,
Richard.

> Conclusion: We are getting more gains with Path Splitting as compared to tracer. With both Path Splitting and tracer enabled we are also getting  gains.
> I think we should have Path Splitting pass.
>
> One more observation: Richard's concern is the creation of multiple 
> exits with Splitting paths through duplication. My observation is,  in 
> tracer pass also there is a creation of multiple exits through duplication. I don’t think that’s an issue with the practicality considering the gains we are getting with Splitting paths with more PRE, CSE and DCE.
>
> Thanks & Regards
> Ajit
>
>
>
>
> -----Original Message-----
> From: Jeff Law [mailto:law@redhat.com]
> Sent: Wednesday, December 16, 2015 5:20 AM
> To: Richard Biener
> Cc: Ajit Kumar Agarwal; GCC Patches; Vinod Kathail; Shail Aditya 
> Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
> Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on 
> tree ssa representation
>
> On 12/11/2015 03:05 AM, Richard Biener wrote:
>> On Thu, Dec 10, 2015 at 9:08 PM, Jeff Law <law@redhat.com> wrote:
>>> On 12/03/2015 07:38 AM, Richard Biener wrote:
>>>>
>>>> This pass is now enabled by default with -Os but has no limits on 
>>>> the amount of stmts it copies.
>>>
>>> The more statements it copies, the more likely it is that the path 
>>> splitting will turn out to be useful!  It's counter-intuitive.
>>
>> Well, it's still not appropriate for -Os (nor -O2 I think).  -ftracer 
>> is enabled with -fprofile-use (but it is also properly driven to only 
>> trace hot paths) and otherwise not by default at any optimization level.
> Definitely not appropriate for -Os.  But as I mentioned, I really want to look at the tracer code as it may totally subsume path splitting.
>
>>
>> Don't see how this would work for the CFG pattern it operates on 
>> unless you duplicate the exit condition into that new block creating 
>> an even more obfuscated CFG.
> Agreed, I don't see any way to fix the multiple exit problem.  Then again, this all runs after the tree loop optimizer, so I'm not sure how big of an issue it is in practice.
>
>
>>> It was only after I approved this code after twiddling it for Ajit 
>>> that I came across Honza's tracer implementation, which may in fact 
>>> be retargettable to these loops and do a better job.  I haven't 
>>> experimented with that.
>>
>> Well, I originally suggested to merge this with the tracer pass...
> I missed that, or it didn't sink into my brain.
>
>>> Again, the more statements it copies the more likely it is to be profitable.
>>> Think superblocks to expose CSE, DCE and the like.
>>
>> Ok, so similar to tracer (where I think the main benefit is actually 
>> increasing scheduling opportunities for architectures where it matters).
> Right.  They're both building superblocks, which has the effect of larger windows for scheduling, DCE, CSE, etc.
>
>
>>
>> Note that both passes are placed quite late and thus won't see much 
>> of the GIMPLE optimizations (DOM mainly).  I wonder why they were not 
>> placed adjacent to each other.
> Ajit had it fairly early, but that didn't play well with if-conversion.
>   I just pushed it past if-conversion and vectorization, but before 
> the last DOM pass.  That turns out to be where tracer lives too as you noted.
>
>>>
>>> I wouldn't lose any sleep if we disabled by default or removed, 
>>> particularly if we can repurpose Honza's code.  In fact, I might 
>>> strongly support the former until we hear back from Ajit on performance data.
>>
>> See above for what we do with -ftracer.  path-splitting should at 
>> _least_ restrict itself to operate on optimize_loop_for_speed_p () loops.
> I think we need to decide if we want the code at all, particularly 
> given the multiple-exit problem.
>
> The difficulty is I think Ajit posted some recent data that shows it's 
> helping.  So maybe the thing to do is ask Ajit to try the tracer 
> independent of path splitting and take the obvious actions based on 
> Ajit's data.
>
>
>>
>> It should also (even if counter-intuitive) limit the amount of stmt 
>> copying it does - after all there is sth like an instruction cache 
>> size which exceeding for loops will never be a good idea (and even 
>> smaller special loop caches on some archs).
> Yup.
>
>>
>> Note that a better heuristic than "at least more than one stmt" would 
>> be to have at least one PHI in the merger block.  Otherwise I don't 
>> see how CSE opportunities could exist we don't see without the duplication.
>> And yes, more PHIs -> more possible CSE.  I wouldn't say so for the 
>> number of stmts.  So please limit the number of stmt copies!
>> (after all we do limit the number of stmts we copy during jump
>> threading!)
> Let's get some more data before we try to tune path splitting.  In an 
> ideal world, the tracer can handle this for us and we just remove path 
> splitting completely.
>
> Jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-12-11 10:06                                           ` Richard Biener
  2015-12-15 23:50                                             ` Jeff Law
@ 2015-12-17 23:41                                             ` Jeff Law
  2015-12-18 15:43                                               ` Zamyatin, Igor
  1 sibling, 1 reply; 72+ messages in thread
From: Jeff Law @ 2015-12-17 23:41 UTC (permalink / raw)
  To: Richard Biener
  Cc: Ajit Kumar Agarwal, GCC Patches, Vinod Kathail,
	Shail Aditya Gupta, Vidhumouli Hunsigida, Nagaraju Mekala

On 12/11/2015 03:05 AM, Richard Biener wrote:
> On Thu, Dec 10, 2015 at 9:08 PM, Jeff Law <law@redhat.com> wrote:
>> On 12/03/2015 07:38 AM, Richard Biener wrote:
>>>
>>> This pass is now enabled by default with -Os but has no limits on the
>>> amount of
>>> stmts it copies.
>>
>> The more statements it copies, the more likely it is that the path splitting
>> will turn out to be useful!  It's counter-intuitive.
>
> Well, it's still not appropriate for -Os (nor -O2 I think).  -ftracer is enabled
> with -fprofile-use (but it is also properly driven to only trace hot paths)
> and otherwise not by default at any optimization level.
I've just committed a patch to limit to loops we're optimizing for speed 
and moved the transformation from -O2 to -O3.

I put in some instrumentation to see when this was triggering and, as 
expected the vast majority of triggers are with very small blocks, 2-3 
statements.  But those are probably the least interesting.  There are 
limited instances where it triggers on large blocks (say > 10 
statements).  But those were with GCC sources.  I'm going to pull out 
SPEC and do some instrumented builds with that, obviously focusing on 
those benchmarks where Ajit saw improvements.

>> Hmmm, the updated code keeps the single latch property, but I'm pretty sure
>> it won't keep a single exit policy.
>>
>> To keep a single exit policy would require keeping an additional block
>> around.  Each of the split paths would unconditionally transfer to this new
>> block.  The new block would then either transfer to the latch block or out
>> of the loop.
>
> Don't see how this would work for the CFG pattern it operates on unless you
> duplicate the exit condition into that new block creating an even more
> obfuscated CFG.
Upon further reflection, I don't think this is important as the pass 
runs after the tree loop optimizers.

>
> Note that both passes are placed quite late and thus won't see much
> of the GIMPLE optimizations (DOM mainly).  I wonder why they were
> not placed adjacent to each other.
I'm going to move them to be adjacent.  If for no other reason than 
it'll make comparisons easier without having to worry about any passes 
between them.  I suspect that'll drop in tonight after I get the kids to 
sleep :-)

Jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-12-17 23:41                                             ` Jeff Law
@ 2015-12-18 15:43                                               ` Zamyatin, Igor
  0 siblings, 0 replies; 72+ messages in thread
From: Zamyatin, Igor @ 2015-12-18 15:43 UTC (permalink / raw)
  To: Jeff Law, Richard Biener
  Cc: Ajit Kumar Agarwal, GCC Patches, Vinod Kathail,
	Shail Aditya Gupta, Vidhumouli Hunsigida, Nagaraju Mekala,
	ysrumyan

> On 12/11/2015 03:05 AM, Richard Biener wrote:
> > On Thu, Dec 10, 2015 at 9:08 PM, Jeff Law <law@redhat.com> wrote:
> >> On 12/03/2015 07:38 AM, Richard Biener wrote:
> >>>
> >>> This pass is now enabled by default with -Os but has no limits on
> >>> the amount of stmts it copies.
> >>
> >> The more statements it copies, the more likely it is that the path
> >> splitting will turn out to be useful!  It's counter-intuitive.
> >
> > Well, it's still not appropriate for -Os (nor -O2 I think).  -ftracer
> > is enabled with -fprofile-use (but it is also properly driven to only
> > trace hot paths) and otherwise not by default at any optimization level.
> I've just committed a patch to limit to loops we're optimizing for speed and
> moved the transformation from -O2 to -O3.
> 
> I put in some instrumentation to see when this was triggering and, as
> expected the vast majority of triggers are with very small blocks, 2-3
> statements.  But those are probably the least interesting.  There's limited
> instances where it triggers on large blocks (say > 10 statements).  But those
> were with GCC sources.  I'm going to pull out SPEC and do some
> instrumented builds with that, obviously focusing on those benchmarks
> where Ajit saw improvements.

I measured spec2000 and spec2006 on Haswell with recent GCC with the following options - -Ofast -funroll-loops -flto -static -m64 -march=core-avx2 and saw no visible changes in performance for 2 runs, with and without -fno-split-paths.
Note also that there are a couple of existing issues with path splitting - PR68541 and PR68522 (which is currently being bisected).

Thanks,
Igor


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-12-11  9:11                                           ` Ajit Kumar Agarwal
@ 2015-12-23  6:36                                             ` Jeff Law
  2015-12-25  8:40                                               ` Ajit Kumar Agarwal
  2016-01-04 14:32                                               ` Ajit Kumar Agarwal
  0 siblings, 2 replies; 72+ messages in thread
From: Jeff Law @ 2015-12-23  6:36 UTC (permalink / raw)
  To: Ajit Kumar Agarwal, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

On 12/11/2015 02:11 AM, Ajit Kumar Agarwal wrote:
>
> Mibench/EEMBC benchmarks (Target Microblaze)
>
> Automotive_qsort1(4.03%), Office_ispell(4.29%), Office_stringsearch1(3.5%). Telecom_adpcm_d( 1.37%), ospfv2_lite(1.35%).
I'm having a real tough time reproducing any of these results.  In fact, 
I'm having a tough time seeing cases where path splitting even applies 
to the Mibench/EEMBC benchmarks mentioned above.

In the very few cases where split-paths might apply, the net resulting 
assembly code I get is the same with and without split-paths.

How consistent are these results?

What functions are being affected that in turn impact performance?

What options are you using to compile the benchmarks?  I'm trying with 
-O2 -fsplit-paths and -O3 in my attempts to trigger the transformation 
so that I can look more closely at possible heuristics.

Is this with the standard microblaze-elf target?  Or with some other target?

jeff


^ permalink raw reply	[flat|nested] 72+ messages in thread

* RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-12-23  6:36                                             ` Jeff Law
@ 2015-12-25  8:40                                               ` Ajit Kumar Agarwal
  2016-01-02  7:32                                                 ` Jeff Law
  2016-01-04 14:32                                               ` Ajit Kumar Agarwal
  1 sibling, 1 reply; 72+ messages in thread
From: Ajit Kumar Agarwal @ 2015-12-25  8:40 UTC (permalink / raw)
  To: Jeff Law, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

Hello Jeff:

I am out on vacation till 3rd Jan 2016.
Is it okay if I respond to the below once I am back in the office?

Thanks & Regards
Ajit

-----Original Message-----
From: Jeff Law [mailto:law@redhat.com] 
Sent: Wednesday, December 23, 2015 12:06 PM
To: Ajit Kumar Agarwal; Richard Biener
Cc: GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation

On 12/11/2015 02:11 AM, Ajit Kumar Agarwal wrote:
>
> Mibench/EEMBC benchmarks (Target Microblaze)
>
> Automotive_qsort1(4.03%), Office_ispell(4.29%), Office_stringsearch1(3.5%). Telecom_adpcm_d( 1.37%), ospfv2_lite(1.35%).
I'm having a real tough time reproducing any of these results.  In fact, I'm having a tough time seeing cases where path splitting even applies to the Mibench/EEMBC benchmarks mentioned above.

In the very few cases where split-paths might apply, the net resulting assembly code I get is the same with and without split-paths.

How consistent are these results?

What functions are being affected that in turn impact performance?

What options are you using to compile the benchmarks?  I'm trying with
-O2 -fsplit-paths and -O3 in my attempts to trigger the transformation so that I can look more closely at possible heuristics.

Is this with the standard microblaze-elf target?  Or with some other target?

jeff



^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-12-25  8:40                                               ` Ajit Kumar Agarwal
@ 2016-01-02  7:32                                                 ` Jeff Law
  0 siblings, 0 replies; 72+ messages in thread
From: Jeff Law @ 2016-01-02  7:32 UTC (permalink / raw)
  To: Ajit Kumar Agarwal, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

On 12/25/2015 01:40 AM, Ajit Kumar Agarwal wrote:
> Hello Jeff:
>
> I am out on vacation till 3rd Jan 2016.
> Is it okay If I respond on the below once I am back in office.
Yes.  I'm on vacation until then as well.

Jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2015-12-23  6:36                                             ` Jeff Law
  2015-12-25  8:40                                               ` Ajit Kumar Agarwal
@ 2016-01-04 14:32                                               ` Ajit Kumar Agarwal
  2016-01-13  8:10                                                 ` Jeff Law
                                                                   ` (2 more replies)
  1 sibling, 3 replies; 72+ messages in thread
From: Ajit Kumar Agarwal @ 2016-01-04 14:32 UTC (permalink / raw)
  To: Jeff Law, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala



-----Original Message-----
From: Jeff Law [mailto:law@redhat.com] 
Sent: Wednesday, December 23, 2015 12:06 PM
To: Ajit Kumar Agarwal; Richard Biener
Cc: GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation

On 12/11/2015 02:11 AM, Ajit Kumar Agarwal wrote:
>
> Mibench/EEMBC benchmarks (Target Microblaze)
>
> Automotive_qsort1(4.03%), Office_ispell(4.29%), Office_stringsearch1(3.5%). Telecom_adpcm_d( 1.37%), ospfv2_lite(1.35%).
>>I'm having a real tough time reproducing any of these results.  In fact, I'm having a tough time seeing cases where path splitting even applies to the Mibench/EEMBC benchmarks >>mentioned above.

>>In the very few cases where split-paths might apply, the net resulting assembly code I get is the same with and without split-paths.

>>How consistent are these results?

I am consistently getting the gains for office_ispell, office_stringsearch1 and telecom_adpcm_d. I ran them again today and we see gains in the same benchmark tests
with the split path changes.

>>What functions are being affected that in turn impact performance?

For office_ispell: The functions are "Function linit (linit, funcdef_no=0, decl_uid=2535, cgraph_uid=0, symbol_order=2)" for lookup.c file, and
                                   "Function checkfile (checkfile, funcdef_no=1, decl_uid=2478, cgraph_uid=1, symbol_order=4)"
                                   " Function correct (correct, funcdef_no=2, decl_uid=2503, cgraph_uid=2, symbol_order=5)"
                                   " Function askmode (askmode, funcdef_no=24, decl_uid=2464, cgraph_uid=24, symbol_order=27)"
                                   for correct.c file.
                                  
For office_stringsearch1: The function is Function "bmhi_search (bmhi_search, funcdef_no=1, decl_uid=2178, cgraph_uid=1, symbol_order=5)"
for bmhisrch.c file.

>>What options are you using to compile the benchmarks?  I'm trying with
>>-O2 -fsplit-paths and -O3 in my attempts to trigger the transformation so that I can look more closely at possible heuristics.

I am using the following flags.

-O3 -mlittle-endian -mxl-barrel-shift -mno-xl-soft-div -mhard-float -mxl-float-convert -mxl-float-sqrt   -mno-xl-soft-mul -mxl-multiply-high -mxl-pattern-compare.

To disable split paths -fno-split-paths is used on top of the above flags.

>>Is this with the standard microblaze-elf target?  Or with some other target?

I am using the --target=microblaze-xilinx-elf to build the microblaze target.

Thanks & Regards
Ajit

jeff



^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2016-01-04 14:32                                               ` Ajit Kumar Agarwal
@ 2016-01-13  8:10                                                 ` Jeff Law
       [not found]                                                   ` <56976289.3080203@redhat.com! >
                                                                     ` (2 more replies)
  2016-01-16  6:32                                                 ` Jeff Law
  2016-02-04  8:57                                                 ` Jeff Law
  2 siblings, 3 replies; 72+ messages in thread
From: Jeff Law @ 2016-01-13  8:10 UTC (permalink / raw)
  To: Ajit Kumar Agarwal, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

On 01/04/2016 07:32 AM, Ajit Kumar Agarwal wrote:
>
> I am consistently getting the gains for office_ispell and office_stringsearch1, telcom_adpcm_d. I ran it again today and we see gains in the same bench mark tests
> with the split path changes.
>
>>> What functions are being affected that in turn impact performance?
>
> For office_ispell: The function are Function "linit (linit, funcdef_no=0, decl_uid=2535, cgraph_uid=0, symbol_order=2) for lookup.c file".
>                                     "Function checkfile (checkfile, funcdef_no=1, decl_uid=2478, cgraph_uid=1, symbol_order=4)"
>                                     " Function correct (correct, funcdef_no=2, decl_uid=2503, cgraph_uid=2, symbol_order=5)"
>                                     " Function askmode (askmode, funcdef_no=24, decl_uid=2464, cgraph_uid=24, symbol_order=27)"
>                                     for correct.c file.
>
> For office_stringsearch1: The function is Function "bmhi_search (bmhi_search, funcdef_no=1, decl_uid=2178, cgraph_uid=1, symbol_order=5)"
> for bmhisrch.c file.
So I can see split-paths affecting adpcm & lookup.  I don't see it 
affecting correct.c or bmhisrch.c.

That's progress though.  It's likely that one or more of the flags is
critical, so thanks for passing those along.

I'm going to focus on adpcm for the moment, in particular adpcm_coder. 
It appears the key blocks are:


;;   basic block 14, loop depth 1, count 0, freq 9100, maybe hot
;;    prev block 13, next block 15, flags: (NEW, REACHABLE)
;;    pred:       12 [100.0%]  (FALLTHRU,EXECUTABLE)
;;                13 [100.0%]  (FALLTHRU,EXECUTABLE)
   # valpred_12 = PHI <valpred_54(12), valpred_55(13)>
   _112 = MAX_EXPR <valpred_12, -32768>;
   valpred_18 = MIN_EXPR <_112, 32767>;
   delta_56 = delta_7 | iftmp.1_114;
   _57 = indexTable[delta_56];
   index_58 = _57 + index_107;
   _113 = MIN_EXPR <index_58, 88>;
   index_111 = MAX_EXPR <_113, 0>;
   step_59 = stepsizeTable[index_111];
   if (bufferstep_93 != 0)
     goto <bb 15>;
   else
     goto <bb 16>;
;;    succ:       15 [50.0%]  (TRUE_VALUE,EXECUTABLE)
;;                16 [50.0%]  (FALSE_VALUE,EXECUTABLE)

;;   basic block 15, loop depth 1, count 0, freq 4550, maybe hot
;;    prev block 14, next block 16, flags: (NEW, REACHABLE)
;;    pred:       14 [50.0%]  (TRUE_VALUE,EXECUTABLE)
   _60 = delta_56 << 4;
   goto <bb 17>;
;;    succ:       17 [100.0%]  (FALLTHRU,EXECUTABLE)

;;   basic block 16, loop depth 1, count 0, freq 4550, maybe hot
;;    prev block 15, next block 17, flags: (NEW, REACHABLE)
;;    pred:       14 [50.0%]  (FALSE_VALUE,EXECUTABLE)
   outp_62 = outp_83 + 1;
   _63 = (signed char) delta_56;
   _65 = (signed char) outputbuffer_90;
   _66 = _63 | _65;
   *outp_83 = _66;
;;    succ:       17 [100.0%]  (FALLTHRU,EXECUTABLE)

;;   basic block 17, loop depth 1, count 0, freq 9100, maybe hot
;;    prev block 16, next block 18, flags: (NEW, REACHABLE)
;;    pred:       15 [100.0%]  (FALLTHRU,EXECUTABLE)
;;                16 [100.0%]  (FALLTHRU,EXECUTABLE)
   # outp_3 = PHI <outp_83(15), outp_62(16)>
   # outputbuffer_21 = PHI <_60(15), outputbuffer_90(16)>
   _109 = bufferstep_93 ^ 1;
   _98 = _109 & 1;
   ivtmp.11_68 = ivtmp.11_105 + 2;
   if (ivtmp.11_68 != _116)
     goto <bb 4>;
   else
     goto <bb 18>;


Block #17 is the join point that we're going to effectively copy into 
blocks #15 and #16.  Doing so in turn exposes bufferstep_93 as the 
constant 0 in block #16, which in turn allows elimination of a couple 
statements in the extended version of block #16 and we propagate the 
constant 1 for bufferstep_93 to the top of the loop when reached via 
block #16.  So we save a few instructions.  However, I think we're 
actually doing a fairly poor job here.

bufferstep is a great example of a flip-flop variable and its value is 
statically computable based on the path from the prior loop iteration 
which, if exploited, would allow the FSM threader to eliminate the
conditional at the end of bb14.  I'm going to have to play with that.

Anyway, it's late and I want to rip this test apart a bit more and see 
how it interacts with the heuristic that I've cobbled together as well 
as see what it would take to have DOM or VRP get data on bufferstep_93 
on the true path out of BB14 after a path-split.

Jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2016-01-13  8:10                                                 ` Jeff Law
       [not found]                                                   ` <56976289.3080203@redhat.com! >
@ 2016-01-14  8:55                                                   ` Jeff Law
  2016-01-15 23:02                                                     ` Jeff Law
  2016-01-15 22:38                                                   ` Jeff Law
  2 siblings, 1 reply; 72+ messages in thread
From: Jeff Law @ 2016-01-14  8:55 UTC (permalink / raw)
  To: Ajit Kumar Agarwal, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

On 01/13/2016 01:10 AM, Jeff Law wrote:
>
> I'm going to focus on adpcm for the moment, in particular adpcm_coder.
> It appears the key blocks are:
Looking at adpcm_decoder we have the same idiom as in adpcm_coder:

   if (bufferstep_79 != 0)
     goto <bb 6>;
   else
     goto <bb 7>;
;;    succ:       6 [50.0%]  (TRUE_VALUE,EXECUTABLE)
;;                7 [50.0%]  (FALSE_VALUE,EXECUTABLE)

;;   basic block 6, loop depth 1, count 0, freq 4550, maybe hot
;;    prev block 5, next block 7, flags: (NEW, REACHABLE)
;;    pred:       5 [50.0%]  (TRUE_VALUE,EXECUTABLE)
   delta_31 = inputbuffer_65 & 15;
   goto <bb 8>;
;;    succ:       8 [100.0%]  (FALLTHRU,EXECUTABLE)

;;   basic block 7, loop depth 1, count 0, freq 4550, maybe hot
;;    prev block 6, next block 8, flags: (NEW, REACHABLE)
;;    pred:       5 [50.0%]  (FALSE_VALUE,EXECUTABLE)
   inp_32 = inp_70 + 1;
   _33 = *inp_70;
   inputbuffer_34 = (int) _33;
   _35 = inputbuffer_34 >> 4;
   delta_36 = _35 & 15;
;;    succ:       8 [100.0%]  (FALLTHRU,EXECUTABLE)

;;   basic block 8, loop depth 1, count 0, freq 9100, maybe hot
;;    prev block 7, next block 9, flags: (NEW, REACHABLE)
;;    pred:       6 [100.0%]  (FALLTHRU,EXECUTABLE)
;;                7 [100.0%]  (FALLTHRU,EXECUTABLE)
   # inp_2 = PHI <inp_70(6), inp_32(7)>
   # delta_5 = PHI <delta_31(6), delta_36(7)>
   # inputbuffer_16 = PHI <inputbuffer_65(6), inputbuffer_34(7)>
   _83 = bufferstep_79 ^ 1;
   _1 = _83 & 1;
   _39 = indexTable[delta_5];
   index_40 = _39 + index_81;
   _84 = MIN_EXPR <index_40, 88>;
   index_62 = MAX_EXPR <_84, 0>;
   sign_41 = delta_5 & 8;
   vpdiff_42 = step_71 >> 3;
   _43 = delta_5 & 4;
   if (_43 != 0)
     goto <bb 9>;
   else
     goto <bb 10>;



The difference is there's all kinds of code between BB8 and the latch. 
So it doesn't trigger path splitting, which in turn doesn't expose the 
CSE/DCE opportunity for adpcm_decoder.  We're likely leaving some 
performance on the table for this code.

This brings up a deeper question.   Are we approaching path splitting in 
a fundamentally backward way?

Right now path splitting is guided by finding the right shape in the CFG 
immediately preceding the latch block of a loop.  Plus some heuristics 
I'm working on to predict when the duplication is more likely to lead to 
CSE/DCE opportunities.  But really it's driven by finding the right CFG 
shape before the latch block.

Should path splitting really be driven by partitions of PHI arguments
and other incoming values (such as the implicit sets from conditionals)?
  I've been pondering this for a long time as it's a natural extension
of what we do in the erroneous path isolation pass.

ie at any join point in the CFG, look for partitions of the arguments in 
any PHI nodes and split paths based on that.

So for example if we had a PHI like

x5 = phi (x4, 0, 0, 1, 1)

Look at how x5 is used.  If propagation of any of those PHI argument 
values would result in simplifications at the use points of x5, then we 
should consider splitting off all the paths with the beneficial PHI 
argument.

So in the case above, assume that the value 0 results in 
simplifications.  We'd create two paths.  One for the paths where x5 
would get the value zero, another for everything else.

This is *a lot* like the path isolation done for erroneous paths in terms of
the CFG & SSA manipulations.  Essentially in both cases we want to 
isolate a path so that we can manipulate it based on known inputs 
without affecting paths with uninteresting inputs.  All that really
changes is how we drive the selection of paths to isolate and whether or
not we insert the __builtin_trap.


Anyway, going back to adpcm_decode, we do end up splitting this path:

  # vpdiff_12 = PHI <vpdiff_11(12), vpdiff_50(13)>
   if (sign_41 != 0)
     goto <bb 15>;
   else
     goto <bb 16>;
;;    succ:       15
;;                16

;;   basic block 15, loop depth 1
;;    pred:       14
   valpred_51 = valpred_76 - vpdiff_12;
   goto <bb 17>;
;;    succ:       17

;;   basic block 16, loop depth 1
;;    pred:       14
   valpred_52 = vpdiff_12 + valpred_76;
;;    succ:       17

;;   basic block 17, loop depth 1
;;    pred:       15
;;                16
   # valpred_7 = PHI <valpred_51(15), valpred_52(16)>
   _85 = MAX_EXPR <valpred_7, -32768>;
   valpred_13 = MIN_EXPR <_85, 32767>;
   step_53 = stepsizeTable[index_62];
   outp_54 = outp_69 + 2;
   _55 = (short int) valpred_13;
   MEM[base: outp_54, offset: -2B] = _55;
   if (outp_54 != _74)
     goto <bb 20>;
   else
     goto <bb 18>;

This doesn't result in anything particularly interesting/good AFAICT. 
We propagate valpred_51/52 into the use in the MAX_EXPR in the duplicate 
paths, but that doesn't allow any further simplification.

Ajit, can you confirm in which of adpcm_code or adpcm_decode path
splitting is showing a gain?  I suspect it's the former but would like
to make sure so that I can adjust the heuristics properly.


jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2016-01-13  8:10                                                 ` Jeff Law
       [not found]                                                   ` <56976289.3080203@redhat.com! >
  2016-01-14  8:55                                                   ` Jeff Law
@ 2016-01-15 22:38                                                   ` Jeff Law
  2 siblings, 0 replies; 72+ messages in thread
From: Jeff Law @ 2016-01-15 22:38 UTC (permalink / raw)
  To: Ajit Kumar Agarwal, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

On 01/13/2016 01:10 AM, Jeff Law wrote:
>
> I'm going to focus on adpcm for the moment, in particular adpcm_coder.
> It appears the key blocks are:
>
>
> ;;   basic block 14, loop depth 1, count 0, freq 9100, maybe hot
> ;;    prev block 13, next block 15, flags: (NEW, REACHABLE)
> ;;    pred:       12 [100.0%]  (FALLTHRU,EXECUTABLE)
> ;;                13 [100.0%]  (FALLTHRU,EXECUTABLE)
>    # valpred_12 = PHI <valpred_54(12), valpred_55(13)>
>    _112 = MAX_EXPR <valpred_12, -32768>;
>    valpred_18 = MIN_EXPR <_112, 32767>;
>    delta_56 = delta_7 | iftmp.1_114;
>    _57 = indexTable[delta_56];
>    index_58 = _57 + index_107;
>    _113 = MIN_EXPR <index_58, 88>;
>    index_111 = MAX_EXPR <_113, 0>;
>    step_59 = stepsizeTable[index_111];
>    if (bufferstep_93 != 0)
>      goto <bb 15>;
>    else
>      goto <bb 16>;
> ;;    succ:       15 [50.0%]  (TRUE_VALUE,EXECUTABLE)
> ;;                16 [50.0%]  (FALSE_VALUE,EXECUTABLE)
>
> ;;   basic block 15, loop depth 1, count 0, freq 4550, maybe hot
> ;;    prev block 14, next block 16, flags: (NEW, REACHABLE)
> ;;    pred:       14 [50.0%]  (TRUE_VALUE,EXECUTABLE)
>    _60 = delta_56 << 4;
>    goto <bb 17>;
> ;;    succ:       17 [100.0%]  (FALLTHRU,EXECUTABLE)
>
> ;;   basic block 16, loop depth 1, count 0, freq 4550, maybe hot
> ;;    prev block 15, next block 17, flags: (NEW, REACHABLE)
> ;;    pred:       14 [50.0%]  (FALSE_VALUE,EXECUTABLE)
>    outp_62 = outp_83 + 1;
>    _63 = (signed char) delta_56;
>    _65 = (signed char) outputbuffer_90;
>    _66 = _63 | _65;
>    *outp_83 = _66;
> ;;    succ:       17 [100.0%]  (FALLTHRU,EXECUTABLE)
>
> ;;   basic block 17, loop depth 1, count 0, freq 9100, maybe hot
> ;;    prev block 16, next block 18, flags: (NEW, REACHABLE)
> ;;    pred:       15 [100.0%]  (FALLTHRU,EXECUTABLE)
> ;;                16 [100.0%]  (FALLTHRU,EXECUTABLE)
>    # outp_3 = PHI <outp_83(15), outp_62(16)>
>    # outputbuffer_21 = PHI <_60(15), outputbuffer_90(16)>
>    _109 = bufferstep_93 ^ 1;
>    _98 = _109 & 1;
>    ivtmp.11_68 = ivtmp.11_105 + 2;
>    if (ivtmp.11_68 != _116)
>      goto <bb 4>;
>    else
>      goto <bb 18>;
>
>
> Block #17 is the join point that we're going to effectively copy into
> blocks #15 and #16.  Doing so in turn exposes bufferstep_93 as the
> constant 0 in block #16, which in turn allows elimination of a couple
> statements in the extended version of block #16 and we propagate the
> constant 1 for bufferstep_93 to the top of the loop when reached via
> block #16.  So we save a few instructions.  However, I think we're
> actually doing a fairly poor job here.
>
> bufferstep is a great example of a flip-flop variable and its value is
> statically computable based on the path from the prior loop iteration
> which, if exploited would allow the FSM threader to eliminate the
> conditional at the end of bb14.  I'm going to have to play with that.
So I've extended DOM & uncprop to pick up the missed propagation 
opportunity, which in turn allows DOM to simplify this function even 
further and hopefully set ourselves up for either unrolling the loop or 
using the FSM threader to eliminate the test on bufferstep completely. 
But those are gcc-7 items.


>
> Anyway, it's late and I want to rip this test apart a bit more and see
> how it interacts with the heuristic that I've cobbled together as well
> as see what it would take to have DOM or VRP get data on bufferstep_93
> on the true path out of BB14 after a path-split.
As I expected, this showed a need for a minor tweak to the heuristic I'm 
poking at for path splitting.  Nothing particularly hard, it needs 
further work (it's not compile-time efficient right now), but it's good 
enough to put away adpcm_code and continue looking more closely at 
adpcm_decode.

Jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2016-01-14  8:55                                                   ` Jeff Law
@ 2016-01-15 23:02                                                     ` Jeff Law
  2016-01-18 18:27                                                       ` Ajit Kumar Agarwal
  0 siblings, 1 reply; 72+ messages in thread
From: Jeff Law @ 2016-01-15 23:02 UTC (permalink / raw)
  To: Ajit Kumar Agarwal, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

On 01/14/2016 01:55 AM, Jeff Law wrote:
[ Replying to myself again, mostly to make sure we've got these thoughts 
in the archives. ]
>
> Anyway, going back to adpcm_decode, we do end up splitting this path:
>
>   # vpdiff_12 = PHI <vpdiff_11(12), vpdiff_50(13)>
>    if (sign_41 != 0)
>      goto <bb 15>;
>    else
>      goto <bb 16>;
> ;;    succ:       15
> ;;                16
>
> ;;   basic block 15, loop depth 1
> ;;    pred:       14
>    valpred_51 = valpred_76 - vpdiff_12;
>    goto <bb 17>;
> ;;    succ:       17
>
> ;;   basic block 16, loop depth 1
> ;;    pred:       14
>    valpred_52 = vpdiff_12 + valpred_76;
> ;;    succ:       17
>
> ;;   basic block 17, loop depth 1
> ;;    pred:       15
> ;;                16
>    # valpred_7 = PHI <valpred_51(15), valpred_52(16)>
>    _85 = MAX_EXPR <valpred_7, -32768>;
>    valpred_13 = MIN_EXPR <_85, 32767>;
>    step_53 = stepsizeTable[index_62];
>    outp_54 = outp_69 + 2;
>    _55 = (short int) valpred_13;
>    MEM[base: outp_54, offset: -2B] = _55;
>    if (outp_54 != _74)
>      goto <bb 20>;
>    else
>      goto <bb 18>;
>
> This doesn't result in anything particularly interesting/good AFAICT. We
> propagate valpred_51/52 into the use in the MAX_EXPR in the duplicate
> paths, but that doesn't allow any further simplification.
So with the heuristic I'm poking at, this gets rejected.  Essentially it 
doesn't think it's likely to expose CSE/DCE opportunities (and it's 
correct).  The number of statements in predecessor blocks that feed 
operands in the to-be-copied-block is too small relative to the size of 
the to-be-copied-block.


>
> Ajit, can you confirm which of adpcm_code or adpcm_decode where path
> splitting is showing a gain?  I suspect it's the former but would like
> to make sure so that I can adjust the heuristics properly.
I'd still like to have this answered when you can Ajit, just to be 100%
sure that it's the path splitting in adpcm_code that's responsible for the
improvements you're seeing in adpcm.

jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2016-01-04 14:32                                               ` Ajit Kumar Agarwal
  2016-01-13  8:10                                                 ` Jeff Law
@ 2016-01-16  6:32                                                 ` Jeff Law
  2016-01-18  9:13                                                   ` Ajit Kumar Agarwal
  2016-02-04  8:57                                                 ` Jeff Law
  2 siblings, 1 reply; 72+ messages in thread
From: Jeff Law @ 2016-01-16  6:32 UTC (permalink / raw)
  To: Ajit Kumar Agarwal, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

On 01/04/2016 07:32 AM, Ajit Kumar Agarwal wrote:
>
>
> -----Original Message----- From: Jeff Law [mailto:law@redhat.com]
> Sent: Wednesday, December 23, 2015 12:06 PM To: Ajit Kumar Agarwal;
> Richard Biener Cc: GCC Patches; Vinod Kathail; Shail Aditya Gupta;
> Vidhumouli Hunsigida; Nagaraju Mekala Subject: Re:
> [Patch,tree-optimization]: Add new path Splitting pass on tree ssa
> representation
>
> On 12/11/2015 02:11 AM, Ajit Kumar Agarwal wrote:
>>
>> Mibench/EEMBC benchmarks (Target Microblaze)
>>
>> Automotive_qsort1(4.03%), Office_ispell(4.29%),
>> Office_stringsearch1(3.5%). Telecom_adpcm_d( 1.37%),
>> ospfv2_lite(1.35%).
>>> I'm having a real tough time reproducing any of these results.
>>> In fact, I'm having a tough time seeing cases where path
>>> splitting even applies to the Mibench/EEMBC benchmarks
>>> >>mentioned above.
>
>>> In the very few cases where split-paths might apply, the net
>>> resulting assembly code I get is the same with and without
>>> split-paths.
>
>>> How consistent are these results?
>
> I am consistently getting the gains for office_ispell and
> office_stringsearch1, telcom_adpcm_d. I ran it again today and we see
> gains in the same bench mark tests with the split path changes.
>
>>> What functions are being affected that in turn impact
>>> performance?
>
> For office_ispell: The function are Function "linit (linit,
> funcdef_no=0, decl_uid=2535, cgraph_uid=0, symbol_order=2) for
> lookup.c file". "Function checkfile (checkfile, funcdef_no=1,
> decl_uid=2478, cgraph_uid=1, symbol_order=4)" " Function correct
> (correct, funcdef_no=2, decl_uid=2503, cgraph_uid=2,
> symbol_order=5)" " Function askmode (askmode, funcdef_no=24,
> decl_uid=2464, cgraph_uid=24, symbol_order=27)" for correct.c file.
>
> For office_stringsearch1: The function is Function "bmhi_search
> (bmhi_search, funcdef_no=1, decl_uid=2178, cgraph_uid=1,
> symbol_order=5)" for bmhisrch.c file.
Can you send me the pre-processed lookup.c, correct.c and bmhi_search.c?

I generated mine using x86 and that may be affecting my ability to 
reproduce your results on the microblaze target.  Looking specifically 
at bmhi_search.c and correct.c, I see they are going to be sensitive to
the target headers -- if (for example) they use FORTIFY_SOURCE or macros
for toupper.

In the bmhi_search I'm looking at, I don't see any opportunities for the 
path splitter to do anything.  The CFG just doesn't have the right 
shape.  Again, that may be an artifact of how toupper is implemented in 
the system header files -- hence my request for the cpp output on each 
of the important files.

Jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2016-01-16  6:32                                                 ` Jeff Law
@ 2016-01-18  9:13                                                   ` Ajit Kumar Agarwal
  2016-01-27  7:13                                                     ` Jeff Law
  0 siblings, 1 reply; 72+ messages in thread
From: Ajit Kumar Agarwal @ 2016-01-18  9:13 UTC (permalink / raw)
  To: Jeff Law, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala



-----Original Message-----
From: Jeff Law [mailto:law@redhat.com] 
Sent: Saturday, January 16, 2016 12:03 PM
To: Ajit Kumar Agarwal; Richard Biener
Cc: GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation

On 01/04/2016 07:32 AM, Ajit Kumar Agarwal wrote:
>
>
> -----Original Message----- From: Jeff Law [mailto:law@redhat.com]
> Sent: Wednesday, December 23, 2015 12:06 PM To: Ajit Kumar Agarwal; 
> Richard Biener Cc: GCC Patches; Vinod Kathail; Shail Aditya Gupta; 
> Vidhumouli Hunsigida; Nagaraju Mekala Subject: Re:
> [Patch,tree-optimization]: Add new path Splitting pass on tree ssa 
> representation
>
> On 12/11/2015 02:11 AM, Ajit Kumar Agarwal wrote:
>>
>> Mibench/EEMBC benchmarks (Target Microblaze)
>>
>> Automotive_qsort1(4.03%), Office_ispell(4.29%), 
>> Office_stringsearch1(3.5%). Telecom_adpcm_d( 1.37%), 
>> ospfv2_lite(1.35%).
>>> I'm having a real tough time reproducing any of these results.
>>> In fact, I'm having a tough time seeing cases where path splitting 
>>> even applies to the Mibench/EEMBC benchmarks
>>> >>mentioned above.
>
>>> In the very few cases where split-paths might apply, the net 
>>> resulting assembly code I get is the same with and without 
>>> split-paths.
>
>>> How consistent are these results?
>
> I am consistently getting the gains for office_ispell and 
> office_stringsearch1, telcom_adpcm_d. I ran it again today and we see 
> gains in the same bench mark tests with the split path changes.
>
>>> What functions are being affected that in turn impact performance?
>
> For office_ispell: The function are Function "linit (linit, 
> funcdef_no=0, decl_uid=2535, cgraph_uid=0, symbol_order=2) for 
> lookup.c file". "Function checkfile (checkfile, funcdef_no=1, 
> decl_uid=2478, cgraph_uid=1, symbol_order=4)" " Function correct 
> (correct, funcdef_no=2, decl_uid=2503, cgraph_uid=2, symbol_order=5)" 
> " Function askmode (askmode, funcdef_no=24, decl_uid=2464, 
> cgraph_uid=24, symbol_order=27)" for correct.c file.
>
> For office_stringsearch1: The function is Function "bmhi_search 
> (bmhi_search, funcdef_no=1, decl_uid=2178, cgraph_uid=1, 
> symbol_order=5)" for bmhisrch.c file.
>>Can you send me the pre-processed lookup.c, correct.c and bmhi_search.c?

>>I generated mine using x86 and that may be affecting my ability to reproduce your results on the microblaze target.  Looking specifically at bmhi_search.c and correct.c, I see they are >>going to be sensitive to the target headers.  If (for exmaple) they use FORTIFY_SOURCE or macros for toupper.

>>In the bmhi_search I'm looking at, I don't see any opportunities for the path splitter to do anything.  The CFG just doesn't have the right shape.  Again, that may be an artifact of how >>toupper is implemented in the system header files -- hence my request for the cpp output on each of the important files.

Would you like me to send the above files and functions pre-processed with the -E option?

Thanks & Regards
Ajit
Jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2016-01-15 23:02                                                     ` Jeff Law
@ 2016-01-18 18:27                                                       ` Ajit Kumar Agarwal
  2016-01-27  7:17                                                         ` Jeff Law
  0 siblings, 1 reply; 72+ messages in thread
From: Ajit Kumar Agarwal @ 2016-01-18 18:27 UTC (permalink / raw)
  To: Jeff Law, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala



-----Original Message-----
From: Jeff Law [mailto:law@redhat.com] 
Sent: Saturday, January 16, 2016 4:33 AM
To: Ajit Kumar Agarwal; Richard Biener
Cc: GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation

On 01/14/2016 01:55 AM, Jeff Law wrote:
[ Replying to myself again, mostly to make sure we've got these thoughts in the archives. ]
>
> Anyway, going back to adpcm_decode, we do end up splitting this path:
>
>   # vpdiff_12 = PHI <vpdiff_11(12), vpdiff_50(13)>
>    if (sign_41 != 0)
>      goto <bb 15>;
>    else
>      goto <bb 16>;
> ;;    succ:       15
> ;;                16
>
> ;;   basic block 15, loop depth 1
> ;;    pred:       14
>    valpred_51 = valpred_76 - vpdiff_12;
>    goto <bb 17>;
> ;;    succ:       17
>
> ;;   basic block 16, loop depth 1
> ;;    pred:       14
>    valpred_52 = vpdiff_12 + valpred_76;
> ;;    succ:       17
>
> ;;   basic block 17, loop depth 1
> ;;    pred:       15
> ;;                16
>    # valpred_7 = PHI <valpred_51(15), valpred_52(16)>
>    _85 = MAX_EXPR <valpred_7, -32768>;
>    valpred_13 = MIN_EXPR <_85, 32767>;
>    step_53 = stepsizeTable[index_62];
>    outp_54 = outp_69 + 2;
>    _55 = (short int) valpred_13;
>    MEM[base: outp_54, offset: -2B] = _55;
>    if (outp_54 != _74)
>      goto <bb 20>;
>    else
>      goto <bb 18>;
>
> This doesn't result in anything particularly interesting/good AFAICT. 
> We propagate valpred_51/52 into the use in the MAX_EXPR in the 
> duplicate paths, but that doesn't allow any further simplification.
>>So with the heuristic I'm poking at, this gets rejected.  Essentially it doesn't think it's likely to expose CSE/DCE opportunities (and it's correct).  The number of statements in predecessor >>blocks that feed operands in the to-be-copied-block is too small relative to the size of the to-be-copied-block.


>
> Ajit, can you confirm which of adpcm_code or adpcm_decode where path 
> splitting is showing a gain?  I suspect it's the former but would like 
> to make sure so that I can adjust the heuristics properly.
>>I'd still like to have this answered when you can Ajit, just to be 100%
 >> that it's the path splitting in adpcm_code that's responsible for the improvements you're seeing in adpcm.

adpcm_coder gets optimized with path splitting, whereas adpcm_decoder is not optimized further by path splitting. In adpcm_decoder
the join node is duplicated into its predecessors, but the duplication of the join node does not enable any further optimization.

In adpcm_coder the following optimization is triggered by path splitting.

1. /* Output last step, if needed */
    if ( !bufferstep )
      *outp++ = outputbuffer;

     The IF-THEN inside the loop is triggered when bufferstep is 1.  Then the flip happens and bufferstep becomes 0. On the exit branch, if bufferstep
is 1 the flip converts it to 0 and the above IF-THEN generates the store that assigns outputbuffer to outp.

The above sequence is optimized with path splitting: if bufferstep is 1, the exit branch of the loop branches directly to the above store. This does not require the flip of
bufferstep using xor with immediate 1. With this optimization there is only one level of exit branch on the path where bufferstep is 1. This leads to the
exit branch to the store being scheduled with a meaningful instruction instead of the xor with immediate 1.

Without path splitting, if bufferstep is 1 the exit branch of the loop branches to a piece of code that flips it to zero, and then the above IF-THEN outside the
loop does the store to assign outputbuffer to outp. Thus without path splitting there are two levels of branching for the exit branch on the path where
bufferstep is 1 inside the loop, which generates non-optimized code. Also, without path splitting the two-level exit branch of the loop is scheduled with the xor with immediate 1.

 Thanks & Regards
Ajit

jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2016-01-18  9:13                                                   ` Ajit Kumar Agarwal
@ 2016-01-27  7:13                                                     ` Jeff Law
  2016-01-27  9:35                                                       ` Ajit Kumar Agarwal
  0 siblings, 1 reply; 72+ messages in thread
From: Jeff Law @ 2016-01-27  7:13 UTC (permalink / raw)
  To: Ajit Kumar Agarwal, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

On 01/18/2016 02:13 AM, Ajit Kumar Agarwal wrote:
>
>
> -----Original Message-----
> From: Jeff Law [mailto:law@redhat.com]
> Sent: Saturday, January 16, 2016 12:03 PM
> To: Ajit Kumar Agarwal; Richard Biener
> Cc: GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
> Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
>
> On 01/04/2016 07:32 AM, Ajit Kumar Agarwal wrote:
>>
>>
>> -----Original Message----- From: Jeff Law [mailto:law@redhat.com]
>> Sent: Wednesday, December 23, 2015 12:06 PM To: Ajit Kumar Agarwal;
>> Richard Biener Cc: GCC Patches; Vinod Kathail; Shail Aditya Gupta;
>> Vidhumouli Hunsigida; Nagaraju Mekala Subject: Re:
>> [Patch,tree-optimization]: Add new path Splitting pass on tree ssa
>> representation
>>
>> On 12/11/2015 02:11 AM, Ajit Kumar Agarwal wrote:
>>>
>>> Mibench/EEMBC benchmarks (Target Microblaze)
>>>
>>> Automotive_qsort1(4.03%), Office_ispell(4.29%),
>>> Office_stringsearch1(3.5%). Telecom_adpcm_d( 1.37%),
>>> ospfv2_lite(1.35%).
>>>> I'm having a real tough time reproducing any of these results.
>>>> In fact, I'm having a tough time seeing cases where path splitting
>>>> even applies to the Mibench/EEMBC benchmarks
>>>>>> mentioned above.
>>
>>>> In the very few cases where split-paths might apply, the net
>>>> resulting assembly code I get is the same with and without
>>>> split-paths.
>>
>>>> How consistent are these results?
>>
>> I am consistently getting the gains for office_ispell,
>> office_stringsearch1 and telecom_adpcm_d. I ran it again today and we see
>> gains in the same benchmark tests with the split path changes.
>>
>>>> What functions are being affected that in turn impact performance?
>>
>> For office_ispell: The function are Function "linit (linit,
>> funcdef_no=0, decl_uid=2535, cgraph_uid=0, symbol_order=2) for
>> lookup.c file". "Function checkfile (checkfile, funcdef_no=1,
>> decl_uid=2478, cgraph_uid=1, symbol_order=4)" " Function correct
>> (correct, funcdef_no=2, decl_uid=2503, cgraph_uid=2, symbol_order=5)"
>> " Function askmode (askmode, funcdef_no=24, decl_uid=2464,
>> cgraph_uid=24, symbol_order=27)" for correct.c file.
>>
>> For office_stringsearch1: The function is Function "bmhi_search
>> (bmhi_search, funcdef_no=1, decl_uid=2178, cgraph_uid=1,
>> symbol_order=5)" for bmhisrch.c file.
>>> Can you send me the pre-processed lookup.c, correct.c and bmhi_search.c?
>
>>> I generated mine using x86 and that may be affecting my ability to reproduce your results on the microblaze target.  Looking specifically at bmhi_search.c and correct.c, I see they are going to be sensitive to the target headers -- for example, if they use FORTIFY_SOURCE or macros for toupper.
>
>>> In the bmhi_search I'm looking at, I don't see any opportunities for the path splitter to do anything.  The CFG just doesn't have the right shape.  Again, that may be an artifact of how toupper is implemented in the system header files -- hence my request for the cpp output on each of the important files.
>
> Would you like me to send the above files and functions preprocessed with the -E option?
That would be perfect.  I'm on the road the latter half of the week into 
early next week -- the long flights might be a good time for me to stare 
at the dumps and tweak the heuristic a bit.

jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2016-01-18 18:27                                                       ` Ajit Kumar Agarwal
@ 2016-01-27  7:17                                                         ` Jeff Law
  2016-01-27 17:21                                                           ` Ajit Kumar Agarwal
  0 siblings, 1 reply; 72+ messages in thread
From: Jeff Law @ 2016-01-27  7:17 UTC (permalink / raw)
  To: Ajit Kumar Agarwal, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

On 01/18/2016 11:27 AM, Ajit Kumar Agarwal wrote:

>>
>> Ajit, can you confirm which of adpcm_code or adpcm_decode where
>> path splitting is showing a gain?  I suspect it's the former but
>> would like to make sure so that I can adjust the heuristics
>> properly.
>>> I'd still like to have this answered when you can Ajit, just to
>>> be 100% that it's the path splitting in adpcm_code that's
>>> responsible for the improvements you're seeing in adpcm.
>
> The adpcm_coder get optimized with path splitting whereas the
> adpcm_decoder is not optimized further with path splitting. In
> adpcm_decoder the join node is duplicated into its predecessors and
> with the duplication of join node the code is not optimized further.
Right.  Just wanted to make sure my analysis corresponded with what you 
were seeing in your benchmarking -- and it does.

I suspect that if we looked at this problem from the angle of isolating 
paths based on how constant PHI arguments feed into and allow 
simplifications in later blocks that we might get better long term 
results -- including improving adpcm_decoder which has the same idiom as 
adpcm_coder -- it's just in the wrong spot in the CFG.

But that's obviously gcc-7 material.

Jeff

^ permalink raw reply	[flat|nested] 72+ messages in thread

* RE: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation
  2016-01-27  7:13                                                     ` Jeff Law
@ 2016-01-27  9:35                                                       ` Ajit Kumar Agarwal
  0 siblings, 0 replies; 72+ messages in thread
From: Ajit Kumar Agarwal @ 2016-01-27  9:35 UTC (permalink / raw)
  To: Jeff Law, Richard Biener
  Cc: GCC Patches, Vinod Kathail, Shail Aditya Gupta,
	Vidhumouli Hunsigida, Nagaraju Mekala

[-- Attachment #1: Type: text/plain, Size: 4071 bytes --]



-----Original Message-----
From: Jeff Law [mailto:law@redhat.com] 
Sent: Wednesday, January 27, 2016 12:44 PM
To: Ajit Kumar Agarwal; Richard Biener
Cc: GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; Nagaraju Mekala
Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation

On 01/18/2016 02:13 AM, Ajit Kumar Agarwal wrote:
>
>
> -----Original Message-----
> From: Jeff Law [mailto:law@redhat.com]
> Sent: Saturday, January 16, 2016 12:03 PM
> To: Ajit Kumar Agarwal; Richard Biener
> Cc: GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli 
> Hunsigida; Nagaraju Mekala
> Subject: Re: [Patch,tree-optimization]: Add new path Splitting pass on 
> tree ssa representation
>
> On 01/04/2016 07:32 AM, Ajit Kumar Agarwal wrote:
>>
>>
>> -----Original Message----- From: Jeff Law [mailto:law@redhat.com]
>> Sent: Wednesday, December 23, 2015 12:06 PM To: Ajit Kumar Agarwal; 
>> Richard Biener Cc: GCC Patches; Vinod Kathail; Shail Aditya Gupta; 
>> Vidhumouli Hunsigida; Nagaraju Mekala Subject: Re:
>> [Patch,tree-optimization]: Add new path Splitting pass on tree ssa 
>> representation
>>
>> On 12/11/2015 02:11 AM, Ajit Kumar Agarwal wrote:
>>>
>>> Mibench/EEMBC benchmarks (Target Microblaze)
>>>
>>> Automotive_qsort1(4.03%), Office_ispell(4.29%), 
>>> Office_stringsearch1(3.5%). Telecom_adpcm_d( 1.37%), 
>>> ospfv2_lite(1.35%).
>>>> I'm having a real tough time reproducing any of these results.
>>>> In fact, I'm having a tough time seeing cases where path splitting 
>>>> even applies to the Mibench/EEMBC benchmarks
>>>>>> mentioned above.
>>
>>>> In the very few cases where split-paths might apply, the net 
>>>> resulting assembly code I get is the same with and without 
>>>> split-paths.
>>
>>>> How consistent are these results?
>>
>> I am consistently getting the gains for office_ispell,
>> office_stringsearch1 and telecom_adpcm_d. I ran it again today and we see
>> gains in the same benchmark tests with the split path changes.
>>
>>>> What functions are being affected that in turn impact performance?
>>
>> For office_ispell: The function are Function "linit (linit, 
>> funcdef_no=0, decl_uid=2535, cgraph_uid=0, symbol_order=2) for 
>> lookup.c file". "Function checkfile (checkfile, funcdef_no=1, 
>> decl_uid=2478, cgraph_uid=1, symbol_order=4)" " Function correct 
>> (correct, funcdef_no=2, decl_uid=2503, cgraph_uid=2, symbol_order=5)"
>> " Function askmode (askmode, funcdef_no=24, decl_uid=2464, 
>> cgraph_uid=24, symbol_order=27)" for correct.c file.
>>
>> For office_stringsearch1: The function is Function "bmhi_search 
>> (bmhi_search, funcdef_no=1, decl_uid=2178, cgraph_uid=1, 
>> symbol_order=5)" for bmhisrch.c file.
>>> Can you send me the pre-processed lookup.c, correct.c and bmhi_search.c?
>
>>> I generated mine using x86 and that may be affecting my ability to reproduce your results on the microblaze target.  Looking specifically at bmhi_search.c and correct.c, I see they are going to be sensitive to the target headers -- for example, if they use FORTIFY_SOURCE or macros for toupper.
>
>>> In the bmhi_search I'm looking at, I don't see any opportunities for the path splitter to do anything.  The CFG just doesn't have the right shape.  Again, that may be an artifact of how toupper is implemented in the system header files -- hence my request for the cpp output on each of the important files.
>
> Would you like me to send the above files and functions preprocessed with the -E option?
>>That would be perfect.  I'm on the road the latter half of the week into early next week -- the long flights might be a good time for me to stare at the dumps and tweak the heuristic a bit.

Please find attached the files preprocessed with the -E option for office_ispell/src/lookup.c, office_ispell/src/correct.c and office_stringsearch1/src/bmhisrch.c.
The functions of interest to us are: correct, linit and bmhi_search.

Thanks & Regards
Ajit

jeff

[-- Attachment #2: bmhisrch.E --]
[-- Type: application/octet-stream, Size: 30337 bytes --]

# 1 "./office_stringsearch1/src/bmhisrch.c"
# 1 "/proj/esdt_sdk/aagarwa/mb_mibench_eembc_tests_split_paths/tests/suites/mibench_v1.3//"
# 1 "<built-in>"
# 1 "<command-line>"
# 1 "./office_stringsearch1/src/bmhisrch.c"
# 17 "./office_stringsearch1/src/bmhisrch.c"
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include-fixed/limits.h" 1 3 4
# 34 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include-fixed/limits.h" 3 4
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include-fixed/syslimits.h" 1 3 4






# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include-fixed/limits.h" 1 3 4
# 168 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include-fixed/limits.h" 3 4
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/limits.h" 1 3 4



# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/newlib.h" 1 3 4
# 5 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/limits.h" 2 3 4
# 169 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include-fixed/limits.h" 2 3 4
# 8 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include-fixed/syslimits.h" 2 3 4
# 35 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include-fixed/limits.h" 2 3 4
# 18 "./office_stringsearch1/src/bmhisrch.c" 2
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdlib.h" 1 3
# 10 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdlib.h" 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/ieeefp.h" 1 3
# 11 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdlib.h" 2 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/_ansi.h" 1 3
# 16 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/_ansi.h" 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/config.h" 1 3



# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/ieeefp.h" 1 3
# 5 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/config.h" 2 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/features.h" 1 3
# 6 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/config.h" 2 3
# 17 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/_ansi.h" 2 3
# 12 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdlib.h" 2 3




# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include/stddef.h" 1 3 4
# 216 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include/stddef.h" 3 4

# 216 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include/stddef.h" 3 4
typedef unsigned int size_t;
# 328 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include/stddef.h" 3 4
typedef int wchar_t;
# 17 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdlib.h" 2 3

# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/reent.h" 1 3
# 13 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/reent.h" 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/_ansi.h" 1 3
# 14 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/reent.h" 2 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include/stddef.h" 1 3 4
# 149 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include/stddef.h" 3 4
typedef int ptrdiff_t;
# 426 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include/stddef.h" 3 4
/* Aggregate with one long long and one long double member, each carrying
   an explicit __aligned__ attribute equal to the __alignof__ of its own
   type.
   NOTE(review): presumably the members exist only for their alignment
   requirement and are never read by name -- confirm.  */
typedef struct {
  long long __max_align_ll __attribute__((__aligned__(__alignof__(long long))));
  long double __max_align_ld __attribute__((__aligned__(__alignof__(long double))));
} max_align_t;
# 15 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/reent.h" 2 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/_types.h" 1 3
# 12 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/_types.h" 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/_types.h" 1 3






# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/_default_types.h" 1 3
# 27 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/_default_types.h" 3
typedef signed char __int8_t;
typedef unsigned char __uint8_t;
# 37 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/_default_types.h" 3
typedef short int __int16_t;
typedef short unsigned int __uint16_t;
# 55 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/_default_types.h" 3
typedef long int __int32_t;
typedef long unsigned int __uint32_t;
# 77 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/_default_types.h" 3
typedef long long int __int64_t;
typedef long long unsigned int __uint64_t;
# 104 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/_default_types.h" 3
typedef signed char __int_least8_t;
typedef unsigned char __uint_least8_t;
# 126 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/_default_types.h" 3
typedef short int __int_least16_t;
typedef short unsigned int __uint_least16_t;
# 144 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/_default_types.h" 3
typedef long int __int_least32_t;
typedef long unsigned int __uint_least32_t;
# 158 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/_default_types.h" 3
typedef long long int __int_least64_t;
typedef long long unsigned int __uint_least64_t;
# 168 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/_default_types.h" 3
typedef int __intptr_t;
typedef unsigned int __uintptr_t;
# 8 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/_types.h" 2 3
# 13 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/_types.h" 2 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/lock.h" 1 3





typedef int _LOCK_T;
typedef int _LOCK_RECURSIVE_T;
# 14 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/_types.h" 2 3


typedef long _off_t;



typedef short __dev_t;



typedef unsigned short __uid_t;


typedef unsigned short __gid_t;



__extension__ typedef long long _off64_t;







typedef long _fpos_t;
# 55 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/_types.h" 3
typedef signed int _ssize_t;
# 67 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/_types.h" 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include/stddef.h" 1 3 4
# 357 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include/stddef.h" 3 4
typedef unsigned int wint_t;
# 68 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/_types.h" 2 3



/* State record made of an int counter and a value that can be viewed
   either as one wint_t or as four raw bytes.
   NOTE(review): judging by the name this is multibyte-conversion state;
   the exact meaning of __count is not visible here -- confirm against
   the C library sources.  */
typedef struct
{
  int __count;                /* presumably bytes accumulated so far -- TODO confirm */
  union
  {
    wint_t __wch;             /* value viewed as a single wint_t */
    unsigned char __wchb[4];  /* the same storage viewed as 4 bytes */
  } __value;
} _mbstate_t;



typedef _LOCK_RECURSIVE_T _flock_t;




typedef void *_iconv_t;
# 16 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/reent.h" 2 3






typedef unsigned long __ULong;
# 38 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/reent.h" 3
struct _reent;






/* Self-referential record: a link to another struct _Bigint, four int
   fields and an array of __ULong.
   NOTE(review): the per-field meanings below are inferred from the names
   only -- confirm against the C library's multi-precision sources.  */
struct _Bigint
{
  struct _Bigint *_next;         /* link to another _Bigint */
  int _k, _maxwds, _sign, _wds;  /* presumably size class, capacity in words, sign flag, words in use -- TODO confirm */
  __ULong _x[1];                 /* declared with one element; presumably over-allocated and used as a trailing array -- TODO confirm */
};


/* Nine int fields whose names parallel the members of the standard
   struct tm (tm_sec ... tm_isdst), each with a leading "__".
   NOTE(review): presumably a private mirror of struct tm so this header
   need not include <time.h>; value ranges are not visible here --
   confirm.  */
struct __tm
{
  int __tm_sec;
  int __tm_min;
  int __tm_hour;
  int __tm_mday;
  int __tm_mon;
  int __tm_year;
  int __tm_wday;
  int __tm_yday;
  int __tm_isdst;
};







/* Side storage made of two 32-entry arrays of void pointers and two
   __ULong words.  The array length (32) equals the length of the _fns
   table in struct _atexit.
   NOTE(review): presumably slot i here belongs to _fns[i], and the two
   __ULong words are per-slot bitmasks -- confirm.  */
struct _on_exit_args {
 void * _fnargs[32];       /* presumably the argument passed to handler i -- TODO confirm */
 void * _dso_handle[32];   /* presumably the DSO handle registered with handler i -- TODO confirm */

 __ULong _fntypes;         /* presumably one bit per slot describing the handler's calling form -- TODO confirm */


 __ULong _is_cxa;          /* presumably one bit per slot marking __cxa_atexit-style handlers -- TODO confirm */
};


/* Table of 32 pointers to void(void) functions, with a link to another
   struct _atexit and a pointer to a separate struct _on_exit_args.  */
struct _atexit {
 struct _atexit *_next;    /* link to another table */
 int _ind;                 /* presumably the next free index into _fns -- TODO confirm */
 void (*_fns[32])(void);   /* the function-pointer slots */
        struct _on_exit_args * _on_exit_args_ptr;  /* argument record, held by pointer rather than embedded */
};
# 115 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/reent.h" 3
/* Buffer descriptor: a byte pointer paired with an int size.  Embedded
   by value in the stream records struct __sFILE_fake and struct __sFILE
   (members _bf, _ub, _lb).  */
struct __sbuf {
 unsigned char *_base;   /* start of the buffer */
 int _size;              /* presumably the buffer length in bytes -- TODO confirm units */
};
# 151 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/reent.h" 3
/* Abbreviated stream record.  Its eight members have the same names,
   types and order as the first eight members of struct __sFILE declared
   later in this file.
   NOTE(review): presumably this lets the __sf_fake_stdin/stdout/stderr
   objects stand in for the leading part of a struct __sFILE -- confirm
   how the library uses them.  */
struct __sFILE_fake {
  unsigned char *_p;
  int _r;
  int _w;
  short _flags;
  short _file;
  struct __sbuf _bf;
  int _lbfsize;

  struct _reent *_data;
};




extern void __sinit (struct _reent *);
# 179 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/reent.h" 3
/* Full stream record (typedef'd to __FILE further down in this file).
   It groups: a byte pointer with two int counters, flag and file-number
   shorts, a primary buffer descriptor, a pointer to a struct _reent, an
   opaque cookie with four I/O callbacks, auxiliary buffers, a block
   size/offset pair, a lock and a multibyte state.
   NOTE(review): the member meanings noted below are inferred from names
   and types only -- confirm against the C library's stdio sources.  */
struct __sFILE {
  unsigned char *_p;        /* presumably the current position in the buffer -- TODO confirm */
  int _r;                   /* presumably bytes left to read -- TODO confirm */
  int _w;                   /* presumably bytes left to write -- TODO confirm */
  short _flags;
  short _file;              /* presumably the underlying file descriptor -- TODO confirm */
  struct __sbuf _bf;        /* primary buffer descriptor */
  int _lbfsize;


  struct _reent *_data;     /* associated struct _reent */



  void * _cookie;           /* opaque pointer; presumably what the callbacks below receive as their void * argument -- TODO confirm */

  int (* _read) (struct _reent *, void *, char *, int)
                                          ;
  int (* _write) (struct _reent *, void *, const char *, int)

                                   ;
  _fpos_t (* _seek) (struct _reent *, void *, _fpos_t, int);
  int (* _close) (struct _reent *, void *);


  struct __sbuf _ub;        /* presumably the ungetc buffer -- TODO confirm */
  unsigned char *_up;
  int _ur;


  unsigned char _ubuf[3];   /* small inline byte buffers */
  unsigned char _nbuf[1];


  struct __sbuf _lb;


  int _blksize;
  _off_t _offset;






  _flock_t _lock;           /* _flock_t is a typedef of _LOCK_RECURSIVE_T in this file */

  _mbstate_t _mbstate;
  int _flags2;
};
# 285 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/reent.h" 3
typedef struct __sFILE __FILE;



/* Chain node that refers to a group of __FILE objects: a link to
   another struct _glue, an int count and a __FILE pointer.  */
struct _glue
{
  struct _glue *_next;   /* link to another node */
  int _niobs;            /* presumably the number of __FILE objects at _iobs -- TODO confirm */
  __FILE *_iobs;         /* pointer to __FILE storage */
};
# 317 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/reent.h" 3
/* State record with two 3-element unsigned short arrays, one unsigned
   short and one unsigned long long.
   NOTE(review): judging by the names these are the seed, multiplier and
   addend used by the *rand48 functions declared in this file, with
   _rand_next serving a different generator -- confirm.  */
struct _rand48 {
  unsigned short _seed[3];
  unsigned short _mult[3];
  unsigned short _add;


  __extension__ unsigned long long _rand_next;  /* __extension__ marks the long long as a deliberate GNU extension */

};
# 342 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/reent.h" 3
/* Bookkeeping for struct _Bigint objects: two _Bigint pointers, an int,
   and a pointer-to-pointer.  Referenced from struct _reent through its
   _mp member.
   NOTE(review): the per-field meanings are inferred from names only --
   confirm.  */
struct _mprec
{

  struct _Bigint *_result;      /* presumably the most recent result -- TODO confirm */
  int _result_k;
  struct _Bigint *_p5s;         /* presumably cached powers of 5 -- TODO confirm */
  struct _Bigint **_freelist;   /* presumably an array of free lists -- TODO confirm */
};


/* Miscellaneous state: a char pointer, an 8-byte char buffer, an int
   and eight _mbstate_t records.  Referenced from struct _reent through
   its _misc member.
   NOTE(review): each member is presumably the private state of the
   library function its name echoes (strtok, mblen, wctomb, l64a, ...) --
   confirm.  */
struct _misc_reent
{

  char *_strtok_last;
  _mbstate_t _mblen_state;
  _mbstate_t _wctomb_state;
  _mbstate_t _mbtowc_state;
  char _l64a_buf[8];
  int _getdate_err;
  _mbstate_t _mbrlen_state;
  _mbstate_t _mbrtowc_state;
  _mbstate_t _mbsrtowcs_state;
  _mbstate_t _wcrtomb_state;
  _mbstate_t _wcsrtombs_state;
};



/* Library state block.  The *_r functions declared in this file (for
   example _atoi_r, _malloc_r, _strtol_r) take a pointer to this type as
   their first parameter, and _impure_ptr / _global_impure_ptr are
   declared as pointers to it.
   Most members are pointers to separately declared records (_mprec,
   _rand48, __tm, _misc_reent); _atexit0 and __sglue are embedded by
   value.
   NOTE(review): the member meanings noted below are inferred from names
   and types only -- confirm against the C library's reentrancy
   documentation.  */
struct _reent
{


  int _errno;                           /* presumably the errno value for this context -- TODO confirm */




  __FILE *_stdin, *_stdout, *_stderr;   /* three stream pointers */

  int _inc;

  char *_emergency;

  int __sdidinit;                       /* presumably nonzero once stdio has been initialised (cf. __sinit) -- TODO confirm */

  int _current_category;
  const char *_current_locale;

  struct _mprec *_mp;

  void (* __cleanup) (struct _reent *); /* callback that receives a struct _reent pointer */

  int _gamma_signgam;


  int _cvtlen;
  char *_cvtbuf;

  struct _rand48 *_r48;
  struct __tm *_localtime_buf;
  char *_asctime_buf;


  void (**(_sig_func))(int);            /* pointer to pointer(s) to void(int) functions */



  struct _atexit *_atexit;              /* pointer to a struct _atexit table */
  struct _atexit _atexit0;              /* one table embedded by value; presumably the initial target of _atexit -- TODO confirm */


  struct _glue __sglue;
  __FILE *__sf;
  struct _misc_reent *_misc;
  char *_signal_buf;
};

extern const struct __sFILE_fake __sf_fake_stdin;
extern const struct __sFILE_fake __sf_fake_stdout;
extern const struct __sFILE_fake __sf_fake_stderr;
# 762 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/reent.h" 3
extern struct _reent *_impure_ptr ;
extern struct _reent *const _global_impure_ptr ;

void _reclaim_reent (struct _reent *);
# 19 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdlib.h" 2 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/stdlib.h" 1 3
# 20 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdlib.h" 2 3

# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/alloca.h" 1 3
# 22 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdlib.h" 2 3








/* Pair of int results; the return type of div() declared later in this
   file.  */
typedef struct
{
  int quot;   /* quotient */
  int rem;    /* remainder */
} div_t;

/* Pair of long results; the return type of ldiv() declared later in
   this file.  */
typedef struct
{
  long quot;   /* quotient */
  long rem;    /* remainder */
} ldiv_t;




/* Pair of long long results; the return type of lldiv() declared later
   in this file.  */
typedef struct
{
  long long int quot;   /* quotient */
  long long int rem;    /* remainder */
} lldiv_t;




typedef int (*__compar_fn_t) (const void *, const void *);







int __locale_mb_cur_max (void);



void abort (void) __attribute__ ((noreturn));
int abs (int);
int atexit (void (*__func)(void));
double atof (const char *__nptr);

float atoff (const char *__nptr);

int atoi (const char *__nptr);
int _atoi_r (struct _reent *, const char *__nptr);
long atol (const char *__nptr);
long _atol_r (struct _reent *, const char *__nptr);
void * bsearch (const void * __key, const void * __base, size_t __nmemb, size_t __size, __compar_fn_t _compar)



                                ;
void * calloc (size_t __nmemb, size_t __size) ;
div_t div (int __numer, int __denom);
void exit (int __status) __attribute__ ((noreturn));
void free (void *) ;
char * getenv (const char *__string);
char * _getenv_r (struct _reent *, const char *__string);
char * _findenv (const char *, int *);
char * _findenv_r (struct _reent *, const char *, int *);

extern char *suboptarg;
int getsubopt (char **, char * const *, char **);

long labs (long);
ldiv_t ldiv (long __numer, long __denom);
void * malloc (size_t __size) ;
int mblen (const char *, size_t);
int _mblen_r (struct _reent *, const char *, size_t, _mbstate_t *);
int mbtowc (wchar_t *__restrict, const char *__restrict, size_t);
int _mbtowc_r (struct _reent *, wchar_t *__restrict, const char *__restrict, size_t, _mbstate_t *);
int wctomb (char *, wchar_t);
int _wctomb_r (struct _reent *, char *, wchar_t, _mbstate_t *);
size_t mbstowcs (wchar_t *__restrict, const char *__restrict, size_t);
size_t _mbstowcs_r (struct _reent *, wchar_t *__restrict, const char *__restrict, size_t, _mbstate_t *);
size_t wcstombs (char *__restrict, const wchar_t *__restrict, size_t);
size_t _wcstombs_r (struct _reent *, char *__restrict, const wchar_t *__restrict, size_t, _mbstate_t *);


char * mkdtemp (char *);
int mkostemp (char *, int);
int mkostemps (char *, int, int);
int mkstemp (char *);
int mkstemps (char *, int);
char * mktemp (char *) __attribute__ ((__warning__ ("the use of `mktemp' is dangerous; use `mkstemp' instead")));

char * _mkdtemp_r (struct _reent *, char *);
int _mkostemp_r (struct _reent *, char *, int);
int _mkostemps_r (struct _reent *, char *, int, int);
int _mkstemp_r (struct _reent *, char *);
int _mkstemps_r (struct _reent *, char *, int);
char * _mktemp_r (struct _reent *, char *) __attribute__ ((__warning__ ("the use of `mktemp' is dangerous; use `mkstemp' instead")));

void qsort (void * __base, size_t __nmemb, size_t __size, __compar_fn_t _compar);
int rand (void);
void * realloc (void * __r, size_t __size) ;

void * reallocf (void * __r, size_t __size);
char * realpath (const char *__restrict path, char *__restrict resolved_path);

void srand (unsigned __seed);
double strtod (const char *__restrict __n, char **__restrict __end_PTR);
double _strtod_r (struct _reent *,const char *__restrict __n, char **__restrict __end_PTR);

float strtof (const char *__restrict __n, char **__restrict __end_PTR);







long strtol (const char *__restrict __n, char **__restrict __end_PTR, int __base);
long _strtol_r (struct _reent *,const char *__restrict __n, char **__restrict __end_PTR, int __base);
unsigned long strtoul (const char *__restrict __n, char **__restrict __end_PTR, int __base);
unsigned long _strtoul_r (struct _reent *,const char *__restrict __n, char **__restrict __end_PTR, int __base);

int system (const char *__string);


long a64l (const char *__input);
char * l64a (long __input);
char * _l64a_r (struct _reent *,long __input);
int on_exit (void (*__func)(int, void *),void * __arg);
void _Exit (int __status) __attribute__ ((noreturn));
int putenv (char *__string);
int _putenv_r (struct _reent *, char *__string);
void * _reallocf_r (struct _reent *, void *, size_t);
int setenv (const char *__string, const char *__value, int __overwrite);
int _setenv_r (struct _reent *, const char *__string, const char *__value, int __overwrite);

char * gcvt (double,int,char *);
char * gcvtf (float,int,char *);
char * fcvt (double,int,int *,int *);
char * fcvtf (float,int,int *,int *);
char * ecvt (double,int,int *,int *);
char * ecvtbuf (double, int, int*, int*, char *);
char * fcvtbuf (double, int, int*, int*, char *);
char * ecvtf (float,int,int *,int *);
char * dtoa (double, int, int, int *, int*, char**);
int rand_r (unsigned *__seed);

double drand48 (void);
double _drand48_r (struct _reent *);
double erand48 (unsigned short [3]);
double _erand48_r (struct _reent *, unsigned short [3]);
long jrand48 (unsigned short [3]);
long _jrand48_r (struct _reent *, unsigned short [3]);
void lcong48 (unsigned short [7]);
void _lcong48_r (struct _reent *, unsigned short [7]);
long lrand48 (void);
long _lrand48_r (struct _reent *);
long mrand48 (void);
long _mrand48_r (struct _reent *);
long nrand48 (unsigned short [3]);
long _nrand48_r (struct _reent *, unsigned short [3]);
unsigned short *
       seed48 (unsigned short [3]);
unsigned short *
       _seed48_r (struct _reent *, unsigned short [3]);
void srand48 (long);
void _srand48_r (struct _reent *, long);
long long atoll (const char *__nptr);
long long _atoll_r (struct _reent *, const char *__nptr);
long long llabs (long long);
lldiv_t lldiv (long long __numer, long long __denom);


long long strtoll (const char *__restrict __n, char **__restrict __end_PTR, int __base);


long long _strtoll_r (struct _reent *, const char *__restrict __n, char **__restrict __end_PTR, int __base);


unsigned long long strtoull (const char *__restrict __n, char **__restrict __end_PTR, int __base);


unsigned long long _strtoull_r (struct _reent *, const char *__restrict __n, char **__restrict __end_PTR, int __base);


void cfree (void *);
int unsetenv (const char *__string);
int _unsetenv_r (struct _reent *, const char *__string);
# 221 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdlib.h" 3
char * _dtoa_r (struct _reent *, double, int, int, int *, int*, char**);

void * _malloc_r (struct _reent *, size_t) ;
void * _calloc_r (struct _reent *, size_t, size_t) ;
void _free_r (struct _reent *, void *) ;
void * _realloc_r (struct _reent *, void *, size_t) ;
void _mstats_r (struct _reent *, char *);

int _system_r (struct _reent *, const char *);

void __eprintf (const char *, const char *, unsigned int, const char *);




extern long double strtold (const char *__restrict, char **__restrict);




# 19 "./office_stringsearch1/src/bmhisrch.c" 2
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/string.h" 1 3
# 12 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/string.h" 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/cdefs.h" 1 3
# 45 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/cdefs.h" 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include/stddef.h" 1 3 4
# 46 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/cdefs.h" 2 3
# 13 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/string.h" 2 3




# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include/stddef.h" 1 3 4
# 18 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/string.h" 2 3



void * memchr (const void *, int, size_t);
int memcmp (const void *, const void *, size_t);
void * memcpy (void * restrict, const void * restrict, size_t);
void * memmove (void *, const void *, size_t);
void * memset (void *, int, size_t);
char *strcat (char *restrict, const char *restrict);
char *strchr (const char *, int);
int strcmp (const char *, const char *);
int strcoll (const char *, const char *);
char *strcpy (char *restrict, const char *restrict);
size_t strcspn (const char *, const char *);
char *strerror (int);
size_t strlen (const char *);
char *strncat (char *restrict, const char *restrict, size_t);
int strncmp (const char *, const char *, size_t);
char *strncpy (char *restrict, const char *restrict, size_t);
char *strpbrk (const char *, const char *);
char *strrchr (const char *, int);
size_t strspn (const char *, const char *);
char *strstr (const char *, const char *);


char *strtok (char *restrict, const char *restrict);


size_t strxfrm (char *restrict, const char *restrict, size_t);


char *strtok_r (char *restrict, const char *restrict, char **restrict);

int bcmp (const void *, const void *, size_t);
void bcopy (const void *, void *, size_t);
void bzero (void *, size_t);
int ffs (int);
char *index (const char *, int);
void * memccpy (void * restrict, const void * restrict, int, size_t);
void * mempcpy (void *, const void *, size_t);
void * memmem (const void *, size_t, const void *, size_t);
void * memrchr (const void *, int, size_t);
void * rawmemchr (const void *, int);
char *rindex (const char *, int);
char *stpcpy (char *restrict, const char *restrict);
char *stpncpy (char *restrict, const char *restrict, size_t);
int strcasecmp (const char *, const char *);
char *strcasestr (const char *, const char *);
char *strchrnul (const char *, int);


char *strdup (const char *);


char *_strdup_r (struct _reent *, const char *);


char *strndup (const char *, size_t);


char *_strndup_r (struct _reent *, const char *, size_t);
# 87 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/string.h" 3
int strerror_r (int, char *, size_t) __asm__ ("" "__xpg_strerror_r");





size_t strlcat (char *, const char *, size_t);
size_t strlcpy (char *, const char *, size_t);
int strncasecmp (const char *, const char *, size_t);
size_t strnlen (const char *, size_t);
char *strsep (char **, const char *);
char *strlwr (char *);
char *strupr (char *);

char *strsignal (int __signo);






char * _strerror_r (struct _reent *, int, int, int *);
# 140 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/string.h" 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/string.h" 1 3
# 141 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/string.h" 2 3


# 20 "./office_stringsearch1/src/bmhisrch.c" 2
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/ctype.h" 1 3







int isalnum (int __c);
int isalpha (int __c);
int iscntrl (int __c);
int isdigit (int __c);
int isgraph (int __c);
int islower (int __c);
int isprint (int __c);
int ispunct (int __c);
int isspace (int __c);
int isupper (int __c);
int isxdigit (int __c);
int tolower (int __c);
int toupper (int __c);


int isblank (int __c);



int isascii (int __c);
int toascii (int __c);
# 43 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/ctype.h" 3
const

extern char *__ctype_ptr__;
# 109 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/ctype.h" 3
extern const char _ctype_[];


# 21 "./office_stringsearch1/src/bmhisrch.c" 2

# 21 "./office_stringsearch1/src/bmhisrch.c"
typedef unsigned char uchar;

void bmhi_init(const char *);
char *bmhi_search(const char *, const int);
void bhmi_cleanup(void);




/* Length of the current pattern; assigned from strlen() in bmhi_init. */
static int patlen;
/* Shift table indexed by byte value.  The bound expands to
   (0x7f * 2 + 1) + 1, i.e. 256 entries. */
static int skip[
# 31 "./office_stringsearch1/src/bmhisrch.c" 3 4
               (0x7f * 2 + 1)
# 31 "./office_stringsearch1/src/bmhisrch.c"
                        +1];
/* Secondary shift computed in bmhi_init and added to the text index by
   bmhi_search after a failed comparison. */
static int skip2;

/* Heap buffer holding the converted pattern; starts as a null pointer and is
   (re)allocated in bmhi_init and released in bhmi_cleanup. */
static uchar *pat = 
# 34 "./office_stringsearch1/src/bmhisrch.c" 3 4
                   ((void *)0)
# 34 "./office_stringsearch1/src/bmhisrch.c"
                       ;







/*
 * Prepares the file-scope search state for `pattern`:
 *   - stores its length in patlen,
 *   - stores an upper-cased copy of it in pat,
 *   - fills the skip table and computes skip2.
 *
 * Exits the process with status 1 if the buffer cannot be allocated.
 *
 * NOTE(review): atexit(bhmi_cleanup) is called on every successful
 * invocation, so calling this function N times registers the handler N times.
 * NOTE(review): for an empty pattern patlen is 0 and pat[patlen - 1] below
 * reads pat[-1]; callers presumably never pass "" -- confirm.
 */
void bmhi_init(const char *pattern)
{
      int i, lastpatchar;
      patlen = strlen(pattern);



      /* Resize the buffer from any previous call (pat is null the first time). */
      pat = realloc ((void*)pat, patlen);
      if (!pat)
            exit(1);
      else atexit(bhmi_cleanup);
      /* Copy the pattern into pat.  The expanded expression below looks the
         character up through __ctype_ptr__ and, when its class bits (01|02)
         equal 02, replaces it with c - 'a' + 'A' (i.e. upper-cases it);
         otherwise the character is stored unchanged. */
      for (i=0; i < patlen; i++)
            pat[i] = 
# 54 "./office_stringsearch1/src/bmhisrch.c" 3
                    __extension__ ({ __typeof__ (
# 54 "./office_stringsearch1/src/bmhisrch.c"
                    pattern[i]
# 54 "./office_stringsearch1/src/bmhisrch.c" 3
                    ) __x = (
# 54 "./office_stringsearch1/src/bmhisrch.c"
                    pattern[i]
# 54 "./office_stringsearch1/src/bmhisrch.c" 3
                    ); ((((__ctype_ptr__+sizeof(""[__x]))[(int)(__x)])&(01|02))==02) ? (int) __x - 'a' + 'A' : (int) __x;})
# 54 "./office_stringsearch1/src/bmhisrch.c"
                                       ;



      /* Default shift for every byte value (0 .. 0x7f * 2 + 1) is the full
         pattern length. */
      for ( i = 0; i <= 
# 58 "./office_stringsearch1/src/bmhisrch.c" 3 4
                       (0x7f * 2 + 1)
# 58 "./office_stringsearch1/src/bmhisrch.c"
                                ; ++i )
            skip[i] = patlen;
      /* For every pattern character except the last, record its distance to
         the end of the pattern.  The entry is written twice: once for the
         stored (upper-cased) character and once for the value produced by the
         second expansion, which maps class-01 characters to c - 'A' + 'a'
         (the lower-case form).  Later positions overwrite earlier ones. */
      for ( i = 0; i < patlen - 1; ++i )
      {
            skip[ pat[i] ] = patlen - i - 1;
            skip[
# 63 "./office_stringsearch1/src/bmhisrch.c" 3
                __extension__ ({ __typeof__ (
# 63 "./office_stringsearch1/src/bmhisrch.c"
                pat[i]
# 63 "./office_stringsearch1/src/bmhisrch.c" 3
                ) __x = (
# 63 "./office_stringsearch1/src/bmhisrch.c"
                pat[i]
# 63 "./office_stringsearch1/src/bmhisrch.c" 3
                ); ((((__ctype_ptr__+sizeof(""[__x]))[(int)(__x)])&(01|02))==01) ? (int) __x - 'A' + 'a' : (int) __x;})
# 63 "./office_stringsearch1/src/bmhisrch.c"
                               ] = patlen - i - 1;
      }
      /* The last pattern character (in both case forms) gets the sentinel
         32767 instead of a real distance; bmhi_search tests for and subtracts
         this value. */
      lastpatchar = pat[patlen - 1];
      skip[ lastpatchar ] = 32767;
      skip[
# 67 "./office_stringsearch1/src/bmhisrch.c" 3
          __extension__ ({ __typeof__ (
# 67 "./office_stringsearch1/src/bmhisrch.c"
          lastpatchar
# 67 "./office_stringsearch1/src/bmhisrch.c" 3
          ) __x = (
# 67 "./office_stringsearch1/src/bmhisrch.c"
          lastpatchar
# 67 "./office_stringsearch1/src/bmhisrch.c" 3
          ); ((((__ctype_ptr__+sizeof(""[__x]))[(int)(__x)])&(01|02))==01) ? (int) __x - 'A' + 'a' : (int) __x;})
# 67 "./office_stringsearch1/src/bmhisrch.c"
                              ] = 32767;
      /* skip2 is the distance from the rightmost earlier occurrence of the
         last character to the end of the pattern, or patlen if that character
         occurs nowhere else. */
      skip2 = patlen;
      for (i = 0; i < patlen - 1; ++i)
      {
            if ( pat[i] == lastpatchar )
                  skip2 = patlen - i - 1;
      }
}

/*
 * Searches the first `stringlen` bytes of `string` for the pattern prepared
 * by bmhi_init, upper-casing each text character before comparing it with
 * the stored pattern.
 *
 * Returns a pointer to the start of the first match inside `string`, or a
 * null pointer when there is none (including when patlen - 1 >= stringlen).
 *
 * NOTE(review): the sentinel arithmetic appears to rely on stringlen being
 * well below 32767 -- confirm against callers.
 */
char *bmhi_search(const char *string, const int stringlen)
{
      int i, j;
      char *s;

      /* i is kept as a negative offset from the end of the text. */
      i = patlen - 1 - stringlen;
      if (i >= 0)
            return 
# 83 "./office_stringsearch1/src/bmhisrch.c" 3 4
                  ((void *)0)
# 83 "./office_stringsearch1/src/bmhisrch.c"
                      ;
      /* Point `string` one past the last text byte so string[i] with a
         negative i addresses the text. */
      string += stringlen;
      for ( ;; )
      {
            /* Keep shifting by the table value of the current text byte until
               i is no longer negative.  Hitting the last pattern character
               adds the 32767 sentinel, which also ends the loop. */
            while ( (i += skip[((uchar *)string)[i]]) < 0 )
                  ;
            /* i left the negative range without the sentinel having been
               added: the text is exhausted, so report no match. */
            if (i < (32767 - stringlen))
                  return 
# 90 "./office_stringsearch1/src/bmhisrch.c" 3 4
                        ((void *)0)
# 90 "./office_stringsearch1/src/bmhisrch.c"
                            ;
            /* Remove the sentinel to recover the real (negative) offset of
               the candidate's last character. */
            i -= 32767;
            j = patlen - 1;
            s = (char *)string + (i - j);
            /* Compare the remaining characters from right to left; each text
               character goes through the same upper-casing expansion used in
               bmhi_init before being compared with pat[j]. */
            while ( --j >= 0 && 
# 94 "./office_stringsearch1/src/bmhisrch.c" 3
                               __extension__ ({ __typeof__ (
# 94 "./office_stringsearch1/src/bmhisrch.c"
                               s[j]
# 94 "./office_stringsearch1/src/bmhisrch.c" 3
                               ) __x = (
# 94 "./office_stringsearch1/src/bmhisrch.c"
                               s[j]
# 94 "./office_stringsearch1/src/bmhisrch.c" 3
                               ); ((((__ctype_ptr__+sizeof(""[__x]))[(int)(__x)])&(01|02))==02) ? (int) __x - 'a' + 'A' : (int) __x;}) 
# 94 "./office_stringsearch1/src/bmhisrch.c"
                                             == pat[j] )
                  ;
            /* Every position matched. */
            if ( j < 0 )
                  return s;
            /* Mismatch: advance by skip2; reaching a non-negative offset
               means the text is exhausted. */
            if ( (i += skip2) >= 0 )
                  return 
# 99 "./office_stringsearch1/src/bmhisrch.c" 3 4
                        ((void *)0)
# 99 "./office_stringsearch1/src/bmhisrch.c"
                            ;
      }
}

/*
 * Releases the pattern buffer allocated by bmhi_init.
 *
 * bmhi_init registers this function with atexit() on every call, so it can
 * run more than once at process exit.  The pointer is therefore cleared after
 * being freed: repeated invocations then call free() on a null pointer, which
 * is a no-op, instead of freeing the same block twice.  Clearing it also
 * makes a later bmhi_init() start from a null pointer rather than passing a
 * dangling one to realloc().
 *
 * This is preprocessed source, so the null pointer constant is spelled in its
 * expanded form.
 */
void bhmi_cleanup(void)
{
      free(pat);
      pat = ((void *)0);
}

[-- Attachment #3: correct.E --]
[-- Type: application/octet-stream, Size: 118803 bytes --]

# 1 "./office_ispell/src/correct.c"
# 1 "/proj/esdt_sdk/aagarwa/mb_mibench_eembc_tests_split_paths/tests/suites/mibench_v1.3//"
# 1 "<built-in>"
# 1 "<command-line>"
# 1 "./office_ispell/src/correct.c"

static char Rcs_Id[] =
    "$Id: //IP3/DEV/hw/microblaze/microblaze_v9_4/MB_ISA/testCases/suites/mibench_v1.3/office_ispell/src/correct.c#1 $";
# 133 "./office_ispell/src/correct.c"
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/ctype.h" 1 3



# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/_ansi.h" 1 3
# 15 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/_ansi.h" 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/newlib.h" 1 3
# 16 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/_ansi.h" 2 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/config.h" 1 3



# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/ieeefp.h" 1 3
# 5 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/config.h" 2 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/features.h" 1 3
# 6 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/config.h" 2 3
# 17 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/_ansi.h" 2 3
# 5 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/ctype.h" 2 3




# 8 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/ctype.h" 3
int isalnum (int __c);
int isalpha (int __c);
int iscntrl (int __c);
int isdigit (int __c);
int isgraph (int __c);
int islower (int __c);
int isprint (int __c);
int ispunct (int __c);
int isspace (int __c);
int isupper (int __c);
int isxdigit (int __c);
int tolower (int __c);
int toupper (int __c);


int isblank (int __c);



int isascii (int __c);
int toascii (int __c);
# 43 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/ctype.h" 3
const

extern char *__ctype_ptr__;
# 109 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/ctype.h" 3
extern const char _ctype_[];


# 134 "./office_ispell/src/correct.c" 2
# 1 "./office_ispell/src/config.h" 1
# 110 "./office_ispell/src/config.h"
# 1 "./office_ispell/src/local.h" 1
# 111 "./office_ispell/src/config.h" 2
# 120 "./office_ispell/src/config.h"
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/param.h" 1 3
# 9 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/param.h" 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/syslimits.h" 1 3
# 10 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/param.h" 2 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/endian.h" 1 3
# 11 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/param.h" 2 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/param.h" 1 3
# 12 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/param.h" 2 3
# 121 "./office_ispell/src/config.h" 2
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/types.h" 1 3
# 20 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/types.h" 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/_ansi.h" 1 3
# 21 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/types.h" 2 3




# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/_types.h" 1 3






# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/_default_types.h" 1 3
# 27 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/_default_types.h" 3
typedef signed char __int8_t;
typedef unsigned char __uint8_t;
# 37 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/_default_types.h" 3
typedef short int __int16_t;
typedef short unsigned int __uint16_t;
# 55 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/_default_types.h" 3
typedef long int __int32_t;
typedef long unsigned int __uint32_t;
# 77 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/_default_types.h" 3
typedef long long int __int64_t;
typedef long long unsigned int __uint64_t;
# 104 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/_default_types.h" 3
typedef signed char __int_least8_t;
typedef unsigned char __uint_least8_t;
# 126 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/_default_types.h" 3
typedef short int __int_least16_t;
typedef short unsigned int __uint_least16_t;
# 144 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/_default_types.h" 3
typedef long int __int_least32_t;
typedef long unsigned int __uint_least32_t;
# 158 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/_default_types.h" 3
typedef long long int __int_least64_t;
typedef long long unsigned int __uint_least64_t;
# 168 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/_default_types.h" 3
typedef int __intptr_t;
typedef unsigned int __uintptr_t;
# 8 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/_types.h" 2 3
# 26 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/types.h" 2 3
# 61 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/types.h" 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/_types.h" 1 3
# 13 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/_types.h" 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/lock.h" 1 3





typedef int _LOCK_T;
typedef int _LOCK_RECURSIVE_T;
# 14 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/_types.h" 2 3


typedef long _off_t;



typedef short __dev_t;



typedef unsigned short __uid_t;


typedef unsigned short __gid_t;



__extension__ typedef long long _off64_t;







typedef long _fpos_t;
# 55 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/_types.h" 3
typedef signed int _ssize_t;
# 67 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/_types.h" 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include/stddef.h" 1 3 4
# 357 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include/stddef.h" 3 4
typedef unsigned int wint_t;
# 68 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/_types.h" 2 3



/* State record (the name suggests multibyte conversion state): an int counter
   plus a union that overlays a wint_t with a four-element unsigned char
   array. */
typedef struct
{
  int __count;
  union
  {
    wint_t __wch;
    unsigned char __wchb[4];
  } __value;
} _mbstate_t;



typedef _LOCK_RECURSIVE_T _flock_t;




typedef void *_iconv_t;
# 62 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/types.h" 2 3







# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include/stddef.h" 1 3 4
# 149 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include/stddef.h" 3 4
typedef int ptrdiff_t;
# 216 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include/stddef.h" 3 4
typedef unsigned int size_t;
# 328 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include/stddef.h" 3 4
typedef int wchar_t;
# 426 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include/stddef.h" 3 4
/* Two members whose attributes request the alignment of long long and of
   long double respectively, so the struct is at least as strictly aligned as
   both. */
typedef struct {
  long long __max_align_ll __attribute__((__aligned__(__alignof__(long long))));
  long double __max_align_ld __attribute__((__aligned__(__alignof__(long double))));
} max_align_t;
# 70 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/types.h" 2 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/types.h" 1 3
# 19 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/types.h" 3
typedef long int __off_t;
typedef int __pid_t;

__extension__ typedef long long int __loff_t;
# 71 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/types.h" 2 3
# 93 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/types.h" 3
typedef unsigned char u_char;



typedef unsigned short u_short;



typedef unsigned int u_int;



typedef unsigned long u_long;





typedef unsigned short ushort;
typedef unsigned int uint;
typedef unsigned long ulong;



typedef unsigned long clock_t;




typedef long time_t;







/* Time value split into a time_t field and a long field; going by the member
   names these are presumably whole seconds and nanoseconds. */
struct timespec {
  time_t tv_sec;
  long tv_nsec;
};


/* Pair of struct timespec values: it_interval and it_value. */
struct itimerspec {
  struct timespec it_interval;
  struct timespec it_value;
};


typedef long daddr_t;



typedef char * caddr_t;
# 155 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/types.h" 3
typedef unsigned short ino_t;
# 184 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/types.h" 3
typedef _off_t off_t;
typedef __dev_t dev_t;
typedef __uid_t uid_t;
typedef __gid_t gid_t;





typedef int pid_t;







typedef long key_t;

typedef _ssize_t ssize_t;
# 217 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/types.h" 3
typedef unsigned int mode_t __attribute__ ((__mode__ (__SI__)));




typedef unsigned short nlink_t;
# 244 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/types.h" 3
typedef long fd_mask;







/* Array of fd_mask words.  The bound is 64 divided by the bit width of
   fd_mask (computed as sizeof (fd_mask) * 8), rounded up, so the array
   provides at least 64 bits. */
typedef struct _types_fd_set {
 fd_mask fds_bits[(((64)+(((sizeof (fd_mask) * 8))-1))/((sizeof (fd_mask) * 8)))];
} _types_fd_set;
# 275 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/types.h" 3
typedef unsigned long clockid_t;




typedef unsigned long timer_t;



typedef unsigned long useconds_t;
typedef long suseconds_t;
# 122 "./office_ispell/src/config.h" 2

# 1 "./hal/include/sys/dir.h" 1
# 124 "./office_ispell/src/config.h" 2
# 135 "./office_ispell/src/correct.c" 2
# 1 "./office_ispell/src/ispell.h" 1
# 103 "./office_ispell/src/ispell.h"
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdio.h" 1 3
# 35 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdio.h" 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include/stddef.h" 1 3 4
# 36 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdio.h" 2 3


# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include/stdarg.h" 1 3 4
# 40 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include/stdarg.h" 3 4
typedef __builtin_va_list __gnuc_va_list;
# 39 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdio.h" 2 3







# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/reent.h" 1 3
# 14 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/reent.h" 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include/stddef.h" 1 3 4
# 15 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/reent.h" 2 3







typedef unsigned long __ULong;
# 38 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/reent.h" 3
struct _reent;






/* Singly linked node (_next) carrying four int fields and an array of __ULong
   declared with one element.
   NOTE(review): the one-element _x is presumably the trailing-array idiom for
   variable-length data -- confirm against the allocating code. */
struct _Bigint
{
  struct _Bigint *_next;
  int _k, _maxwds, _sign, _wds;
  __ULong _x[1];
};


/* Nine int fields whose names mirror the members of the standard struct tm
   with a "__" prefix. */
struct __tm
{
  int __tm_sec;
  int __tm_min;
  int __tm_hour;
  int __tm_mday;
  int __tm_mon;
  int __tm_year;
  int __tm_wday;
  int __tm_yday;
  int __tm_isdst;
};







/* Side data referenced from struct _atexit through _on_exit_args_ptr:
   32 argument pointers, 32 dso-handle pointers and two __ULong words
   (presumably one bit per slot -- confirm). */
struct _on_exit_args {
 void * _fnargs[32];
 void * _dso_handle[32];

 __ULong _fntypes;


 __ULong _is_cxa;
};


/* Linked-list node (_next) holding an int index _ind, an array of 32
   pointers to void(void) functions, and a pointer to an associated
   struct _on_exit_args. */
struct _atexit {
 struct _atexit *_next;
 int _ind;
 void (*_fns[32])(void);
        struct _on_exit_args * _on_exit_args_ptr;
};
# 115 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/reent.h" 3
/* Buffer descriptor: a base pointer and an int size. */
struct __sbuf {
 unsigned char *_base;
 int _size;
};
# 151 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/reent.h" 3
/* Reduced stream record.  Its members (_p through _data) match the leading
   members of struct __sFILE in name, type and order. */
struct __sFILE_fake {
  unsigned char *_p;
  int _r;
  int _w;
  short _flags;
  short _file;
  struct __sbuf _bf;
  int _lbfsize;

  struct _reent *_data;
};




extern void __sinit (struct _reent *);
# 179 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/reent.h" 3
/* Stream record (typedef'd to __FILE just below).  It groups a position
   pointer and counters, flags and a file number, several __sbuf buffer
   descriptors, four I/O callbacks, and lock / multibyte state. */
struct __sFILE {
  unsigned char *_p;
  int _r;
  int _w;
  short _flags;
  short _file;
  struct __sbuf _bf;
  int _lbfsize;


  struct _reent *_data;



  void * _cookie;

  /* The four callbacks below all take a struct _reent * followed by a
     void * (presumably _cookie -- confirm) as their first two arguments. */
  int (* _read) (struct _reent *, void *, char *, int)
                                          ;
  int (* _write) (struct _reent *, void *, const char *, int)

                                   ;
  _fpos_t (* _seek) (struct _reent *, void *, _fpos_t, int);
  int (* _close) (struct _reent *, void *);


  struct __sbuf _ub;
  unsigned char *_up;
  int _ur;


  /* Small in-struct byte arrays (3 bytes and 1 byte). */
  unsigned char _ubuf[3];
  unsigned char _nbuf[1];


  struct __sbuf _lb;


  int _blksize;
  _off_t _offset;






  _flock_t _lock;

  _mbstate_t _mbstate;
  int _flags2;
};
# 285 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/reent.h" 3
typedef struct __sFILE __FILE;



/* Linked-list node (_next) with an int count _niobs and a pointer _iobs to
   __FILE objects. */
struct _glue
{
  struct _glue *_next;
  int _niobs;
  __FILE *_iobs;
};
# 317 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/reent.h" 3
/* Generator state: two three-element unsigned short arrays (_seed, _mult),
   a scalar unsigned short _add, and an unsigned long long _rand_next. */
struct _rand48 {
  unsigned short _seed[3];
  unsigned short _mult[3];
  unsigned short _add;


  __extension__ unsigned long long _rand_next;

};
# 342 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/reent.h" 3
/* Holds struct _Bigint pointers: _result with its int _result_k, _p5s, and a
   pointer-to-pointer _freelist. */
struct _mprec
{

  struct _Bigint *_result;
  int _result_k;
  struct _Bigint *_p5s;
  struct _Bigint **_freelist;
};


/* Miscellaneous state: a char pointer _strtok_last, an 8-byte _l64a_buf, an
   int _getdate_err, and eight _mbstate_t records (by their names, one per
   multibyte / wide-character conversion routine). */
struct _misc_reent
{

  char *_strtok_last;
  _mbstate_t _mblen_state;
  _mbstate_t _wctomb_state;
  _mbstate_t _mbtowc_state;
  char _l64a_buf[8];
  int _getdate_err;
  _mbstate_t _mbrlen_state;
  _mbstate_t _mbrtowc_state;
  _mbstate_t _mbsrtowcs_state;
  _mbstate_t _wcrtomb_state;
  _mbstate_t _wcsrtombs_state;
};



/* Aggregate state record.  Most members are scalars or pointers to separately
   defined records (_mp, _r48, _localtime_buf, _misc, ...); _atexit0 and
   __sglue are embedded by value. */
struct _reent
{


  int _errno;




  /* Three stream pointers. */
  __FILE *_stdin, *_stdout, *_stderr;

  int _inc;

  char *_emergency;

  int __sdidinit;

  int _current_category;
  const char *_current_locale;

  struct _mprec *_mp;

  /* Callback taking a pointer to this record type. */
  void (* __cleanup) (struct _reent *);

  int _gamma_signgam;


  int _cvtlen;
  char *_cvtbuf;

  struct _rand48 *_r48;
  struct __tm *_localtime_buf;
  char *_asctime_buf;


  /* Pointer to pointer(s) to void(int) functions. */
  void (**(_sig_func))(int);



  /* _atexit is a pointer; _atexit0 is a struct _atexit stored inline. */
  struct _atexit *_atexit;
  struct _atexit _atexit0;


  struct _glue __sglue;
  __FILE *__sf;
  struct _misc_reent *_misc;
  char *_signal_buf;
};

extern const struct __sFILE_fake __sf_fake_stdin;
extern const struct __sFILE_fake __sf_fake_stdout;
extern const struct __sFILE_fake __sf_fake_stderr;
# 762 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/reent.h" 3
extern struct _reent *_impure_ptr ;
extern struct _reent *const _global_impure_ptr ;

void _reclaim_reent (struct _reent *);
# 47 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdio.h" 2 3




typedef __FILE FILE;




typedef _fpos_t fpos_t;





# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/stdio.h" 1 3
# 63 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdio.h" 2 3
# 162 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdio.h" 3
FILE * tmpfile (void);
char * tmpnam (char *);
int fclose (FILE *);
int fflush (FILE *);
FILE * freopen (const char *__restrict, const char *__restrict, FILE *__restrict);
void setbuf (FILE *__restrict, char *__restrict);
int setvbuf (FILE *__restrict, char *__restrict, int, size_t);
int fprintf (FILE *__restrict, const char *__restrict, ...) __attribute__ ((__format__ (__printf__, 2, 3)))
                                                            ;
int fscanf (FILE *__restrict, const char *__restrict, ...) __attribute__ ((__format__ (__scanf__, 2, 3)))
                                                           ;
int printf (const char *__restrict, ...) __attribute__ ((__format__ (__printf__, 1, 2)))
                                                            ;
int scanf (const char *__restrict, ...) __attribute__ ((__format__ (__scanf__, 1, 2)))
                                                           ;
int sscanf (const char *__restrict, const char *__restrict, ...) __attribute__ ((__format__ (__scanf__, 2, 3)))
                                                           ;
int vfprintf (FILE *__restrict, const char *__restrict, __gnuc_va_list) __attribute__ ((__format__ (__printf__, 2, 0)))
                                                            ;
int vprintf (const char *, __gnuc_va_list) __attribute__ ((__format__ (__printf__, 1, 0)))
                                                            ;
int vsprintf (char *__restrict, const char *__restrict, __gnuc_va_list) __attribute__ ((__format__ (__printf__, 2, 0)))
                                                            ;
int fgetc (FILE *);
char * fgets (char *__restrict, int, FILE *__restrict);
int fputc (int, FILE *);
int fputs (const char *__restrict, FILE *__restrict);
int getc (FILE *);
int getchar (void);
char * gets (char *);
int putc (int, FILE *);
int putchar (int);
int puts (const char *);
int ungetc (int, FILE *);
size_t fread (void * __restrict, size_t _size, size_t _n, FILE *__restrict);
size_t fwrite (const void * __restrict , size_t _size, size_t _n, FILE *);



int fgetpos (FILE *__restrict, fpos_t *__restrict);

int fseek (FILE *, long, int);



int fsetpos (FILE *, const fpos_t *);

long ftell ( FILE *);
void rewind (FILE *);
void clearerr (FILE *);
int feof (FILE *);
int ferror (FILE *);
void perror (const char *);

FILE * fopen (const char *__restrict _name, const char *__restrict _type);
int sprintf (char *__restrict, const char *__restrict, ...) __attribute__ ((__format__ (__printf__, 2, 3)))
                                                            ;
int remove (const char *);
int rename (const char *, const char *);
void xil_printf (const char*, ...);
void putnum (unsigned int );
void print (char* );
# 233 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdio.h" 3
int fseeko (FILE *, off_t, int);
off_t ftello ( FILE *);




int asiprintf (char **, const char *, ...) __attribute__ ((__format__ (__printf__, 2, 3)))
                                                            ;
char * asniprintf (char *, size_t *, const char *, ...) __attribute__ ((__format__ (__printf__, 3, 4)))
                                                            ;
char * asnprintf (char *__restrict, size_t *__restrict, const char *__restrict, ...) __attribute__ ((__format__ (__printf__, 3, 4)))
                                                            ;
int asprintf (char **__restrict, const char *__restrict, ...) __attribute__ ((__format__ (__printf__, 2, 3)))
                                                            ;

int diprintf (int, const char *, ...) __attribute__ ((__format__ (__printf__, 2, 3)))
                                                            ;

int fcloseall (void);
int fiprintf (FILE *, const char *, ...) __attribute__ ((__format__ (__printf__, 2, 3)))
                                                            ;
int fiscanf (FILE *, const char *, ...) __attribute__ ((__format__ (__scanf__, 2, 3)))
                                                           ;
int iprintf (const char *, ...) __attribute__ ((__format__ (__printf__, 1, 2)))
                                                            ;
int iscanf (const char *, ...) __attribute__ ((__format__ (__scanf__, 1, 2)))
                                                           ;
int siprintf (char *, const char *, ...) __attribute__ ((__format__ (__printf__, 2, 3)))
                                                            ;
int siscanf (const char *, const char *, ...) __attribute__ ((__format__ (__scanf__, 2, 3)))
                                                           ;
int snprintf (char *__restrict, size_t, const char *__restrict, ...) __attribute__ ((__format__ (__printf__, 3, 4)))
                                                            ;
int sniprintf (char *, size_t, const char *, ...) __attribute__ ((__format__ (__printf__, 3, 4)))
                                                            ;
char * tempnam (const char *, const char *);
int vasiprintf (char **, const char *, __gnuc_va_list) __attribute__ ((__format__ (__printf__, 2, 0)))
                                                            ;
char * vasniprintf (char *, size_t *, const char *, __gnuc_va_list) __attribute__ ((__format__ (__printf__, 3, 0)))
                                                            ;
char * vasnprintf (char *, size_t *, const char *, __gnuc_va_list) __attribute__ ((__format__ (__printf__, 3, 0)))
                                                            ;
int vasprintf (char **, const char *, __gnuc_va_list) __attribute__ ((__format__ (__printf__, 2, 0)))
                                                            ;
int vdiprintf (int, const char *, __gnuc_va_list) __attribute__ ((__format__ (__printf__, 2, 0)))
                                                            ;
int vfiprintf (FILE *, const char *, __gnuc_va_list) __attribute__ ((__format__ (__printf__, 2, 0)))
                                                            ;
int vfiscanf (FILE *, const char *, __gnuc_va_list) __attribute__ ((__format__ (__scanf__, 2, 0)))
                                                           ;
int vfscanf (FILE *__restrict, const char *__restrict, __gnuc_va_list) __attribute__ ((__format__ (__scanf__, 2, 0)))
                                                           ;
int viprintf (const char *, __gnuc_va_list) __attribute__ ((__format__ (__printf__, 1, 0)))
                                                            ;
int viscanf (const char *, __gnuc_va_list) __attribute__ ((__format__ (__scanf__, 1, 0)))
                                                           ;
int vscanf (const char *, __gnuc_va_list) __attribute__ ((__format__ (__scanf__, 1, 0)))
                                                           ;
int vsiprintf (char *, const char *, __gnuc_va_list) __attribute__ ((__format__ (__printf__, 2, 0)))
                                                            ;
int vsiscanf (const char *, const char *, __gnuc_va_list) __attribute__ ((__format__ (__scanf__, 2, 0)))
                                                           ;
int vsniprintf (char *, size_t, const char *, __gnuc_va_list) __attribute__ ((__format__ (__printf__, 3, 0)))
                                                            ;
int vsnprintf (char *__restrict, size_t, const char *__restrict, __gnuc_va_list) __attribute__ ((__format__ (__printf__, 3, 0)))
                                                            ;
int vsscanf (const char *__restrict, const char *__restrict, __gnuc_va_list) __attribute__ ((__format__ (__scanf__, 2, 0)))
                                                           ;
# 310 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdio.h" 3
FILE * fdopen (int, const char *);

int fileno (FILE *);
int getw (FILE *);
int pclose (FILE *);
FILE * popen (const char *, const char *);
int putw (int, FILE *);
void setbuffer (FILE *, char *, int);
int setlinebuf (FILE *);
int getc_unlocked (FILE *);
int getchar_unlocked (void);
void flockfile (FILE *);
int ftrylockfile (FILE *);
void funlockfile (FILE *);
int putc_unlocked (int, FILE *);
int putchar_unlocked (int);
# 335 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdio.h" 3
int dprintf (int, const char *__restrict, ...) __attribute__ ((__format__ (__printf__, 2, 3)))
                                                            ;

FILE * fmemopen (void *__restrict, size_t, const char *__restrict);


FILE * open_memstream (char **, size_t *);



int vdprintf (int, const char *__restrict, __gnuc_va_list) __attribute__ ((__format__ (__printf__, 2, 0)))
                                                            ;







int _asiprintf_r (struct _reent *, char **, const char *, ...) __attribute__ ((__format__ (__printf__, 3, 4)))
                                                            ;
char * _asniprintf_r (struct _reent *, char *, size_t *, const char *, ...) __attribute__ ((__format__ (__printf__, 4, 5)))
                                                            ;
char * _asnprintf_r (struct _reent *, char *__restrict, size_t *__restrict, const char *__restrict, ...) __attribute__ ((__format__ (__printf__, 4, 5)))
                                                            ;
int _asprintf_r (struct _reent *, char **__restrict, const char *__restrict, ...) __attribute__ ((__format__ (__printf__, 3, 4)))
                                                            ;
int _diprintf_r (struct _reent *, int, const char *, ...) __attribute__ ((__format__ (__printf__, 3, 4)))
                                                            ;
int _dprintf_r (struct _reent *, int, const char *__restrict, ...) __attribute__ ((__format__ (__printf__, 3, 4)))
                                                            ;
int _fclose_r (struct _reent *, FILE *);
int _fcloseall_r (struct _reent *);
FILE * _fdopen_r (struct _reent *, int, const char *);
int _fflush_r (struct _reent *, FILE *);
int _fgetc_r (struct _reent *, FILE *);
char * _fgets_r (struct _reent *, char *__restrict, int, FILE *__restrict);




int _fgetpos_r (struct _reent *, FILE *, fpos_t *);
int _fsetpos_r (struct _reent *, FILE *, const fpos_t *);

int _fiprintf_r (struct _reent *, FILE *, const char *, ...) __attribute__ ((__format__ (__printf__, 3, 4)))
                                                            ;
int _fiscanf_r (struct _reent *, FILE *, const char *, ...) __attribute__ ((__format__ (__scanf__, 3, 4)))
                                                           ;
FILE * _fmemopen_r (struct _reent *, void *__restrict, size_t, const char *__restrict);
FILE * _fopen_r (struct _reent *, const char *__restrict, const char *__restrict);
FILE * _freopen_r (struct _reent *, const char *__restrict, const char *__restrict, FILE *__restrict);
int _fprintf_r (struct _reent *, FILE *__restrict, const char *__restrict, ...) __attribute__ ((__format__ (__printf__, 3, 4)))
                                                            ;
int _fpurge_r (struct _reent *, FILE *);
int _fputc_r (struct _reent *, int, FILE *);
int _fputs_r (struct _reent *, const char *__restrict, FILE *__restrict);
size_t _fread_r (struct _reent *, void * __restrict, size_t _size, size_t _n, FILE *__restrict);
int _fscanf_r (struct _reent *, FILE *__restrict, const char *__restrict, ...) __attribute__ ((__format__ (__scanf__, 3, 4)))
                                                           ;
int _fseek_r (struct _reent *, FILE *, long, int);
int _fseeko_r (struct _reent *, FILE *, _off_t, int);
long _ftell_r (struct _reent *, FILE *);
_off_t _ftello_r (struct _reent *, FILE *);
void _rewind_r (struct _reent *, FILE *);
size_t _fwrite_r (struct _reent *, const void * __restrict, size_t _size, size_t _n, FILE *__restrict);
int _getc_r (struct _reent *, FILE *);
int _getc_unlocked_r (struct _reent *, FILE *);
int _getchar_r (struct _reent *);
int _getchar_unlocked_r (struct _reent *);
char * _gets_r (struct _reent *, char *);
int _iprintf_r (struct _reent *, const char *, ...) __attribute__ ((__format__ (__printf__, 2, 3)))
                                                            ;
int _iscanf_r (struct _reent *, const char *, ...) __attribute__ ((__format__ (__scanf__, 2, 3)))
                                                           ;
FILE * _open_memstream_r (struct _reent *, char **, size_t *);
void _perror_r (struct _reent *, const char *);
int _printf_r (struct _reent *, const char *__restrict, ...) __attribute__ ((__format__ (__printf__, 2, 3)))
                                                            ;
int _putc_r (struct _reent *, int, FILE *);
int _putc_unlocked_r (struct _reent *, int, FILE *);
int _putchar_unlocked_r (struct _reent *, int);
int _putchar_r (struct _reent *, int);
int _puts_r (struct _reent *, const char *);
int _remove_r (struct _reent *, const char *);
int _rename_r (struct _reent *, const char *_old, const char *_new)
                                          ;
int _scanf_r (struct _reent *, const char *__restrict, ...) __attribute__ ((__format__ (__scanf__, 2, 3)))
                                                           ;
int _siprintf_r (struct _reent *, char *, const char *, ...) __attribute__ ((__format__ (__printf__, 3, 4)))
                                                            ;
int _siscanf_r (struct _reent *, const char *, const char *, ...) __attribute__ ((__format__ (__scanf__, 3, 4)))
                                                           ;
int _sniprintf_r (struct _reent *, char *, size_t, const char *, ...) __attribute__ ((__format__ (__printf__, 4, 5)))
                                                            ;
int _snprintf_r (struct _reent *, char *__restrict, size_t, const char *__restrict, ...) __attribute__ ((__format__ (__printf__, 4, 5)))
                                                            ;
int _sprintf_r (struct _reent *, char *__restrict, const char *__restrict, ...) __attribute__ ((__format__ (__printf__, 3, 4)))
                                                            ;
int _sscanf_r (struct _reent *, const char *__restrict, const char *__restrict, ...) __attribute__ ((__format__ (__scanf__, 3, 4)))
                                                           ;
char * _tempnam_r (struct _reent *, const char *, const char *);
FILE * _tmpfile_r (struct _reent *);
char * _tmpnam_r (struct _reent *, char *);
int _ungetc_r (struct _reent *, int, FILE *);
int _vasiprintf_r (struct _reent *, char **, const char *, __gnuc_va_list) __attribute__ ((__format__ (__printf__, 3, 0)))
                                                            ;
char * _vasniprintf_r (struct _reent*, char *, size_t *, const char *, __gnuc_va_list) __attribute__ ((__format__ (__printf__, 4, 0)))
                                                            ;
char * _vasnprintf_r (struct _reent*, char *, size_t *, const char *, __gnuc_va_list) __attribute__ ((__format__ (__printf__, 4, 0)))
                                                            ;
int _vasprintf_r (struct _reent *, char **, const char *, __gnuc_va_list) __attribute__ ((__format__ (__printf__, 3, 0)))
                                                            ;
int _vdiprintf_r (struct _reent *, int, const char *, __gnuc_va_list) __attribute__ ((__format__ (__printf__, 3, 0)))
                                                            ;
int _vdprintf_r (struct _reent *, int, const char *__restrict, __gnuc_va_list) __attribute__ ((__format__ (__printf__, 3, 0)))
                                                            ;
int _vfiprintf_r (struct _reent *, FILE *, const char *, __gnuc_va_list) __attribute__ ((__format__ (__printf__, 3, 0)))
                                                            ;
int _vfiscanf_r (struct _reent *, FILE *, const char *, __gnuc_va_list) __attribute__ ((__format__ (__scanf__, 3, 0)))
                                                           ;
int _vfprintf_r (struct _reent *, FILE *__restrict, const char *__restrict, __gnuc_va_list) __attribute__ ((__format__ (__printf__, 3, 0)))
                                                            ;
int _vfscanf_r (struct _reent *, FILE *__restrict, const char *__restrict, __gnuc_va_list) __attribute__ ((__format__ (__scanf__, 3, 0)))
                                                           ;
int _viprintf_r (struct _reent *, const char *, __gnuc_va_list) __attribute__ ((__format__ (__printf__, 2, 0)))
                                                            ;
int _viscanf_r (struct _reent *, const char *, __gnuc_va_list) __attribute__ ((__format__ (__scanf__, 2, 0)))
                                                           ;
int _vprintf_r (struct _reent *, const char *__restrict, __gnuc_va_list) __attribute__ ((__format__ (__printf__, 2, 0)))
                                                            ;
int _vscanf_r (struct _reent *, const char *__restrict, __gnuc_va_list) __attribute__ ((__format__ (__scanf__, 2, 0)))
                                                           ;
int _vsiprintf_r (struct _reent *, char *, const char *, __gnuc_va_list) __attribute__ ((__format__ (__printf__, 3, 0)))
                                                            ;
int _vsiscanf_r (struct _reent *, const char *, const char *, __gnuc_va_list) __attribute__ ((__format__ (__scanf__, 3, 0)))
                                                           ;
int _vsniprintf_r (struct _reent *, char *, size_t, const char *, __gnuc_va_list) __attribute__ ((__format__ (__printf__, 4, 0)))
                                                            ;
int _vsnprintf_r (struct _reent *, char *__restrict, size_t, const char *__restrict, __gnuc_va_list) __attribute__ ((__format__ (__printf__, 4, 0)))
                                                            ;
int _vsprintf_r (struct _reent *, char *__restrict, const char *__restrict, __gnuc_va_list) __attribute__ ((__format__ (__printf__, 3, 0)))
                                                            ;
int _vsscanf_r (struct _reent *, const char *__restrict, const char *__restrict, __gnuc_va_list) __attribute__ ((__format__ (__scanf__, 3, 0)))
                                                           ;



int fpurge (FILE *);
ssize_t __getdelim (char **, size_t *, int, FILE *);
ssize_t __getline (char **, size_t *, FILE *);
# 512 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdio.h" 3
int __srget_r (struct _reent *, FILE *);
int __swbuf_r (struct _reent *, int, FILE *);
# 536 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdio.h" 3
FILE *funopen (const void * __cookie, int (*__readfn)(void * __cookie, char *__buf, int __n), int (*__writefn)(void * __cookie, const char *__buf, int __n), fpos_t (*__seekfn)(void * __cookie, fpos_t __off, int __whence), int (*__closefn)(void * __cookie))





                                   ;
FILE *_funopen_r (struct _reent *, const void * __cookie, int (*__readfn)(void * __cookie, char *__buf, int __n), int (*__writefn)(void * __cookie, const char *__buf, int __n), fpos_t (*__seekfn)(void * __cookie, fpos_t __off, int __whence), int (*__closefn)(void * __cookie))





                                   ;







/* Callback function types referenced by cookie_io_functions_t below.
   Every callback receives a void * cookie as its first argument. */
/* Read callback: cookie, destination buffer, byte count; returns ssize_t. */
typedef ssize_t cookie_read_function_t(void *__cookie, char *__buf, size_t __n);
/* Write callback: cookie, source buffer (const), byte count; returns ssize_t. */
typedef ssize_t cookie_write_function_t(void *__cookie, const char *__buf,
     size_t __n);




/* Seek callback: note the offset is passed by pointer (off_t *), not by
   value; returns int. */
typedef int cookie_seek_function_t(void *__cookie, off_t *__off, int __whence);

/* Close callback: takes only the cookie; returns int. */
typedef int cookie_close_function_t(void *__cookie);
/* Table of the four stream callbacks.  It is passed by value to
   fopencookie() and _fopencookie_r(), declared right after this type. */
typedef struct {
  cookie_read_function_t  *read;   /* pointer to the read callback */
  cookie_write_function_t *write;  /* pointer to the write callback */
  cookie_seek_function_t  *seek;   /* pointer to the seek callback */
  cookie_close_function_t *close;  /* pointer to the close callback */
} cookie_io_functions_t;
FILE *fopencookie (void *__cookie, const char *__mode, cookie_io_functions_t __functions)
                                                         ;
FILE *_fopencookie_r (struct _reent *, void *__cookie, const char *__mode, cookie_io_functions_t __functions)
                                                         ;
# 688 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdio.h" 3

# 104 "./office_ispell/src/ispell.h" 2
# 190 "./office_ispell/src/ispell.h"

# 190 "./office_ispell/src/ispell.h"
typedef unsigned char ichar_t; /* ispell's internal character type; used for the word/affix strings in the structs and prototypes below */
# 201 "./office_ispell/src/ispell.h"
/* One dictionary entry.  Entries can be chained through `next`. */
struct dent {
    /* Pointer to another entry of the same type (NULL-able link). */
    struct dent *next;
    /* Text associated with this entry. */
    char *word;
    /* Bit mask storage; the bound (32 / 32) evaluates to a single long.
       NOTE(review): presumably holds the entry's flag bits -- confirm. */
    long mask[(32 / 32)];
};
# 300 "./office_ispell/src/ispell.h"
/* One affix-table entry.
   NOTE(review): the per-member notes are inferred from the member names
   only -- confirm against the ispell sources before relying on them. */
struct flagent {
    ichar_t *strip;             /* presumably the characters stripped from the root */
    ichar_t *affix;             /* presumably the affix text that gets added */
    short flagbit;              /* presumably the flag bit that selects this entry */
    short stripl;               /* presumably the length of `strip` */
    short affl;                 /* presumably the length of `affix` */
    short numconds;             /* presumably the number of conditions in `conds` */
    short flagflags;            /* presumably modifier flags for this entry */
    char conds[128 + 100];      /* 228 bytes; the bound is spelled 128 + 100 */
};







/* Holds either a pointer to a struct flagptr or a pointer to a
   struct flagent; the two members overlay the same storage. */
union ptr_union {
    struct flagptr *fp;     /* view as pointer to struct flagptr */
    struct flagent *ent;    /* view as pointer to struct flagent */
};

/* A ptr_union paired with a count.  Arrays of this type are declared
   further down as pflagindex[] and sflagindex[]. */
struct flagptr {
    union ptr_union pu;     /* either a struct flagptr * or a struct flagent * */
    int numents;            /* presumably the number of entries behind `pu` -- confirm */
};




/* Three strings describing one string-character type.
   NOTE(review): member meanings are taken from their names -- confirm. */
struct strchartype {
    char *name;             /* name of the type */
    char *deformatter;      /* deformatter associated with the type */
    char *suffixes;         /* suffixes associated with the type */
};




/* Header record describing a hash/dictionary image; a single instance is
   declared below as `extern struct hashheader hashheader`.
   NOTE(review): member order and sizes look layout-sensitive (the record is
   bracketed by magic/magic2), so do not reorder members.  Per-member notes
   are inferred from names only and must be confirmed against ispell.h. */
struct hashheader
    {
    unsigned short magic; /* leading magic value; see magic2 at the end */
    unsigned short compileoptions; /* presumably a bit set of build options */
    short maxstringchars; /* presumably the limit that sizes the [100] arrays below -- confirm */
    short maxstringcharlen; /* presumably the limit behind the [10 + 1] bound below -- confirm */
    short compoundmin;
    short compoundbit;
    int stringsize;
    int lstringsize;
    int tblsize;
    int stblsize;
    int ptblsize;
    int sortval;
    int nstrchars;
    int nstrchartype;
    int strtypestart;
    char nrchars[5];
    char texchars[13];
    char compoundflag;
    char defhardflag;
    char flagmarker;
    unsigned short sortorder[128 + 100]; /* 228 entries, as are the six tables that follow */
    ichar_t lowerconv[128 + 100];
    ichar_t upperconv[128 + 100];
    char wordchars[128 + 100];
    char upperchars[128 + 100];
    char lowerchars[128 + 100];
    char boundarychars[128 + 100];
    char stringstarts[128]; /* only 128 entries, unlike the 228-entry tables above */
    char stringchars[100][10 + 1]; /* 100 rows of 11 chars each */
    unsigned int stringdups[100];
    int dupnos[100];
    unsigned short magic2; /* trailing magic value; pairs with magic at the start */
    };
# 413 "./office_ispell/src/ispell.h"
/* One lookup result: a dictionary entry plus the prefix and suffix table
   entries that go with it.  An array of these is declared below as hits[10]. */
struct success {
    struct dent *dictent;       /* dictionary entry for this result */
    struct flagent *prefix;     /* prefix entry for this result */
    struct flagent *suffix;     /* suffix entry for this result */
};
# 523 "./office_ispell/src/ispell.h"
extern char * BC;
extern char * cd;
extern char * cl;
extern char * cm;
extern char * ho;
extern char * nd;
extern char * so;
extern char * se;
extern int sg;
extern char * ti;
extern char * te;
extern int li;
extern int co;

extern int contextsize;
extern char contextbufs[10][
# 538 "./office_ispell/src/ispell.h" 3
                                   1024
# 538 "./office_ispell/src/ispell.h"
                                         ];
extern int contextoffset;
extern char * currentchar;
extern char ctoken[100 + 20];
extern ichar_t itoken[100 + 20];

extern char termcap[2048];
extern char termstr[2048];
extern char * termptr;

extern int numhits;
extern struct success
  hits[10];

extern char * hashstrings;
extern struct hashheader
  hashheader;
extern struct dent *
  hashtbl;
extern int hashsize;

extern char hashname[
# 559 "./office_ispell/src/ispell.h" 3
                    1024
# 559 "./office_ispell/src/ispell.h"
                              ];

extern int aflag;
extern int cflag;
extern int lflag;
extern int incfileflag;
extern int nodictflag;

extern int uerasechar;
extern int ukillchar;

extern unsigned int laststringch;
extern int defdupchar;

extern int numpflags;
extern int numsflags;
extern struct flagptr pflagindex[128 + 100];

extern struct flagent * pflaglist;
extern struct flagptr sflagindex[128 + 100];

extern struct flagent * sflaglist;

extern struct strchartype *
  chartypes;

extern FILE * infile;
extern FILE * outfile;

extern FILE * infile1;
extern FILE * outfile1;

extern char * askfilename;

extern int changes;
extern int readonly;
extern int quit;



extern char possibilities[100][100 + 20];

extern int pcount;
extern int maxposslen;
extern int easypossibilities;






extern int Trynum;
extern ichar_t Try[128 + 100];
# 625 "./office_ispell/src/ispell.h"
extern int minimenusize;




extern int eflag;
extern int dumpflag;
extern int fflag;

extern int sflag;

extern int vflag;
extern int xflag;
extern int deftflag;
extern int tflag;
extern int prefstringchar;

extern int terse;

extern char tempfile[
# 644 "./office_ispell/src/ispell.h" 3
1024
# 644 "./office_ispell/src/ispell.h"
];

extern int minword;
extern int sortit;
extern int compoundflag;
extern int tryhardflag;

extern char * currentfile;


extern int math_mode;






extern char LaTeX_Mode;
# 136 "./office_ispell/src/correct.c" 2
# 1 "./office_ispell/src/proto.h" 1
# 88 "./office_ispell/src/proto.h"
extern int addvheader (struct dent * ent);
extern void askmode (void);
extern void backup (void);

extern int cap_ok (ichar_t * word, struct success * hit, int len);

extern int casecmp (char * a, char * b, int canonical);
extern void chupcase (char * s);
extern void checkfile (void);
extern void checkline (FILE * ofile);
extern void chk_aff (ichar_t * word, ichar_t * ucword, int len, int ignoreflagbits, int allhits, int pfxopts, int sfxopts)
                                                               ;
extern int combinecaps (struct dent * hdr, struct dent * newent);
extern int compoundgood (ichar_t * word, int pfxopts);
extern void copyout (char ** cc, int cnt);
extern void correct (char * ctok, int ctokl, ichar_t * itok, int itokl, char ** curchar)
                     ;
extern char * do_regex_lookup (char * expr, int whence);
extern void done (int);
extern void dumpmode (void);
extern void erase (void);
extern int expand_pre (char * croot, ichar_t * rootword, long mask[], int option, char *extra)
                                              ;
extern int expand_suf (char * croot, ichar_t * rootword, long mask[], int crossonly, int option, char * extra)
                                                              ;
extern int findfiletype (char * name, int searchnames, int * deformatter)
                       ;
extern void flagpr (ichar_t * word, int preflag, int prestrip, int preadd, int sufflag, int sufadd)
                                         ;
extern void givehelp (int interactive);
extern int good (ichar_t * word, int ignoreflagbits, int allhits, int pfxopts, int sfxopts)
                              ;
extern int hash (ichar_t * word, int hashtablesize);






extern int ichartostr (char * out, ichar_t * in, int outlen, int canonical)
                   ;
extern char * ichartosstr (ichar_t * in, int canonical);
extern int ins_root_cap (ichar_t * word, ichar_t * pattern, int prestrip, int preadd, int sufstrip, int sufadd, struct dent * firstdent, struct flagent * pfxent, struct flagent * sufent)


                             ;
extern void inverse (void);
extern int linit (void);
extern struct dent * lookup (ichar_t * word, int dotree);
extern void lowcase (ichar_t * string);
extern int makedent (char * lbuf, int lbuflen, struct dent * d);
extern void makepossibilities (ichar_t * word);
extern void move (int row, int col);
extern void normal (void);
extern char * printichar (int in);





extern int shellescape (char * buf);


extern char * skipoverword (char * bufp);
extern void stop (void);
extern int stringcharlen (char * bufp, int canonical);
extern int strtoichar (ichar_t * out, char * in, int outlen, int canonical)
                   ;
extern ichar_t * strtosichar (char * in, int canonical);
extern void terminit (void);
extern void toutent (FILE * outfile, struct dent * hent, int onlykeep)
                  ;
extern void treeinit (char * persdict, char * LibDict);
extern void treeinsert (char * word, int wordlen, int keep);
extern struct dent * treelookup (ichar_t * word);
extern void treeoutput (void);
extern void upcase (ichar_t * string);

extern long whatcap (ichar_t * word);

extern char * xgets (char * string, int size, FILE * stream);
extern void yyinit (void);
extern int yyopen (char * file);
extern int yyparse (void);

extern void myfree (void * area);
extern void * mymalloc (unsigned int);
extern void * myrealloc (void * area, unsigned int size, unsigned int oldsize)
                          ;
# 266 "./office_ispell/src/proto.h"
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdlib.h" 1 3
# 10 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdlib.h" 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/ieeefp.h" 1 3
# 11 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdlib.h" 2 3





# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include/stddef.h" 1 3 4
# 17 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdlib.h" 2 3


# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/machine/stdlib.h" 1 3
# 20 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdlib.h" 2 3

# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/alloca.h" 1 3
# 22 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdlib.h" 2 3









# 30 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdlib.h" 3
/* Quotient/remainder pair of ints; the return type of div(). */
typedef struct {
  int quot;   /* quotient */
  int rem;    /* remainder */
} div_t;

/* Quotient/remainder pair of longs; the return type of ldiv(). */
typedef struct {
  long quot;  /* quotient */
  long rem;   /* remainder */
} ldiv_t;




/* Quotient/remainder pair of long longs; the return type of lldiv(). */
typedef struct {
  long long int quot;   /* quotient */
  long long int rem;    /* remainder */
} lldiv_t;




/* Pointer to a two-argument comparison function returning int; this is the
   comparator parameter type of the bsearch() and qsort() prototypes below. */
typedef int (*__compar_fn_t)(const void *lhs, const void *rhs);







int __locale_mb_cur_max (void);



void abort (void) __attribute__ ((noreturn));
int abs (int);
int atexit (void (*__func)(void));
double atof (const char *__nptr);

float atoff (const char *__nptr);

int atoi (const char *__nptr);
int _atoi_r (struct _reent *, const char *__nptr);
long atol (const char *__nptr);
long _atol_r (struct _reent *, const char *__nptr);
void * bsearch (const void * __key, const void * __base, size_t __nmemb, size_t __size, __compar_fn_t _compar)



                                ;
void * calloc (size_t __nmemb, size_t __size) ;
div_t div (int __numer, int __denom);
void exit (int __status) __attribute__ ((noreturn));
void free (void *) ;
char * getenv (const char *__string);
char * _getenv_r (struct _reent *, const char *__string);
char * _findenv (const char *, int *);
char * _findenv_r (struct _reent *, const char *, int *);

extern char *suboptarg;
int getsubopt (char **, char * const *, char **);

long labs (long);
ldiv_t ldiv (long __numer, long __denom);
void * malloc (size_t __size) ;
int mblen (const char *, size_t);
int _mblen_r (struct _reent *, const char *, size_t, _mbstate_t *);
int mbtowc (wchar_t *__restrict, const char *__restrict, size_t);
int _mbtowc_r (struct _reent *, wchar_t *__restrict, const char *__restrict, size_t, _mbstate_t *);
int wctomb (char *, wchar_t);
int _wctomb_r (struct _reent *, char *, wchar_t, _mbstate_t *);
size_t mbstowcs (wchar_t *__restrict, const char *__restrict, size_t);
size_t _mbstowcs_r (struct _reent *, wchar_t *__restrict, const char *__restrict, size_t, _mbstate_t *);
size_t wcstombs (char *__restrict, const wchar_t *__restrict, size_t);
size_t _wcstombs_r (struct _reent *, char *__restrict, const wchar_t *__restrict, size_t, _mbstate_t *);


char * mkdtemp (char *);
int mkostemp (char *, int);
int mkostemps (char *, int, int);
int mkstemp (char *);
int mkstemps (char *, int);
char * mktemp (char *) __attribute__ ((__warning__ ("the use of `mktemp' is dangerous; use `mkstemp' instead")));

char * _mkdtemp_r (struct _reent *, char *);
int _mkostemp_r (struct _reent *, char *, int);
int _mkostemps_r (struct _reent *, char *, int, int);
int _mkstemp_r (struct _reent *, char *);
int _mkstemps_r (struct _reent *, char *, int);
char * _mktemp_r (struct _reent *, char *) __attribute__ ((__warning__ ("the use of `mktemp' is dangerous; use `mkstemp' instead")));

void qsort (void * __base, size_t __nmemb, size_t __size, __compar_fn_t _compar);
int rand (void);
void * realloc (void * __r, size_t __size) ;

void * reallocf (void * __r, size_t __size);
char * realpath (const char *__restrict path, char *__restrict resolved_path);

void srand (unsigned __seed);
double strtod (const char *__restrict __n, char **__restrict __end_PTR);
double _strtod_r (struct _reent *,const char *__restrict __n, char **__restrict __end_PTR);

float strtof (const char *__restrict __n, char **__restrict __end_PTR);







long strtol (const char *__restrict __n, char **__restrict __end_PTR, int __base);
long _strtol_r (struct _reent *,const char *__restrict __n, char **__restrict __end_PTR, int __base);
unsigned long strtoul (const char *__restrict __n, char **__restrict __end_PTR, int __base);
unsigned long _strtoul_r (struct _reent *,const char *__restrict __n, char **__restrict __end_PTR, int __base);

int system (const char *__string);


long a64l (const char *__input);
char * l64a (long __input);
char * _l64a_r (struct _reent *,long __input);
int on_exit (void (*__func)(int, void *),void * __arg);
void _Exit (int __status) __attribute__ ((noreturn));
int putenv (char *__string);
int _putenv_r (struct _reent *, char *__string);
void * _reallocf_r (struct _reent *, void *, size_t);
int setenv (const char *__string, const char *__value, int __overwrite);
int _setenv_r (struct _reent *, const char *__string, const char *__value, int __overwrite);

char * gcvt (double,int,char *);
char * gcvtf (float,int,char *);
char * fcvt (double,int,int *,int *);
char * fcvtf (float,int,int *,int *);
char * ecvt (double,int,int *,int *);
char * ecvtbuf (double, int, int*, int*, char *);
char * fcvtbuf (double, int, int*, int*, char *);
char * ecvtf (float,int,int *,int *);
char * dtoa (double, int, int, int *, int*, char**);
int rand_r (unsigned *__seed);

double drand48 (void);
double _drand48_r (struct _reent *);
double erand48 (unsigned short [3]);
double _erand48_r (struct _reent *, unsigned short [3]);
long jrand48 (unsigned short [3]);
long _jrand48_r (struct _reent *, unsigned short [3]);
void lcong48 (unsigned short [7]);
void _lcong48_r (struct _reent *, unsigned short [7]);
long lrand48 (void);
long _lrand48_r (struct _reent *);
long mrand48 (void);
long _mrand48_r (struct _reent *);
long nrand48 (unsigned short [3]);
long _nrand48_r (struct _reent *, unsigned short [3]);
unsigned short *
       seed48 (unsigned short [3]);
unsigned short *
       _seed48_r (struct _reent *, unsigned short [3]);
void srand48 (long);
void _srand48_r (struct _reent *, long);
long long atoll (const char *__nptr);
long long _atoll_r (struct _reent *, const char *__nptr);
long long llabs (long long);
lldiv_t lldiv (long long __numer, long long __denom);


long long strtoll (const char *__restrict __n, char **__restrict __end_PTR, int __base);


long long _strtoll_r (struct _reent *, const char *__restrict __n, char **__restrict __end_PTR, int __base);


unsigned long long strtoull (const char *__restrict __n, char **__restrict __end_PTR, int __base);


unsigned long long _strtoull_r (struct _reent *, const char *__restrict __n, char **__restrict __end_PTR, int __base);


void cfree (void *);
int unsetenv (const char *__string);
int _unsetenv_r (struct _reent *, const char *__string);
# 221 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/stdlib.h" 3
char * _dtoa_r (struct _reent *, double, int, int, int *, int*, char**);

void * _malloc_r (struct _reent *, size_t) ;
void * _calloc_r (struct _reent *, size_t, size_t) ;
void _free_r (struct _reent *, void *) ;
void * _realloc_r (struct _reent *, void *, size_t) ;
void _mstats_r (struct _reent *, char *);

int _system_r (struct _reent *, const char *);

void __eprintf (const char *, const char *, unsigned int, const char *);




extern long double strtold (const char *__restrict, char **__restrict);




# 267 "./office_ispell/src/proto.h" 2
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/string.h" 1 3
# 12 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/string.h" 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/cdefs.h" 1 3
# 45 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/cdefs.h" 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include/stddef.h" 1 3 4
# 46 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/cdefs.h" 2 3
# 13 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/string.h" 2 3




# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/lib/gcc/microblaze-xilinx-elf/6.0.0/include/stddef.h" 1 3 4
# 18 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/string.h" 2 3



/*
 * Declarations expanded from the microblaze-xilinx-elf <string.h>:
 * memory and string routines, followed by additional helpers (bcmp, index,
 * strdup, strlcpy, ...) and variants that take a struct _reent * argument.
 */
void * memchr (const void *, int, size_t);
int memcmp (const void *, const void *, size_t);
void * memcpy (void * restrict, const void * restrict, size_t);
void * memmove (void *, const void *, size_t);
void * memset (void *, int, size_t);
char *strcat (char *restrict, const char *restrict);
char *strchr (const char *, int);
int strcmp (const char *, const char *);
int strcoll (const char *, const char *);
char *strcpy (char *restrict, const char *restrict);
size_t strcspn (const char *, const char *);
char *strerror (int);
size_t strlen (const char *);
char *strncat (char *restrict, const char *restrict, size_t);
int strncmp (const char *, const char *, size_t);
char *strncpy (char *restrict, const char *restrict, size_t);
char *strpbrk (const char *, const char *);
char *strrchr (const char *, int);
size_t strspn (const char *, const char *);
char *strstr (const char *, const char *);


char *strtok (char *restrict, const char *restrict);


size_t strxfrm (char *restrict, const char *restrict, size_t);


char *strtok_r (char *restrict, const char *restrict, char **restrict);

int bcmp (const void *, const void *, size_t);
void bcopy (const void *, void *, size_t);
void bzero (void *, size_t);
int ffs (int);
char *index (const char *, int);
void * memccpy (void * restrict, const void * restrict, int, size_t);
void * mempcpy (void *, const void *, size_t);
void * memmem (const void *, size_t, const void *, size_t);
void * memrchr (const void *, int, size_t);
void * rawmemchr (const void *, int);
char *rindex (const char *, int);
char *stpcpy (char *restrict, const char *restrict);
char *stpncpy (char *restrict, const char *restrict, size_t);
int strcasecmp (const char *, const char *);
char *strcasestr (const char *, const char *);
char *strchrnul (const char *, int);


char *strdup (const char *);


char *_strdup_r (struct _reent *, const char *);


char *strndup (const char *, size_t);


char *_strndup_r (struct _reent *, const char *, size_t);
# 87 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/string.h" 3
/* The __asm__ label makes calls to strerror_r bind to the symbol
   __xpg_strerror_r. */
int strerror_r (int, char *, size_t) __asm__ ("" "__xpg_strerror_r");





size_t strlcat (char *, const char *, size_t);
size_t strlcpy (char *, const char *, size_t);
int strncasecmp (const char *, const char *, size_t);
size_t strnlen (const char *, size_t);
char *strsep (char **, const char *);
char *strlwr (char *);
char *strupr (char *);

char *strsignal (int __signo);






char * _strerror_r (struct _reent *, int, int, int *);
# 140 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/string.h" 3
# 1 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/sys/string.h" 1 3
# 141 "/proj/esdt_sdk/aagarwa/rdi_build/GCC_4_9/output/gnu/microblaze/lin/microblaze-xilinx-elf/include/string.h" 2 3


# 268 "./office_ispell/src/proto.h" 2




# 271 "./office_ispell/src/proto.h"
extern char * index ();
extern char * rindex ();
# 283 "./office_ispell/src/proto.h"
extern int tgetent ();
extern int tgetnum ();
extern char * tgetstr ();
extern char * tgoto ();
extern char * tputs ();
# 137 "./office_ispell/src/correct.c" 2
# 1 "./office_ispell/src/msgs.h" 1
# 138 "./office_ispell/src/correct.c" 2
# 1 "./office_ispell/src/version.h" 1
# 10 "./office_ispell/src/version.h"
/*
 * Version banner and license text, one string per line, ending with a null
 * pointer so the table can be walked without a separate length.
 * Every string starts with "@(#)" (presumably so that what(1)-style tools
 * can locate the text in a binary -- TODO confirm).
 */
static char * Version_ID[] = {
    "@(#) International Ispell Version 3.1.20 10/10/95",
    "@(#) Copyright (c), 1983, by Pace Willisson",
    "@(#) International version Copyright (c) 1987, 1988, 1990-1995,",
    "@(#) by Geoff Kuenning, Granada Hills, CA.  All rights reserved.",
    "@(#)",
    "@(#) Redistribution and use in source and binary forms, with or without",
    "@(#) modification, are permitted provided that the following conditions",
    "@(#) are met:",
    "@(#)",
    "@(#) 1. Redistributions of source code must retain the above copyright",
    "@(#)    notice, this list of conditions and the following disclaimer.",
    "@(#) 2. Redistributions in binary form must reproduce the above",
    "@(#)    copyright notice, this list of conditions and the following",
    "@(#)    disclaimer in the documentation and/or other materials provided",
    "@(#)    with the distribution.",
    "@(#) 3. All modifications to the source code must be clearly marked as",
    "@(#)    such.  Binary redistributions based on modified source code",
    "@(#)    must be clearly marked as modified versions in the documentation",
    "@(#)    and/or other materials provided with the distribution.",
    "@(#) 4. All advertising materials mentioning features or use of this",
    "@(#)    software must display the following acknowledgment:",
    "@(#)      This product includes software developed by Geoff Kuenning and",
    "@(#)      other unpaid contributors.",
    "@(#) 5. The name of Geoff Kuenning may not be used to endorse or promote",
    "@(#)    products derived from this software without specific prior",
    "@(#)    written permission.",
    "@(#)",
    "@(#) THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS",
    "@(#) IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT",
    "@(#) LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS",
    "@(#) FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL GEOFF",
    "@(#) KUENNING OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,",
    "@(#) INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES",
    "@(#) (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR",
    "@(#) SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)",
    "@(#) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,",
    "@(#) STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)",
    "@(#) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED",
    "@(#) OF THE POSSIBILITY OF SUCH DAMAGE.",
    
# 50 "./office_ispell/src/version.h" 3 4
   ((void *)0)

# 51 "./office_ispell/src/version.h"
};

/* Revision-control "$Id$" keyword string recording the path and revision of
   version.h. */
static char RCS_Version_ID[] =
    "$Id: //IP3/DEV/hw/microblaze/microblaze_v9_4/MB_ISA/testCases/suites/mibench_v1.3/office_ispell/src/version.h#1 $";
# 139 "./office_ispell/src/correct.c" 2

/*
 * Prototypes for the functions of correct.c.  The ones declared static are
 * private to this translation unit; the others have external linkage.
 * A parameter written as ichar_t savearea[10][100 + 20] is, as for any array
 * parameter, a pointer to rows of 120 ichar_t.
 */
void givehelp (int interactive);
void checkfile (void);
void correct (char * ctok, int ctokl, ichar_t * itok, int itokl, char ** curchar)
                     ;
static void show_line (char * line, char * invstart, int invlen);
static int show_char (char ** cp, int linew, int output, int maxw);
static int line_size (char * buf, char * bufend);
static void inserttoken (char * buf, char * start, char * tok, char ** curchar)
                     ;
static int posscmp (char * a, char * b);
int casecmp (char * a, char * b, int canonical);
void makepossibilities (ichar_t * word);
static int insert (ichar_t * word);

static void wrongcapital (ichar_t * word);

static void wrongletter (ichar_t * word);
static void extraletter (ichar_t * word);
static void missingletter (ichar_t * word);
static void missingspace (ichar_t * word);
int compoundgood (ichar_t * word, int pfxopts);
static void transposedletter (ichar_t * word);
static void tryveryhard (ichar_t * word);
static int ins_cap (ichar_t * word, ichar_t * pattern);
static int save_cap (ichar_t * word, ichar_t * pattern, ichar_t savearea[10][100 + 20])
                                                            ;
int ins_root_cap (ichar_t * word, ichar_t * pattern, int prestrip, int preadd, int sufstrip, int sufadd, struct dent * firstdent, struct flagent * pfxent, struct flagent * sufent)


                             ;
static void save_root_cap (ichar_t * word, ichar_t * pattern, int prestrip, int preadd, int sufstrip, int sufadd, struct dent * firstdent, struct flagent * pfxent, struct flagent * sufent, ichar_t savearea[10][100 + 20], int * nsaved)




                  ;
/* NOTE(review): getline is also the name of a POSIX <stdio.h> function with a
   different signature; this declaration only works where the target's
   <stdio.h> does not declare it -- confirm before building elsewhere. */
static char * getline (char * buf);
void askmode (void);
void copyout (char ** cc, int cnt);
static void lookharder (char * string);




/*
 * Print the command summary.
 *
 * interactive  nonzero: erase the screen, write the text to outfile1, then
 *              wait until a space is read from standard input (or standard
 *              input reaches end-of-file / fails);
 *              zero: write the text to standard error and return at once.
 */
void givehelp (interactive)
    int interactive;
    {
    register FILE *helpout;
    int ch;

    if (interactive)
        {
        erase ();
        helpout = outfile1;
        }
    else
        helpout = 
# 198 "./office_ispell/src/correct.c" 3
          (_impure_ptr->_stderr)
# 198 "./office_ispell/src/correct.c"
                ;

    (void) fprintf (helpout, "Whenever a word is found that is not in the dictionary,\r\n");
    (void) fprintf (helpout, "it is printed on the first line of the screen.  If the dictionary\r\n");
    (void) fprintf (helpout, "contains any similar words, they are listed with a number\r\n");
    (void) fprintf (helpout, "next to each one.  You have the option of replacing the word\r\n");
    (void) fprintf (helpout, "completely, or choosing one of the suggested words.\r\n");
    /* The next four format strings are empty in this build, so these calls
       print nothing. */
    (void) fprintf (helpout, "");
    (void) fprintf (helpout, "");
    (void) fprintf (helpout, "");
    (void) fprintf (helpout, "");

    (void) fprintf (helpout, "\r\nCommands are:\r\n\r\n");

    (void) fprintf (helpout, "R       Replace the misspelled word completely.\r\n");
    (void) fprintf (helpout, "Space   Accept the word this time only.\r\n");
    (void) fprintf (helpout, "A       Accept the word for the rest of this session.\r\n");
    (void) fprintf (helpout, "I       Accept the word, and put it in your private dictionary.\r\n");
    (void) fprintf (helpout, "U       Accept and add lowercase version to private dictionary.\r\n");
    (void) fprintf (helpout, "0-n     Replace with one of the suggested words.\r\n");
    (void) fprintf (helpout, "L       Look up words in system dictionary.\r\n");
    (void) fprintf (helpout, "X       Write the rest of this file, ignoring misspellings,\r\n        and start next file.\r\n");
    (void) fprintf (helpout, "Q       Quit immediately.  Asks for confirmation.\r\n        Leaves file unchanged.\r\n");
    (void) fprintf (helpout, "!       Shell escape.\r\n");
    (void) fprintf (helpout, "^L      Redraw screen.\r\n");
    (void) fprintf (helpout, "^Z      Suspend program.\r\n");
    (void) fprintf (helpout, "?       Show this help screen.\r\n");

    if (interactive)
        {
        (void) fprintf (helpout, "\r\n\r\n");
        (void) fprintf (helpout, "-- Type space to continue --");
        (void) fflush (helpout);

        /*
         * Wait for a space.  The read also yields (-1) at end-of-file or on
         * a read error, and keeps yielding it on every later call, so the
         * loop must stop on (-1) too; testing for ' ' alone would spin
         * forever once standard input is exhausted.
         */
        do
            {
            ch = 
# 236 "./office_ispell/src/correct.c" 3
       (--((_impure_ptr->_stdin))->_r < 0 ? __srget_r(_impure_ptr, (_impure_ptr->_stdin)) : (int)(*((_impure_ptr->_stdin))->_p++)) 
# 236 "./office_ispell/src/correct.c"
                       ;
            }
        while (ch != ' ' && ch != (-1));
        }
    }

void checkfile ()
    {
    int bufno;
    int bufsize;
    int ch;

    for (bufno = 0; bufno < contextsize; bufno++)
 contextbufs[bufno][0] = '\0';

    for ( ; ; )
 {
 for (bufno = contextsize; --bufno > 0; )
     (void) strcpy (contextbufs[bufno],
       contextbufs[bufno - 1]);
 if (quit)
     {
     while (fgets (contextbufs[0],
       sizeof contextbufs[0], infile) != 
# 259 "./office_ispell/src/correct.c" 3 4
                                        ((void *)0)
# 259 "./office_ispell/src/correct.c"
                                            )
  (void) fputs (contextbufs[0], outfile);
     break;
     }




 if (fgets (contextbufs[0], (sizeof contextbufs[0]) / 2, infile)
   == 
# 268 "./office_ispell/src/correct.c" 3 4
     ((void *)0)
# 268 "./office_ispell/src/correct.c"
         )
     break;







 bufsize = strlen (contextbufs[0]);
 if (bufsize == (sizeof contextbufs[0]) / 2 - 1)
     {
     ch = (unsigned char) contextbufs[0][bufsize - 1];
     while (bufsize < sizeof contextbufs[0] - 1
       && ((hashheader.wordchars[((ichar_t) ch)]) || (hashheader.boundarychars[((ichar_t) ch)])
       || (hashheader.stringstarts[(unsigned char) (ch)])))
  {
  ch = 
# 285 "./office_ispell/src/correct.c" 3
      (--(
# 285 "./office_ispell/src/correct.c"
      infile
# 285 "./office_ispell/src/correct.c" 3
      )->_r < 0 ? __srget_r(_impure_ptr, 
# 285 "./office_ispell/src/correct.c"
      infile
# 285 "./office_ispell/src/correct.c" 3
      ) : (int)(*(
# 285 "./office_ispell/src/correct.c"
      infile
# 285 "./office_ispell/src/correct.c" 3
      )->_p++))
# 285 "./office_ispell/src/correct.c"
                   ;
  if (ch == 
# 286 "./office_ispell/src/correct.c" 3
           (-1)
# 286 "./office_ispell/src/correct.c"
              )
      break;
  contextbufs[0][bufsize++] = (char) ch;
  contextbufs[0][bufsize] = '\0';
  }
     }
 checkline (outfile);
 }
    }

void correct (ctok, ctokl, itok, itokl, curchar)
    char * ctok;
    int ctokl;
    ichar_t * itok;
    int itokl;
    char ** curchar;
    {
    register int c;
    register int i;
    int col_ht;
    int ncols;
    char * start_l2;
    char * begintoken;

    begintoken = *curchar - strlen (ctok);

    if (strlen ((char *) (itok)) <= minword)
 return;

checkagain:
    if (good (itok, 0, 0, 0, 0) || compoundgood (itok, 0))
 return;

    erase ();
    (void) printf ("    %s", ctok);
    if (currentfile)
 (void) printf ("              File: %s", currentfile);
    if (readonly)
 (void) printf (" %s", "[READONLY]");
    (void) printf ("\r\n\r\n");

    makepossibilities (itok);
# 338 "./office_ispell/src/correct.c"
    col_ht = li - contextsize - 4 - minimenusize;
    ncols = co / (maxposslen + 8);
    if (pcount > ncols * col_ht)
 pcount = ncols * col_ht;
# 350 "./office_ispell/src/correct.c"
    for (i = 0; i < pcount; i++)
 {



 move (3 + contextsize + (i % col_ht), (maxposslen + 8) * (i / col_ht));

 if (i >= easypossibilities)
     (void) printf ("??: %s", possibilities[i]);
 else if (easypossibilities >= 10 && i < 10)
     (void) printf ("0%d: %s", i, possibilities[i]);
 else
     (void) printf ("%2d: %s", i, possibilities[i]);
 }




    move (2, 0);

    for (i = contextsize; --i > 0; )
 show_line (contextbufs[i], contextbufs[i], 0);

    start_l2 = contextbufs[0];
    if (line_size (contextbufs[0], *curchar) > co - (sg << 1) - 1)
 {
 start_l2 = begintoken - (co / 2);
 while (start_l2 < begintoken)
     {
     i = line_size (start_l2, *curchar) + 1;
     if (i + (sg << 1) <= co)
  break;
     start_l2 += i - co;
     }
 if (start_l2 > begintoken)
     start_l2 = begintoken;
 if (start_l2 < contextbufs[0])
     start_l2 = contextbufs[0];
 }
    show_line (start_l2, begintoken, (int) strlen (ctok));

    if (minimenusize != 0)
 {
 move (li - 2, 0);
 (void) printf ("[SP] <number> R)epl A)ccept I)nsert L)ookup U)ncap Q)uit e(X)it or ? for help\r\n");
 }

    for ( ; ; )
 {
 (void) fflush (outfile1);
 switch (c = (
# 400 "./office_ispell/src/correct.c" 3
             (--((_impure_ptr->_stdin))->_r < 0 ? __srget_r(_impure_ptr, (_impure_ptr->_stdin)) : (int)(*((_impure_ptr->_stdin))->_p++)) 
# 400 "./office_ispell/src/correct.c"
                             & 0x7f))
     {
     case 'Z' & 037:
  stop ();
  erase ();
  goto checkagain;
     case ' ':
  erase ();
  (void) fflush (outfile1);
  return;
     case 'q': case 'Q':
  if (changes)
      {
      (void) printf ("Are you sure you want to throw away your changes? ");
      (void) fflush (outfile1);
      c = (
# 415 "./office_ispell/src/correct.c" 3
          (--((_impure_ptr->_stdin))->_r < 0 ? __srget_r(_impure_ptr, (_impure_ptr->_stdin)) : (int)(*((_impure_ptr->_stdin))->_p++)) 
# 415 "./office_ispell/src/correct.c"
                          & 0x7f);
      }
  else
      c = 'y';
  if (c == 'y' || c == 'Y')
      {
      erase ();
      (void) fflush (outfile1);
      done (0);
      }
  goto checkagain;
     case 'i': case 'I':
  treeinsert (ichartosstr (strtosichar (ctok, 0), 1),
   (100 + 4 * 20 + 4), 1);
  erase ();
  (void) fflush (outfile1);
  changes = 1;
  return;
     case 'u': case 'U':
  itok = strtosichar (ctok, 0);
  lowcase (itok);
  treeinsert (ichartosstr (itok, 1), (100 + 4 * 20 + 4), 1);
  erase ();
  (void) fflush (outfile1);
  changes = 1;
  return;
     case 'a': case 'A':
  treeinsert (ichartosstr (strtosichar (ctok, 0), 1),
    (100 + 4 * 20 + 4), 0);
  erase ();
  (void) fflush (outfile1);
  return;
     case 'L' & 037:
  goto checkagain;
     case '?':
  givehelp (1);
  goto checkagain;
     case '!':
  {
  char buf[200];

  move (li - 1, 0);
  (void) 
# 457 "./office_ispell/src/correct.c" 3
        (--((_impure_ptr->_stdout))->_w < 0 ? ((_impure_ptr->_stdout))->_w >= ((_impure_ptr->_stdout))->_lbfsize ? (*((_impure_ptr->_stdout))->_p = (
# 457 "./office_ispell/src/correct.c"
        '!'
# 457 "./office_ispell/src/correct.c" 3
        )), *((_impure_ptr->_stdout))->_p != '\n' ? (int)*((_impure_ptr->_stdout))->_p++ : __swbuf_r(_impure_ptr, '\n', (_impure_ptr->_stdout)) : __swbuf_r(_impure_ptr, (int)(
# 457 "./office_ispell/src/correct.c"
        '!'
# 457 "./office_ispell/src/correct.c" 3
        ), (_impure_ptr->_stdout)) : (*((_impure_ptr->_stdout))->_p = (
# 457 "./office_ispell/src/correct.c"
        '!'
# 457 "./office_ispell/src/correct.c" 3
        ), (int)*((_impure_ptr->_stdout))->_p++))
# 457 "./office_ispell/src/correct.c"
                     ;
  if (getline (buf) == 
# 458 "./office_ispell/src/correct.c" 3 4
                      ((void *)0)
# 458 "./office_ispell/src/correct.c"
                          )
      {
      (void) 
# 460 "./office_ispell/src/correct.c" 3
            (--((_impure_ptr->_stdout))->_w < 0 ? ((_impure_ptr->_stdout))->_w >= ((_impure_ptr->_stdout))->_lbfsize ? (*((_impure_ptr->_stdout))->_p = (
# 460 "./office_ispell/src/correct.c"
            7
# 460 "./office_ispell/src/correct.c" 3
            )), *((_impure_ptr->_stdout))->_p != '\n' ? (int)*((_impure_ptr->_stdout))->_p++ : __swbuf_r(_impure_ptr, '\n', (_impure_ptr->_stdout)) : __swbuf_r(_impure_ptr, (int)(
# 460 "./office_ispell/src/correct.c"
            7
# 460 "./office_ispell/src/correct.c" 3
            ), (_impure_ptr->_stdout)) : (*((_impure_ptr->_stdout))->_p = (
# 460 "./office_ispell/src/correct.c"
            7
# 460 "./office_ispell/src/correct.c" 3
            ), (int)*((_impure_ptr->_stdout))->_p++))
# 460 "./office_ispell/src/correct.c"
                       ;
      erase ();
      (void) fflush (outfile1);
      goto checkagain;
      }
  (void) printf ("\r\n");
  (void) fflush (outfile1);



  (void) shellescape (buf);

  erase ();
  goto checkagain;
  }
     case 'r': case 'R':
  move (li - 1, 0);
  if (readonly)
      {
      (void) 
# 479 "./office_ispell/src/correct.c" 3
            (--((_impure_ptr->_stdout))->_w < 0 ? ((_impure_ptr->_stdout))->_w >= ((_impure_ptr->_stdout))->_lbfsize ? (*((_impure_ptr->_stdout))->_p = (
# 479 "./office_ispell/src/correct.c"
            7
# 479 "./office_ispell/src/correct.c" 3
            )), *((_impure_ptr->_stdout))->_p != '\n' ? (int)*((_impure_ptr->_stdout))->_p++ : __swbuf_r(_impure_ptr, '\n', (_impure_ptr->_stdout)) : __swbuf_r(_impure_ptr, (int)(
# 479 "./office_ispell/src/correct.c"
            7
# 479 "./office_ispell/src/correct.c" 3
            ), (_impure_ptr->_stdout)) : (*((_impure_ptr->_stdout))->_p = (
# 479 "./office_ispell/src/correct.c"
            7
# 479 "./office_ispell/src/correct.c" 3
            ), (int)*((_impure_ptr->_stdout))->_p++))
# 479 "./office_ispell/src/correct.c"
                       ;
      (void) printf ("%s ", "[READONLY]");
      }
  (void) printf ("Replace with: ");
  if (getline (ctok) == 
# 483 "./office_ispell/src/correct.c" 3 4
                       ((void *)0)
# 483 "./office_ispell/src/correct.c"
                           )