public inbox for gcc-patches@gcc.gnu.org
* [Patch] Switch elimination pass for PR 54742
@ 2014-08-19 20:40 Steve Ellcey
  2014-08-20 17:04 ` James Greenhalgh
  2014-08-21  8:58 ` Richard Biener
  0 siblings, 2 replies; 54+ messages in thread
From: Steve Ellcey @ 2014-08-19 20:40 UTC (permalink / raw)
  To: GCC Patches, Jeff Law, Richard Biener, Sebastian Pop

[-- Attachment #1: Type: text/plain, Size: 1422 bytes --]

Here is an official submission for the switch optimization described in
PR 54742.  I have addressed the formatting/comment issues that were raised,
added a test case based on comment #27 from PR 54742, and fixed a bug I
found while benchmarking with SPEC 2006 (the perl benchmark was generating
an ICE in a routine with multiple switch statements).

I ran benchmarks to see if I could find any more tests that are helped
the way CoreMark is, and while I found a number of benchmarks in
SPEC 2006 and EEMBC where the optimization is triggered, it generally
didn't affect their performance.  The biggest impact I could find was on
the perl benchmark in SPEC, where I saw around a 0.4% improvement on a
MIPS 74K.  Not huge, but not nothing.
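For anyone who wants to try reproducing measurements on a patched tree, an
invocation along these lines should exercise the pass.  This is a sketch
that assumes the patch below is applied: the -ftree-switch-shortcut flag,
the two --param knobs, and their defaults (50 paths / 100 insns) all come
from the patch itself, and the dump flag is the one GCC derives from the
pass name "switch_shortcut".

```shell
# Sketch only: requires a gcc built with this patch applied.
# -ftree-switch-shortcut is enabled by default at -O3 and above; at -O2
# it has to be named explicitly.  The params shown are the patch's
# default values.
gcc -O2 -ftree-switch-shortcut \
    --param max-switch-paths=50 \
    --param max-switch-insns=100 \
    -fdump-tree-switch_shortcut-details \
    -S pr54742.c
```

Raising the two params lets the pass duplicate more (and larger) paths, at
the cost of code size.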

So, OK to checkin?

Steve Ellcey
sellcey@mips.com


2014-08-12  Steve Ellcey  <sellcey@mips.com>

	PR tree-opt/54742
	* Makefile.in (OBJS): Add tree-switch-shortcut.o.
	* common.opt (ftree-switch-shortcut): New.
	* opts.c (default_options_table): Add OPT_ftree_switch_shortcut.
	* params.def (PARAM_MAX_SWITCH_INSNS): New.
	(PARAM_MAX_SWITCH_PATHS): New.
	* passes.def (pass_tree_switch_shortcut): New.
	* timevar.def (TV_TREE_SWITCH_SHORTCUT): New.
	* tree-pass.h (make_pass_tree_switch_shortcut): New.
	* tree-switch-shortcut.c: New.


2014-08-12  Steve Ellcey  <sellcey@mips.com>

	PR tree-opt/54742
	* gcc.dg/pr54742.c: New test.


[-- Attachment #2: patch.testsuite --]
[-- Type: text/x-patch, Size: 1159 bytes --]

diff --git a/gcc/testsuite/gcc.dg/pr54742.c b/gcc/testsuite/gcc.dg/pr54742.c
new file mode 100644
index 0000000..77aa8ba
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr54742.c
@@ -0,0 +1,50 @@
+/* PR tree-optimization/54742
+   Verify that the switch shortcut pass completely removes
+   the switch statement.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-tree-optimized" } */
+
+int sum0, sum1, sum2, sum3;
+int foo(char * s, char** ret)
+{
+  int state=0;
+  char c;
+
+  for (; *s && state != 4; s++)
+    {
+      c = *s;
+      if (c == '*')
+	{
+	  s++;
+	  break;
+	}
+      switch (state) {
+	case 0:
+	  if (c == '+') state = 1;
+	  else if (c != '-') sum0+=c;
+	  break;
+	case 1:
+	  if (c == '+') state = 2;
+	  else if (c == '-') state = 0;
+	  else sum1+=c;
+	  break;
+	case 2:
+	  if (c == '+') state = 3;
+	  else if (c == '-') state = 1;
+	  else sum2+=c;
+	  break;
+	case 3:
+	  if (c == '-') state = 2;
+	  else if (c == 'x') state = 4;
+	  break;
+	default:
+	  break;
+      }
+    }
+  *ret = s;
+  return state;
+}
+
+/* { dg-final { scan-tree-dump-not "switch" "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */

[-- Attachment #3: patch.gcc --]
[-- Type: text/x-patch, Size: 17606 bytes --]

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 31c1f4d..94e8ec4 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1411,6 +1411,7 @@ OBJS = \
 	tree-scalar-evolution.o \
 	tree-sra.o \
 	tree-switch-conversion.o \
+	tree-switch-shortcut.o \
 	tree-ssa-address.o \
 	tree-ssa-alias.o \
 	tree-ssa-ccp.o \
diff --git a/gcc/common.opt b/gcc/common.opt
index 0c4f86b..fe0664a 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -2249,6 +2249,10 @@ ftree-sra
 Common Report Var(flag_tree_sra) Optimization
 Perform scalar replacement of aggregates
 
+ftree-switch-shortcut
+Common Report Var(flag_tree_switch_shortcut) Init(0) Optimization
+Convert jumps to switch statements into direct jumps to case statements.
+
 ftree-ter
 Common Report Var(flag_tree_ter) Optimization
 Replace temporary expressions in the SSA->normal pass
diff --git a/gcc/opts.c b/gcc/opts.c
index be1867c..f1ac2e5 100644
--- a/gcc/opts.c
+++ b/gcc/opts.c
@@ -514,6 +514,7 @@ static const struct default_options default_options_table[] =
     { OPT_LEVELS_3_PLUS, OPT_fvect_cost_model_, NULL, VECT_COST_MODEL_DYNAMIC },
     { OPT_LEVELS_3_PLUS, OPT_fipa_cp_clone, NULL, 1 },
     { OPT_LEVELS_3_PLUS, OPT_ftree_partial_pre, NULL, 1 },
+    { OPT_LEVELS_3_PLUS, OPT_ftree_switch_shortcut, NULL, 1 },
 
     /* -Ofast adds optimizations to -O3.  */
     { OPT_LEVELS_FAST, OPT_ffast_math, NULL, 1 },
diff --git a/gcc/params.def b/gcc/params.def
index cad00e2..65377d3 100644
--- a/gcc/params.def
+++ b/gcc/params.def
@@ -1058,6 +1058,20 @@ DEFPARAM (PARAM_MAX_SLSR_CANDIDATE_SCAN,
 	  "strength reduction",
 	  50, 1, 999999)
 
+/* Maximum number of instructions to duplicate when shortcutting a switch.  */
+DEFPARAM (PARAM_MAX_SWITCH_INSNS,
+	  "max-switch-insns",
+	  "Maximum number of instructions to duplicate when "
+	  "shortcutting a switch statement",
+	  100, 1, 999999)
+
+/* Maximum number of paths to duplicate when shortcutting a switch.  */
+DEFPARAM (PARAM_MAX_SWITCH_PATHS,
+	  "max-switch-paths",
+	  "Maximum number of new paths to create when"
+	  " shortcutting a switch statement",
+	  50, 1, 999999)
+
 DEFPARAM (PARAM_ASAN_STACK,
          "asan-stack",
          "Enable asan stack protection",
diff --git a/gcc/passes.def b/gcc/passes.def
index f13df6c..8bbf2d0 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -157,6 +157,7 @@ along with GCC; see the file COPYING3.  If not see
       NEXT_PASS (pass_cselim);
       NEXT_PASS (pass_copy_prop);
       NEXT_PASS (pass_tree_ifcombine);
+      NEXT_PASS (pass_tree_switch_shortcut);
       NEXT_PASS (pass_phiopt);
       NEXT_PASS (pass_tail_recursion);
       NEXT_PASS (pass_ch);
diff --git a/gcc/timevar.def b/gcc/timevar.def
index a04d05c..d9ee915 100644
--- a/gcc/timevar.def
+++ b/gcc/timevar.def
@@ -170,6 +170,7 @@ DEFTIMEVAR (TV_TREE_LOOP_IVCANON     , "tree canonical iv")
 DEFTIMEVAR (TV_SCEV_CONST            , "scev constant prop")
 DEFTIMEVAR (TV_TREE_LOOP_UNSWITCH    , "tree loop unswitching")
 DEFTIMEVAR (TV_COMPLETE_UNROLL       , "complete unrolling")
+DEFTIMEVAR (TV_TREE_SWITCH_SHORTCUT  , "switch statement shortcuts")
 DEFTIMEVAR (TV_TREE_PARALLELIZE_LOOPS, "tree parallelize loops")
 DEFTIMEVAR (TV_TREE_VECTORIZATION    , "tree vectorization")
 DEFTIMEVAR (TV_TREE_SLP_VECTORIZATION, "tree slp vectorization")
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 1477d1f..f898e27 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -575,6 +575,7 @@ extern gimple_opt_pass *make_pass_early_inline (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_inline_parameters (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_update_address_taken (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_convert_switch (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_tree_switch_shortcut (gcc::context *ctxt);
 
 /* Current optimization pass.  */
 extern opt_pass *current_pass;
diff --git a/gcc/tree-switch-shortcut.c b/gcc/tree-switch-shortcut.c
new file mode 100644
index 0000000..4518f79
--- /dev/null
+++ b/gcc/tree-switch-shortcut.c
@@ -0,0 +1,438 @@
+/* Switch shortcutting optimization for GNU C
+   Copyright (C) 2013 Free Software Foundation, Inc.
+   Contributed by Steve Ellcey (steve.ellcey@imgtec.com).
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* This file implements an optimization where, when a variable is set
+   to a constant value and there is a path that leads from that definition
+   to a switch statement that uses that variable as its controlling
+   expression, we duplicate the blocks on this path and replace the jump
+   to the switch statement with a direct jump to the label of the case
+   that control would transfer to based on the value of the variable.
+   This can come up in loops/switch statements that implement state
+   machines.
+
+   Example (modified from PR 54742):
+
+   foo(char *str) {
+     int sum=0;
+     int state=0;
+     char *s=str;
+     for (; *s; s++) {
+       char c=*s;
+       <CODE BLOCK 1>
+       switch (state) {
+         case 0:
+           if (c == '+')       { state = 1; sum += 9; }
+           else if (c != '-')  { state = 2; sum += 3; }
+           break;
+         case 1:
+           if (c == '+')       { state = 2; sum += 4; }
+           else if (c == '-')  { state = 0; sum += 7; }
+           break;
+         case 2:
+           if (c == '+')       { state = 0; sum += 8; }
+           else if (c == '-')  { state = 1; sum += 2; }
+           break;
+       }
+       <CODE BLOCK 2>
+     }
+     return state;
+   }
+
+  This pass will convert the code inside 'case 0' to something like:
+
+    case 0:
+      if (c == '+')      { state = 1; sum += 9;
+                           <CODE BLOCK 2>
+                           s++; if (!*s) goto loop_exit;
+                           <CODE BLOCK 1>
+                           goto case_1; }
+      else if (c != '-') { state = 2; sum += 3;
+                           <CODE BLOCK 2>
+                           s++; if (!*s) goto loop_exit;
+                           <CODE BLOCK 1>
+                           goto case_2; }
+      else               { <CODE BLOCK 2>
+                           s++; if (!*s) goto loop_exit;
+                           <CODE BLOCK 1>
+                           goto case_0; }
+
+Similar transformations would apply to the other parts of the switch
+statement.  This obviously can lead to a lot of code duplication but
+it can also result in faster code since we are replacing two jumps
+(one indirect) with a single direct jump.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "params.h"
+#include "flags.h"
+#include "tree.h"
+#include "tree-pass.h"
+#include "basic-block.h"
+#include "function.h"
+#include "hash-table.h"
+#include "tree-ssa-alias.h"
+#include "tree-cfg.h"
+#include "tree-ssa-operands.h"
+#include "tree-inline.h"
+#include "gimple-expr.h"
+#include "is-a.h"
+#include "gimple.h"
+#include "tree-phinodes.h"
+#include "gimple-iterator.h"
+#include "gimple-ssa.h"
+#include "ssa-iterators.h"
+#include "tree-into-ssa.h"
+#include "cfgloop.h"
+
+/* Helper function for find_path.  visited_bbs is used to make sure we
+   don't fall into an infinite loop.  */
+
+static int
+find_path_1 (basic_block start_bb, basic_block end_bb,
+	     hash_set<basic_block> *visited_bbs)
+{
+  edge_iterator ei;
+  edge e;
+
+  if (start_bb == end_bb) return 1;
+
+  if (!visited_bbs->add (start_bb))
+    {
+      FOR_EACH_EDGE (e, ei, start_bb->succs)
+	if (find_path_1 (e->dest, end_bb, visited_bbs))
+	  return 1;
+    }
+  return 0;
+}
+
+/* Return 1 if there is a path from start_bb to end_bb and 0 if there
+   is not.  There may be multiple paths from start_bb to end_bb.  */
+
+static int
+find_path (basic_block start_bb, basic_block end_bb)
+{
+  edge_iterator ei;
+  edge e;
+  hash_set<basic_block> visited_bbs;
+  int p = 0;
+
+  if (start_bb == end_bb) return 1;
+
+  if (!visited_bbs.add (start_bb))
+    {
+      FOR_EACH_EDGE (e, ei, start_bb->succs)
+	if (find_path_1 (e->dest, end_bb, &visited_bbs))
+	  {
+	    p = 1;
+	    break;
+	  }
+    }
+  return p;
+}
+
+
+/* We save the paths we want to copy in bbs_list_array.  n_bbs_list is the
+   number of paths saved, bbs_list_array[i] is the list of basic blocks in
+   one path.  Each path starts with the block where a variable is assigned
+   a constant value (bbs_list_array[i][0]) and ends with the switch statement
+   block (bbs_list_array[i][bbs_list_size[i]-2]) followed by the block that
+   the switch statement is going to go to given the constant value of the
+   variable (bbs_list_array[i][bbs_list_size[i]-1]).  */
+
+struct path_info
+{
+  basic_block **bbs_list_array;
+  int *val_array;
+  int *bbs_list_size;
+  int max_path_count;
+  int max_insn_count;
+  int n_bbs_list;
+};
+
+/* bbs_list[0] is the block with the switch statement,
+   bbs_list[n-1] is the block where the switch statement variable is assigned
+     a constant value,
+   The entries in between make a (reverse) path between the two.
+
+   We don't want to change bbs_list; we want to leave that alone and
+   copy the path to bbs_list_array so that we wind up with a list (array)
+   of paths that we want to update.  We also want to add the block that the
+   switch is going to go to on to the list so that we know which exit from
+   the switch statement is important.  */
+
+static void
+save_new_path (basic_block *bbs_list, int n, tree val, path_info *pi)
+{
+  int i;
+  int insn_count;
+  basic_block bb;
+  edge switch_taken_edge;
+  gimple_stmt_iterator gsi;
+
+  if (n <= 1) return;
+
+  if (pi->n_bbs_list >= pi->max_path_count)
+    return;
+
+  /* Put the blocks in 'correct' order and add in where we want to go after
+     the switch statement.  We want to leave bbs_list untouched for future
+     calls.  */
+
+  pi->bbs_list_array[pi->n_bbs_list] = XNEWVEC (basic_block, n+1);
+  for (i = 0; i < n; i++)
+    pi->bbs_list_array[pi->n_bbs_list][i] = bbs_list[n-i-1];
+
+  switch_taken_edge = find_taken_edge (bbs_list[0], val);
+  pi->bbs_list_array[pi->n_bbs_list][n] = switch_taken_edge->dest;
+
+  pi->bbs_list_size[pi->n_bbs_list] = n + 1;
+  pi->val_array[pi->n_bbs_list] = (int) TREE_INT_CST_LOW (val);
+
+  /* Count how many instructions are in the blocks we are going to
+     duplicate, and if there are too many, do not save this path
+     (return without incrementing n_bbs_list).  */
+
+  insn_count = 0;
+  for (i = 1; i < n; i++)
+    {
+      bb = pi->bbs_list_array[pi->n_bbs_list][i];
+      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+	insn_count += estimate_num_insns (gsi_stmt (gsi), &eni_size_weights);
+    }
+
+  if (insn_count > pi->max_insn_count)
+    return;
+
+  pi->n_bbs_list = pi->n_bbs_list + 1;
+}
+
+/* switch_stmt is a switch statement whose switch index expression
+   is the variable expr.  We trace the value of the variable back
+   through any phi nodes looking for places where it gets a constant
+   value and save the path in bbs_list.  Then we call save_new_path
+   to create a list of such paths.  */
+
+static void
+process_switch (tree expr, gimple switch_stmt,
+		hash_set<gimple> *visited_phis,
+	        basic_block *bbs_list, int n,
+		path_info *pi)
+{
+  gimple def_stmt;
+  tree var;
+  unsigned int i;
+  edge e;
+  edge_iterator ei;
+  basic_block bbx;
+  basic_block var_bb;
+  int e_count;
+
+  gcc_assert (gimple_code (switch_stmt) == GIMPLE_SWITCH);
+  var = SSA_NAME_VAR (expr);
+  def_stmt = SSA_NAME_DEF_STMT (expr);
+  var_bb = gimple_bb (def_stmt);
+
+  if (var == NULL || var_bb == NULL) return;
+
+  /* We have a variable definition (var) that is defined in var_bb.
+     We want to put the path from var_bb to the current bb into
+     bbs_list.  If there is more than one path, skip this and don't
+     try to do the optimization.  */
+
+  bbx = bbs_list[n-1];
+  while (bbx != var_bb)
+    {
+      e_count = 0;
+      FOR_EACH_EDGE (e, ei, bbx->preds)
+	if (find_path (var_bb, e->src))
+	  {
+	    bbs_list[n] = e->src;
+	    n = n + 1;
+	    e_count = e_count + 1;
+	  }
+      if (e_count != 1) return;
+      bbx = bbs_list[n-1];
+    }
+
+  if (gimple_code (def_stmt) == GIMPLE_PHI
+      && !visited_phis->add (def_stmt))
+    {
+      for (i = 0; i < gimple_phi_num_args (def_stmt); i++)
+	{
+	  tree arg = gimple_phi_arg_def (def_stmt, i);
+	  if (arg && TREE_CODE (arg) == INTEGER_CST)
+	    {
+	      bbs_list[n] = gimple_phi_arg_edge (def_stmt, i)->src;
+	      save_new_path (bbs_list, n + 1, arg, pi);
+	    }
+	  else if (arg && TREE_CODE (arg) == SSA_NAME)
+	    {
+	      bbs_list[n] = gimple_phi_arg_edge (def_stmt, i)->src;
+	      process_switch (arg, switch_stmt, visited_phis, bbs_list, n+1, pi);
+	    }
+	}
+    }
+}
+
+/* Find paths that lead from blocks where a variable is assigned a constant
+   value to a switch statement where that variable is used as the switch
+   index.  Save the paths in bbs_list_array so that they can be processed
+   by copy_switch_paths.  */
+
+static unsigned int
+find_switch_shortcuts (function *fun, path_info *pi)
+{
+  basic_block bb;
+  hash_set<gimple> visited_phis;
+  basic_block *bbs_list;
+  int n = 1;
+
+  bbs_list = XNEWVEC (basic_block, n_basic_blocks_for_fn (fun));
+  FOR_EACH_BB_FN (bb, fun)
+    {
+      gimple stmt = last_stmt (bb);
+      if (stmt && gimple_code (stmt) == GIMPLE_SWITCH)
+	{
+	  tree op = gimple_switch_index (stmt);
+	  tree var = SSA_NAME_VAR (op);
+	  if (var)
+	    {
+	      bbs_list[0] = bb;
+	      process_switch (op, stmt, &visited_phis, bbs_list, n, pi);
+	    }
+	}
+    }
+  XDELETEVEC (bbs_list);
+  return 0;
+}
+
+/* Call gimple_duplicate_sese_region to duplicate the blocks in bb_list.
+   We free and recalculate all SSA and dominance information afterwards
+   because the region being copied is not really SESE and so we cannot
+   trust gimple_duplicate_sese_region to correctly update the dataflow
+   information.  */
+
+static void
+duplicate_blocks (basic_block *bb_list, int bb_count)
+{
+  edge orig_edge, exit_edge;
+  loop_p loop;
+
+  orig_edge = find_edge (bb_list[0], bb_list[1]);
+  exit_edge = find_edge (bb_list[bb_count-2], bb_list[bb_count-1]);
+  /* Earlier block duplications may have removed the path that we
+     saved earlier and are trying to duplicate here.  */
+  if (orig_edge != NULL && exit_edge != NULL)
+    {
+      gimple_duplicate_sese_region (orig_edge, exit_edge, &bb_list[1],
+				    bb_count-2, NULL, false);
+      free_dominance_info (CDI_DOMINATORS);
+      update_ssa (TODO_update_ssa);
+      calculate_dominance_info (CDI_DOMINATORS);
+      loops_state_set (LOOPS_NEED_FIXUP);
+    }
+}
+
+/* Go through the paths saved in bbs_list_array and make copies of them.  */
+
+static void
+copy_switch_paths (path_info *pi)
+{
+  int i;
+
+  /* Process each path in bbs_list_array.  */
+  for (i = 0; i < pi->n_bbs_list; i++)
+    {
+    /* For each path, loop through and copy each block in
+       the path (except the first one, where the constant is assigned,
+       and the final one, where the switch statement transfers control).  */
+
+    if (!single_pred_p (pi->bbs_list_array[i][1]))
+      duplicate_blocks (pi->bbs_list_array[i], pi->bbs_list_size[i]);
+    }
+}
+
+
+/* Main entry for the switch shortcut pass.  */
+
+namespace {
+
+const pass_data pass_data_tree_switch_shortcut =
+{
+  GIMPLE_PASS, /* type */
+  "switch_shortcut", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_TREE_SWITCH_SHORTCUT, /* tv_id */
+  ( PROP_cfg | PROP_ssa ), /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_update_ssa, /* todo_flags_finish */
+};
+
+class pass_tree_switch_shortcut : public gimple_opt_pass
+{
+public:
+  pass_tree_switch_shortcut (gcc::context *ctxt)
+    : gimple_opt_pass (pass_data_tree_switch_shortcut, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *)
+    {
+      return flag_tree_switch_shortcut;
+    }
+  virtual unsigned int execute (function *);
+
+}; // class pass_tree_switch_shortcut
+
+unsigned int
+pass_tree_switch_shortcut::execute (function *fun)
+{
+  int i;
+  path_info *pi;
+
+  pi = XNEW (path_info);
+  pi->n_bbs_list = 0;
+  pi->max_insn_count = PARAM_VALUE (PARAM_MAX_SWITCH_INSNS);
+  pi->max_path_count = PARAM_VALUE (PARAM_MAX_SWITCH_PATHS);
+  pi->val_array = XNEWVEC (int, pi->max_path_count);
+  pi->bbs_list_size = XNEWVEC (int, pi->max_path_count);
+  pi->bbs_list_array = XNEWVEC (basic_block *, pi->max_path_count);
+  find_switch_shortcuts (fun, pi);
+  copy_switch_paths (pi);
+  XDELETEVEC (pi->val_array);
+  XDELETEVEC (pi->bbs_list_size);
+  for (i = 0; i < pi->n_bbs_list; i++)
+    XDELETEVEC (pi->bbs_list_array[i]);
+  XDELETEVEC (pi->bbs_list_array);
+  XDELETE (pi);
+  return 0;
+}
+
+} // anon namespace
+
+gimple_opt_pass *
+make_pass_tree_switch_shortcut (gcc::context *ctxt)
+{
+  return new pass_tree_switch_shortcut (ctxt);
+}


* Re: [Patch] Switch elimination pass for PR 54742
  2014-08-19 20:40 [Patch] Switch elimination pass for PR 54742 Steve Ellcey
@ 2014-08-20 17:04 ` James Greenhalgh
  2014-08-20 20:29   ` Sebastian Pop
  2014-08-21  8:58 ` Richard Biener
  1 sibling, 1 reply; 54+ messages in thread
From: James Greenhalgh @ 2014-08-20 17:04 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: GCC Patches, Jeff Law, Richard Biener, Sebastian Pop

On Tue, Aug 19, 2014 at 09:39:56PM +0100, Steve Ellcey wrote:
> Here is an official submission for the switch optimization described in
> PR 54742.  I have addressed the formatting/comment issues that were raised,
> added a test case based on comment #27 from PR 54742, and fixed a bug I
> found while benchmarking with SPEC 2006 (the perl benchmark was generating
> an ICE in a routine with multiple switch statements).
> 
> I ran benchmarks to see if I could find any more tests that are helped
> the way CoreMark is, and while I found a number of benchmarks in
> SPEC 2006 and EEMBC where the optimization is triggered, it generally
> didn't affect their performance.  The biggest impact I could find was on
> the perl benchmark in SPEC, where I saw around a 0.4% improvement on a
> MIPS 74K.  Not huge, but not nothing.

For what it is worth, I see a nice (~4%) improvement in Crafty from
SPEC 2000. I haven't investigated too deeply, but at a first glance the
number of branch mispredictions has dropped just over 1%, as you
might hope from this optimisation.

I can also attest to there being a number of places the optimisation is
triggered (with high enough parameters; I was running with
--param max-switch-paths=1000 --param max-switch-insns=10000), but like
you I don't see much measurable change in execution time.

Thanks,
James

> 
> So, OK to checkin?
> 
> Steve Ellcey
> sellcey@mips.com
> 
> 
> 2014-08-12  Steve Ellcey  <sellcey@mips.com>
> 
> 	PR tree-opt/54742
> 	* Makefile.in (OBJS): Add tree-switch-shortcut.o.
> 	* common.opt (ftree-switch-shortcut): New.
> 	* opts.c (default_options_table): Add OPT_ftree_switch_shortcut.
> 	* params.def (PARAM_MAX_SWITCH_INSNS): New.
> 	(PARAM_MAX_SWITCH_PATHS): New.
> 	* passes.def (pass_tree_switch_shortcut): New.
> 	* timevar.def (TV_TREE_SWITCH_SHORTCUT): New.
> 	* tree-pass.h (make_pass_tree_switch_shortcut): New.
> 	* tree-switch-shortcut.c: New.
> 
> 
> 2014-08-12  Steve Ellcey  <sellcey@mips.com>
> 
> 	PR tree-opt/54742
> 	* gcc.dg/pr54742.c: New test.

> diff --git a/gcc/testsuite/gcc.dg/pr54742.c b/gcc/testsuite/gcc.dg/pr54742.c
> new file mode 100644
> index 0000000..77aa8ba
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/pr54742.c
> @@ -0,0 +1,50 @@
> +/* PR tree-optimization/54742
> +   Verify that the tree-optimization-shortcut pass completely removes
> +   the switch statement.  */
> +
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +
> +int sum0, sum1, sum2, sum3;
> +int foo(char * s, char** ret)
> +{
> +  int state=0;
> +  char c;
> +
> +  for (; *s && state != 4; s++)
> +    {
> +      c = *s;
> +      if (c == '*')
> +	{
> +	  s++;
> +	  break;
> +	}
> +      switch (state) {
> +	case 0:
> +	  if (c == '+') state = 1;
> +	  else if (c != '-') sum0+=c;
> +	  break;
> +	case 1:
> +	  if (c == '+') state = 2;
> +	  else if (c == '-') state = 0;
> +	  else sum1+=c;
> +	  break;
> +	case 2:
> +	  if (c == '+') state = 3;
> +	  else if (c == '-') state = 1;
> +	  else sum2+=c;
> +	  break;
> +	case 3:
> +	  if (c == '-') state = 2;
> +	  else if (c == 'x') state = 4;
> +	  break;
> +	default:
> +	  break;
> +      }
> +    }
> +  *ret = s;
> +  return state;
> +}
> +
> +/* { dg-final { scan-tree-dump-not "switch" "optimized" } } */
> +/* { dg-final { cleanup-tree-dump "optimized" } } */
> diff --git a/gcc/Makefile.in b/gcc/Makefile.in
> index 31c1f4d..94e8ec4 100644
> --- a/gcc/Makefile.in
> +++ b/gcc/Makefile.in
> @@ -1411,6 +1411,7 @@ OBJS = \
>  	tree-scalar-evolution.o \
>  	tree-sra.o \
>  	tree-switch-conversion.o \
> +	tree-switch-shortcut.o \
>  	tree-ssa-address.o \
>  	tree-ssa-alias.o \
>  	tree-ssa-ccp.o \
> diff --git a/gcc/common.opt b/gcc/common.opt
> index 0c4f86b..fe0664a 100644
> --- a/gcc/common.opt
> +++ b/gcc/common.opt
> @@ -2249,6 +2249,10 @@ ftree-sra
>  Common Report Var(flag_tree_sra) Optimization
>  Perform scalar replacement of aggregates
>  
> +ftree-switch-shortcut
> +Common Report Var(flag_tree_switch_shortcut) Init(0) Optimization
> +Convert jumps to switch statements into jumps to case statement.
> +
>  ftree-ter
>  Common Report Var(flag_tree_ter) Optimization
>  Replace temporary expressions in the SSA->normal pass
> diff --git a/gcc/opts.c b/gcc/opts.c
> index be1867c..f1ac2e5 100644
> --- a/gcc/opts.c
> +++ b/gcc/opts.c
> @@ -514,6 +514,7 @@ static const struct default_options default_options_table[] =
>      { OPT_LEVELS_3_PLUS, OPT_fvect_cost_model_, NULL, VECT_COST_MODEL_DYNAMIC },
>      { OPT_LEVELS_3_PLUS, OPT_fipa_cp_clone, NULL, 1 },
>      { OPT_LEVELS_3_PLUS, OPT_ftree_partial_pre, NULL, 1 },
> +    { OPT_LEVELS_3_PLUS, OPT_ftree_switch_shortcut, NULL, 1 },
>  
>      /* -Ofast adds optimizations to -O3.  */
>      { OPT_LEVELS_FAST, OPT_ffast_math, NULL, 1 },
> diff --git a/gcc/params.def b/gcc/params.def
> index cad00e2..65377d3 100644
> --- a/gcc/params.def
> +++ b/gcc/params.def
> @@ -1058,6 +1058,20 @@ DEFPARAM (PARAM_MAX_SLSR_CANDIDATE_SCAN,
>  	  "strength reduction",
>  	  50, 1, 999999)
>  
> +/* Maximum number of instructions to duplicate when shortcutting a switch.  */
> +DEFPARAM (PARAM_MAX_SWITCH_INSNS,
> +	  "max-switch-insns",
> +	  "Maximum number of instructions to duplicate when "
> +	  "shortcutting a switch statement",
> +	  100, 1, 999999)
> +
> +/* Maximum number of paths to duplicate when shortcutting a switch.  */
> +DEFPARAM (PARAM_MAX_SWITCH_PATHS,
> +	  "max-switch-paths",
> +	  "Maximum number of new paths to create when"
> +	  " shortcutting a switch statement",
> +	  50, 1, 999999)
> +
>  DEFPARAM (PARAM_ASAN_STACK,
>           "asan-stack",
>           "Enable asan stack protection",
> diff --git a/gcc/passes.def b/gcc/passes.def
> index f13df6c..8bbf2d0 100644
> --- a/gcc/passes.def
> +++ b/gcc/passes.def
> @@ -157,6 +157,7 @@ along with GCC; see the file COPYING3.  If not see
>        NEXT_PASS (pass_cselim);
>        NEXT_PASS (pass_copy_prop);
>        NEXT_PASS (pass_tree_ifcombine);
> +      NEXT_PASS (pass_tree_switch_shortcut);
>        NEXT_PASS (pass_phiopt);
>        NEXT_PASS (pass_tail_recursion);
>        NEXT_PASS (pass_ch);
> diff --git a/gcc/timevar.def b/gcc/timevar.def
> index a04d05c..d9ee915 100644
> --- a/gcc/timevar.def
> +++ b/gcc/timevar.def
> @@ -170,6 +170,7 @@ DEFTIMEVAR (TV_TREE_LOOP_IVCANON     , "tree canonical iv")
>  DEFTIMEVAR (TV_SCEV_CONST            , "scev constant prop")
>  DEFTIMEVAR (TV_TREE_LOOP_UNSWITCH    , "tree loop unswitching")
>  DEFTIMEVAR (TV_COMPLETE_UNROLL       , "complete unrolling")
> +DEFTIMEVAR (TV_TREE_SWITCH_SHORTCUT  , "switch statement shortcuts")
>  DEFTIMEVAR (TV_TREE_PARALLELIZE_LOOPS, "tree parallelize loops")
>  DEFTIMEVAR (TV_TREE_VECTORIZATION    , "tree vectorization")
>  DEFTIMEVAR (TV_TREE_SLP_VECTORIZATION, "tree slp vectorization")
> diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
> index 1477d1f..f898e27 100644
> --- a/gcc/tree-pass.h
> +++ b/gcc/tree-pass.h
> @@ -575,6 +575,7 @@ extern gimple_opt_pass *make_pass_early_inline (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_inline_parameters (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_update_address_taken (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_convert_switch (gcc::context *ctxt);
> +extern gimple_opt_pass *make_pass_tree_switch_shortcut (gcc::context *ctxt);
>  
>  /* Current optimization pass.  */
>  extern opt_pass *current_pass;
> diff --git a/gcc/tree-switch-shortcut.c b/gcc/tree-switch-shortcut.c
> new file mode 100644
> index 0000000..4518f79
> --- /dev/null
> +++ b/gcc/tree-switch-shortcut.c
> @@ -0,0 +1,438 @@
> +/* Switch shortcutting optimization for GNU C
> +   Copyright (C) 2013 Free Software Foundation, Inc.
> +   Contributed by Steve Ellcey (steve.ellcey@imgtec.com).
> +
> +This file is part of GCC.
> +
> +GCC is free software; you can redistribute it and/or modify it under
> +the terms of the GNU General Public License as published by the Free
> +Software Foundation; either version 3, or (at your option) any later
> +version.
> +
> +GCC is distributed in the hope that it will be useful, but WITHOUT ANY
> +WARRANTY; without even the implied warranty of MERCHANTABILITY or
> +FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
> +for more details.
> +
> +You should have received a copy of the GNU General Public License
> +along with GCC; see the file COPYING3.  If not see
> +<http://www.gnu.org/licenses/>.  */
> +
> +/* This file implements an optimization where, when a variable is set
> +   to a constant value and there is a path that leads from that definition
> +   to a switch statement that uses that variable as its controlling expression
> +   we duplicate the blocks on this path and change the jump to the switch
> +   statement with a direct jump to the label of the switch block that control
> +   would goto based on the value of the variable.  This can come up in
> +   loops/switch statements that implement state machines.
> +
> +   Example (modified from PR 54742):
> +
> +   foo(char *str) {
> +     int sum=0;
> +     int state=0;
> +     char *s=str;
> +     for (; *s; s++) {
> +       char c=*s;
> +       <CODE BLOCK 1>
> +       switch (state) {
> +         case 0:
> +           if (c == '+')       { state = 1; sum += 9; }
> +           else if (c != '-')  { state = 2; sum += 3; }
> +           break;
> +         case 1:
> +           if (c == '+')       { state = 2; sum += 4; }
> +           else if (c == '-')  { state = 0; sum += 7; }
> +           break;
> +         case 2:
> +           if (c == '+')       { state = 0; sum += 8; }
> +           else if (c == '-')  { state = 1; sum += 2; }
> +           break;
> +       }
> +       <CODE BLOCK 2>
> +     }
> +     return state;
> +   }
> +
> +  This pass will convert the code inside 'case 0' to something like:
> +
> +    case 0:
> +      if (c == '+')      { state = 1; sum += 9;
> +                           <CODE BLOCK 2>
> +                           s++; if (!s) goto loop_exit;
> +                           <CODE BLOCK 1>
> +                           goto case_1; }
> +      else if (c != '-') { state = 2; sum += 3;
> +                           <CODE BLOCK 2>
> +                           s++; if (!s) goto loop_exit;
> +                           <CODE BLOCK 1>
> +                           goto case_2; }
> +      else               { <CODE BLOCK 2>
> +			   s++; if (!s) goto exit;
> +                           <CODE BLOCK 1>
> +                           goto case_0; }
> +
> +   Similar transformations would apply to the other parts of the switch
> +   statement.  This obviously can lead to a lot of code duplication but
> +   it can also result in faster code since we are replacing two jumps
> +   (one indirect) with a single direct jump.  */
> +
> +#include "config.h"
> +#include "system.h"
> +#include "coretypes.h"
> +#include "tm.h"
> +#include "params.h"
> +#include "flags.h"
> +#include "tree.h"
> +#include "tree-pass.h"
> +#include "basic-block.h"
> +#include "function.h"
> +#include "hash-table.h"
> +#include "tree-ssa-alias.h"
> +#include "tree-cfg.h"
> +#include "tree-ssa-operands.h"
> +#include "tree-inline.h"
> +#include "gimple-expr.h"
> +#include "is-a.h"
> +#include "gimple.h"
> +#include "tree-phinodes.h"
> +#include "gimple-iterator.h"
> +#include "gimple-ssa.h"
> +#include "ssa-iterators.h"
> +#include "tree-into-ssa.h"
> +#include "cfgloop.h"
> +
> +/* Helper function for find_path.  VISITED_BBS is used to make sure we
> +   don't fall into an infinite loop.  */
> +
> +static int
> +find_path_1 (basic_block start_bb, basic_block end_bb,
> +	     hash_set<basic_block> *visited_bbs)
> +{
> +  edge_iterator ei;
> +  edge e;
> +
> +  if (start_bb == end_bb)
> +    return 1;
> +
> +  if (!visited_bbs->add (start_bb))
> +    {
> +      FOR_EACH_EDGE (e, ei, start_bb->succs)
> +	if (find_path_1 (e->dest, end_bb, visited_bbs))
> +	  return 1;
> +    }
> +  return 0;
> +}
> +
> +/* Return 1 if there is a path from START_BB to END_BB and 0 if there
> +   is not.  There may be multiple paths from START_BB to END_BB.  */
> +
> +static int
> +find_path (basic_block start_bb, basic_block end_bb)
> +{
> +  edge_iterator ei;
> +  edge e;
> +  hash_set<basic_block> visited_bbs;
> +  int p = 0;
> +
> +  if (start_bb == end_bb)
> +    return 1;
> +
> +  if (!visited_bbs.add (start_bb))
> +    {
> +      FOR_EACH_EDGE (e, ei, start_bb->succs)
> +	if (find_path_1 (e->dest, end_bb, &visited_bbs))
> +	  {
> +	    p = 1;
> +	    break;
> +	  }
> +    }
> +  return p;
> +}
> +
> +
> +/* We save the paths we want to copy in bbs_list_array.  n_bbs_list is the
> +   number of paths saved, bbs_list_array[i] is the list of basic blocks in
> +   one path.  Each path starts with the block where a variable is assigned
> +   a constant value (bbs_list_array[i][0]) and ends with the switch statement
> +   block (bbs_list_array[i][bbs_list_size[i]-2]) followed by the block that
> +   the switch statement is going to go to given the constant value of the
> +   variable (bbs_list_array[i][bbs_list_size[i]-1]).  */
> +
> +struct path_info
> +{
> +  basic_block **bbs_list_array;
> +  int *val_array;
> +  int *bbs_list_size;
> +  int max_path_count;
> +  int max_insn_count;
> +  int n_bbs_list;
> +};
> +
> +/* BBS_LIST[0] is the block with the switch statement,
> +   BBS_LIST[n-1] is the block where the switch statement variable is
> +   assigned a constant value, and the entries in between make a (reverse)
> +   path between the two.
> +
> +   We don't want to change BBS_LIST; we want to leave it alone and
> +   copy the path to bbs_list_array so that we wind up with a list (array)
> +   of paths that we want to update.  We also add the block that the
> +   switch is going to branch to onto the list so that we know which exit
> +   from the switch statement is important.  */
> +
> +static void
> +save_new_path (basic_block *bbs_list, int n, tree val, path_info *pi)
> +{
> +  int i;
> +  int insn_count;
> +  basic_block bb;
> +  edge switch_taken_edge;
> +  gimple_stmt_iterator gsi;
> +
> +  if (n <= 1)
> +    return;
> +
> +  if (pi->n_bbs_list >= pi->max_path_count)
> +    return;
> +
> +  /* Put the blocks in 'correct' order and add in where we want to go after
> +     the switch statement.  We want to leave BBS_LIST untouched for future
> +     calls.  */
> +
> +  pi->bbs_list_array[pi->n_bbs_list] = XNEWVEC (basic_block, n+1);
> +  for (i = 0; i < n; i++)
> +    pi->bbs_list_array[pi->n_bbs_list][i] = bbs_list[n-i-1];
> +
> +  switch_taken_edge = find_taken_edge (bbs_list[0], val);
> +  pi->bbs_list_array[pi->n_bbs_list][n] = switch_taken_edge->dest;
> +
> +  pi->bbs_list_size[pi->n_bbs_list] = n + 1;
> +  pi->val_array[pi->n_bbs_list] = (int) TREE_INT_CST_LOW (val);
> +
> +  /* Count how many instructions are in the blocks we are going to
> +     duplicate and if there are too many do not save this path
> +     (return without incrementing n_bbs_list).  */
> +
> +  insn_count = 0;
> +  for (i = 1; i < n; i++)
> +    {
> +      bb = pi->bbs_list_array[pi->n_bbs_list][i];
> +      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> +	insn_count += estimate_num_insns (gsi_stmt (gsi), &eni_size_weights);
> +    }
> +
> +  if (insn_count > pi->max_insn_count)
> +    return;
> +
> +  pi->n_bbs_list = pi->n_bbs_list + 1;
> +}
> +
> +/* SWITCH_STMT is a switch statement whose switch index expression is
> +   the variable EXPR.  We trace the value of the variable back through
> +   any phi nodes looking for places where it gets a constant value and
> +   save the path in BBS_LIST.  Then we call save_new_path to create a
> +   list of such paths.  */
> +
> +static void
> +process_switch (tree expr, gimple switch_stmt,
> +		hash_set<gimple> *visited_phis,
> +	        basic_block *bbs_list, int n,
> +		path_info *pi)
> +{
> +  gimple def_stmt;
> +  tree var;
> +  unsigned int i;
> +  edge e;
> +  edge_iterator ei;
> +  basic_block bbx;
> +  basic_block var_bb;
> +  int e_count;
> +
> +  gcc_assert (gimple_code (switch_stmt) == GIMPLE_SWITCH);
> +  var = SSA_NAME_VAR (expr);
> +  def_stmt = SSA_NAME_DEF_STMT (expr);
> +  var_bb = gimple_bb (def_stmt);
> +
> +  if (var == NULL || var_bb == NULL)
> +    return;
> +
> +  /* We have a variable definition (VAR) that is defined in VAR_BB.
> +     We want to put the path from VAR_BB to the current bb into
> +     BBS_LIST.  If there is more than one path, skip this and don't
> +     try to do the optimization.  */
> +
> +  bbx = bbs_list[n-1];
> +  while (bbx != var_bb)
> +    {
> +      e_count = 0;
> +      FOR_EACH_EDGE (e, ei, bbx->preds)
> +	if (find_path (var_bb, e->src))
> +	  {
> +	    bbs_list[n] = e->src;
> +	    n = n + 1;
> +	    e_count = e_count + 1;
> +	  }
> +      if (e_count != 1)
> +	return;
> +      bbx = bbs_list[n-1];
> +    }
> +
> +  if (gimple_code (def_stmt) == GIMPLE_PHI
> +      && !visited_phis->add (def_stmt))
> +    {
> +      for (i = 0; i < gimple_phi_num_args (def_stmt); i++)
> +	{
> +	  tree arg = gimple_phi_arg_def (def_stmt, i);
> +	  if (arg && TREE_CODE (arg) == INTEGER_CST)
> +	    {
> +	      bbs_list[n] = gimple_phi_arg_edge (def_stmt, i)->src;
> +	      save_new_path (bbs_list, n + 1, arg, pi);
> +	    }
> +	  else if (arg && TREE_CODE (arg) == SSA_NAME)
> +	    {
> +	      bbs_list[n] = gimple_phi_arg_edge (def_stmt, i)->src;
> +	      process_switch (arg, switch_stmt, visited_phis, bbs_list, n+1, pi);
> +	    }
> +	}
> +    }
> +}
> +
> +/* Find paths that lead from blocks where a variable is assigned a constant
> +   value to a switch statement where that variable is used as the switch
> +   index.  Save the paths in bbs_list_array so that they can be processed
> +   by copy_switch_paths.  */
> +
> +static unsigned int
> +find_switch_shortcuts (function *fun, path_info *pi)
> +{
> +  basic_block bb;
> +  hash_set<gimple> visited_phis;
> +  basic_block *bbs_list;
> +  int n = 1;
> +
> +  bbs_list = XNEWVEC (basic_block, n_basic_blocks_for_fn (fun));
> +  FOR_EACH_BB_FN (bb, fun)
> +    {
> +      gimple stmt = last_stmt (bb);
> +      if (stmt && gimple_code (stmt) == GIMPLE_SWITCH)
> +	{
> +	  tree op = gimple_switch_index (stmt);
> +	  tree var = SSA_NAME_VAR (op);
> +	  if (var)
> +	    {
> +	      bbs_list[0] = bb;
> +	      process_switch (op, stmt, &visited_phis, bbs_list, n, pi);
> +	    }
> +	}
> +    }
> +  XDELETEVEC (bbs_list);
> +  return 0;
> +}
> +
> +/* Call gimple_duplicate_sese_region to duplicate the blocks in BB_LIST.
> +   We free and recalculate all SSA and dominance information afterwards
> +   because the region being copied is not really SESE and so we cannot
> +   trust gimple_duplicate_sese_region to correctly update the dataflow
> +   information.  */
> +
> +static void
> +duplicate_blocks (basic_block *bb_list, int bb_count)
> +{
> +  edge orig_edge, exit_edge;
> +  loop_p loop;
> +
> +  orig_edge = find_edge (bb_list[0], bb_list[1]);
> +  exit_edge = find_edge (bb_list[bb_count-2], bb_list[bb_count-1]);
> +  /* Earlier block duplications may have removed the path that we
> +     saved earlier and are trying to duplicate here.  */
> +  if (orig_edge != NULL && exit_edge != NULL)
> +    {
> +      gimple_duplicate_sese_region (orig_edge, exit_edge, &bb_list[1],
> +				    bb_count-2, NULL, false);
> +      free_dominance_info (CDI_DOMINATORS);
> +      update_ssa (TODO_update_ssa);
> +      calculate_dominance_info (CDI_DOMINATORS);
> +      loops_state_set (LOOPS_NEED_FIXUP);
> +    }
> +}
> +
> +/* Go through the paths saved in bbs_list_array and make copies of them.  */
> +
> +static void
> +copy_switch_paths (path_info *pi)
> +{
> +  int i;
> +
> +  /* Process each path in bbs_list_array.  */
> +  for (i = 0; i < pi->n_bbs_list; i++)
> +    {
> +    /* For each path in bbs_list_array loop through and copy each block in
> +       the path (except the first one, where the constant is assigned, and
> +       the final one, where the switch statement goes to).  */
> +
> +    if (!single_pred_p (pi->bbs_list_array[i][1]))
> +      duplicate_blocks (pi->bbs_list_array[i], pi->bbs_list_size[i]);
> +    }
> +}
> +
> +
> +/* Main entry point for the tree switch shortcut pass.  */
> +
> +namespace {
> +
> +const pass_data pass_data_tree_switch_shortcut =
> +{
> +  GIMPLE_PASS, /* type */
> +  "switch_shortcut", /* name */
> +  OPTGROUP_NONE, /* optinfo_flags */
> +  TV_TREE_SWITCH_SHORTCUT, /* tv_id */
> +  ( PROP_cfg | PROP_ssa ), /* properties_required */
> +  0, /* properties_provided */
> +  0, /* properties_destroyed */
> +  0, /* todo_flags_start */
> +  TODO_update_ssa, /* todo_flags_finish */
> +};
> +
> +class pass_tree_switch_shortcut : public gimple_opt_pass
> +{
> +public:
> +  pass_tree_switch_shortcut (gcc::context *ctxt)
> +    : gimple_opt_pass (pass_data_tree_switch_shortcut, ctxt)
> +  {}
> +
> +  /* opt_pass methods: */
> +  virtual bool gate (function *)
> +    {
> +      return flag_tree_switch_shortcut;
> +    }
> +  virtual unsigned int execute (function *);
> +
> +}; // class pass_tree_switch_shortcut
> +
> +unsigned int
> +pass_tree_switch_shortcut::execute (function *fun)
> +{
> +  int i;
> +  path_info *pi;
> +
> +  pi = XNEW (path_info);
> +  pi->n_bbs_list = 0;
> +  pi->max_insn_count = PARAM_VALUE (PARAM_MAX_SWITCH_INSNS);
> +  pi->max_path_count = PARAM_VALUE (PARAM_MAX_SWITCH_PATHS);
> +  pi->val_array = XNEWVEC (int, pi->max_path_count);
> +  pi->bbs_list_size = XNEWVEC (int, pi->max_path_count);
> +  pi->bbs_list_array = XNEWVEC (basic_block *, pi->max_path_count);
> +  find_switch_shortcuts (fun, pi);
> +  copy_switch_paths (pi);
> +  XDELETEVEC (pi->val_array);
> +  XDELETEVEC (pi->bbs_list_size);
> +  for (i = 0; i < pi->n_bbs_list; i++)
> +    XDELETEVEC (pi->bbs_list_array[i]);
> +  XDELETEVEC (pi->bbs_list_array);
> +  XDELETE (pi);
> +  return 0;
> +}
> +
> +} // anon namespace
> +
> +gimple_opt_pass *
> +make_pass_tree_switch_shortcut (gcc::context *ctxt)
> +{
> +  return new pass_tree_switch_shortcut (ctxt);
> +}

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [Patch] Switch elimination pass for PR 54742
  2014-08-20 17:04 ` James Greenhalgh
@ 2014-08-20 20:29   ` Sebastian Pop
  2014-08-21  8:53     ` Richard Biener
  0 siblings, 1 reply; 54+ messages in thread
From: Sebastian Pop @ 2014-08-20 20:29 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: Steve Ellcey, GCC Patches, Jeff Law, Richard Biener

James Greenhalgh wrote:
> On Tue, Aug 19, 2014 at 09:39:56PM +0100, Steve Ellcey wrote:
> > Here is an official submission for the switch optimization described in
> > PR 54742.  I have addressed the formatting/comment issues that were raised
> > and also added a test case based on comment #27 from PR 54742 and I fixed a
> > bug I found while doing benchmarking with SPEC2006 (the perl benchmark was
> > generating an ICE in a routine with multiple switch statements).
> > 
> > I ran the benchmarking to see if I could find any more tests that are
> > helped like coremark is and while I found a number of benchmarks in
> > SPEC 2006 and EEMBC where the optimization is triggered, this optimization
> > generally didn't affect the performance of those benchmarks.  The biggest
> > impact I could find was on the perl benchmark in SPEC where I saw around
> > a 0.4% improvement on a MIPS 74k.  Not huge, but not nothing.
> 
> For what it is worth, I see a nice (~4%) improvement in Crafty from
> SPEC 2000. I haven't investigated too deeply, but at a first glance the
> number of branch mispredictions has dropped just over 1%, as you
> might hope from this optimisation.
> 
> I can also attest to there being a number of places the optimisation is
> triggered (with high enough parameters; I was running with
> --param max-switch-paths=1000 --param max-switch-insns=10000), but like
> you I don't see much measurable change in execution time.

Without change to the default params, I see the switch shortcut having a
performance impact on both png and jpeg, compress and decompress mode.

I think that's enough to remove the "benchmarketing" label from the switch
shortcut transform.

Sebastian


* Re: [Patch] Switch elimination pass for PR 54742
  2014-08-20 20:29   ` Sebastian Pop
@ 2014-08-21  8:53     ` Richard Biener
  2014-08-22 20:13       ` Sebastian Pop
  0 siblings, 1 reply; 54+ messages in thread
From: Richard Biener @ 2014-08-21  8:53 UTC (permalink / raw)
  To: Sebastian Pop; +Cc: James Greenhalgh, Steve Ellcey, GCC Patches, Jeff Law

On Wed, Aug 20, 2014 at 10:29 PM, Sebastian Pop <sebpop@gmail.com> wrote:
> James Greenhalgh wrote:
>> On Tue, Aug 19, 2014 at 09:39:56PM +0100, Steve Ellcey wrote:
>> > Here is an official submission for the switch optimization described in
>> > PR 54742.  I have addressed the formatting/comment issues that were raised
>> > and also added a test case based on comment #27 from PR 54742 and I fixed a
>> > bug I found while doing benchmarking with SPEC2006 (the perl benchmark was
>> > generating an ICE in a routine with multiple switch statements).
>> >
>> > I ran the benchmarking to see if I could find any more tests that are
>> > helped like coremark is and while I found a number of benchmarks in
>> > SPEC 2006 and EEMBC where the optimization is triggered, this optimization
>> > generally didn't affect the performance of those benchmarks.  The biggest
>> > impact I could find was on the perl benchmark in SPEC where I saw around
>> > a 0.4% improvement on a MIPS 74k.  Not huge, but not nothing.
>>
>> For what it is worth, I see a nice (~4%) improvement in Crafty from
>> SPEC 2000. I haven't investigated too deeply, but at a first glance the
>> number of branch mispredictions has dropped just over 1%, as you
>> might hope from this optimisation.
>>
>> I can also attest to there being a number of places the optimisation is
>> triggered (with high enough parameters; I was running with
>> --param max-switch-paths=1000 --param max-switch-insns=10000), but like
>> you I don't see much measurable change in execution time.
>
> Without change to the default params, I see the switch shortcut having a
> performance impact on both png and jpeg, compress and decompress mode.
>
> I think that's enough to remove the "benchmarketing" label from the switch
> shortcut transform.

Did you look at the actual code transformation the pass does to these?
(what is 'png' and 'jpeg'?)  What's the code size impact?

Richard.

> Sebastian


* Re: [Patch] Switch elimination pass for PR 54742
  2014-08-19 20:40 [Patch] Switch elimination pass for PR 54742 Steve Ellcey
  2014-08-20 17:04 ` James Greenhalgh
@ 2014-08-21  8:58 ` Richard Biener
  2014-08-21  9:41   ` James Greenhalgh
  1 sibling, 1 reply; 54+ messages in thread
From: Richard Biener @ 2014-08-21  8:58 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: GCC Patches, Jeff Law, Sebastian Pop

On Tue, Aug 19, 2014 at 10:39 PM, Steve Ellcey <sellcey@mips.com> wrote:
> Here is an official submission for the switch optimization described in
> PR 54742.  I have addressed the formatting/comment issues that were raised
> and also added a test case based on comment #27 from PR 54742 and I fixed a
> bug I found while doing benchmarking with SPEC2006 (the perl benchmark was
> generating an ICE in a routine with multiple switch statements).
>
> I ran the benchmarking to see if I could find any more tests that are
> helped like coremark is and while I found a number of benchmarks in
> SPEC 2006 and EEMBC where the optimization is triggered, this optimization
> generally didn't affect the performance of those benchmarks.  The biggest
> impact I could find was on the perl benchmark in SPEC where I saw around
> a 0.4% improvement on a MIPS 74k.  Not huge, but not nothing.
>
> So, OK to checkin?

Without looking at the patch in detail what is the rationale for the
pass placement (looks quite early)?  I would have guessed that
the pass could benefit from value-range analysis.

Jeff, Steve is it possible to trigger the transform by simply
"manually forcing" the right "path" jump-threads from
inside VRP?  That is, basically integrate the transform part
with the existing jump threading framework but do an
alternate discovery pass?

Thanks,
Richard.

> Steve Ellcey
> sellcey@mips.com
>
>
> 2014-08-12  Steve Ellcey  <sellcey@mips.com>
>
>         PR tree-opt/54742
>         * Makefile.in (OBJS): Add tree-switch-shortcut.o.
>         * common.opt (ftree-switch-shortcut): New.
>         * opts.c (default_options_table): Add OPT_ftree_switch_shortcut.
>         * params.def (PARAM_MAX_SWITCH_INSNS): New.
>         (PARAM_MAX_SWITCH_PATHS): New.
>         * passes.def (pass_tree_switch_shortcut): New.
>         * timevar.def (TV_TREE_SWITCH_SHORTCUT): New.
>         * tree-pass.h (make_pass_tree_switch_shortcut): New.
>         * tree-switch-shortcut.c: New.
>
>
> 2014-08-12  Steve Ellcey  <sellcey@mips.com>
>
>         PR tree-opt/54742
>         * gcc.dg/pr54742.c: New test.
>


* Re: [Patch] Switch elimination pass for PR 54742
  2014-08-21  8:58 ` Richard Biener
@ 2014-08-21  9:41   ` James Greenhalgh
  2014-08-21 10:30     ` Richard Biener
  0 siblings, 1 reply; 54+ messages in thread
From: James Greenhalgh @ 2014-08-21  9:41 UTC (permalink / raw)
  To: Richard Biener; +Cc: Steve Ellcey, GCC Patches, Jeff Law, Sebastian Pop

On Thu, Aug 21, 2014 at 09:57:56AM +0100, Richard Biener wrote:
> On Tue, Aug 19, 2014 at 10:39 PM, Steve Ellcey <sellcey@mips.com> wrote:
> > Here is an official submission for the switch optimization described in
> > PR 54742.  I have addressed the formatting/comment issues that were raised
> > and also added a test case based on comment #27 from PR 54742 and I fixed a
> > bug I found while doing benchmarking with SPEC2006 (the perl benchmark was
> > generating an ICE in a routine with multiple switch statements).
> >
> > I ran the benchmarking to see if I could find any more tests that are
> > helped like coremark is and while I found a number of benchmarks in
> > SPEC 2006 and EEMBC where the optimization is triggered, this optimization
> > generally didn't affect the performance of those benchmarks.  The biggest
> > impact I could find was on the perl benchmark in SPEC where I saw around
> > a 0.4% improvement on a MIPS 74k.  Not huge, but not nothing.
> >
> > So, OK to checkin?
> 
> Without looking at the patch in detail what is the rationale for the
> pass placement (looks quite early)?  I would have guessed that
> the pass could benefit from value-range analysis.
> 
> Jeff, Steve is it possible to trigger the transform by simply
> "manually forcing" the right "path" jump-threads from
> inside VRP?  That is, basically integrate the transform part
> with the existing jump threading framework but do an
> alternate discovery pass?

This seems like what I tried to do last year with:

  https://gcc.gnu.org/ml/gcc-patches/2013-06/msg01121.html

It turns Jeff's jump-threading code in to a strange franken-pass of bits and
pieces of detection and optimisation, and would need some substantial
reworking to fit in with Jeff's changes last Autumn, but if it is more
likely to be acceptable for trunk then perhaps we could look to revive it.
It would be nice to reuse the path copy code Jeff added last year, but I
don't have much intuition as to how feasible that is.

Was this the sort of thing that you were imagining? 

Steve, Jeff?

James

> 
> Thanks,
> Richard.
> 
> > Steve Ellcey
> > sellcey@mips.com
> >
> >
> > 2014-08-12  Steve Ellcey  <sellcey@mips.com>
> >
> >         PR tree-opt/54742
> >         * Makefile.in (OBJS): Add tree-switch-shortcut.o.
> >         * common.opt (ftree-switch-shortcut): New.
> >         * opts.c (default_options_table): Add OPT_ftree_switch_shortcut.
> >         * params.def (PARAM_MAX_SWITCH_INSNS): New.
> >         (PARAM_MAX_SWITCH_PATHS): New.
> >         * passes.def (pass_tree_switch_shortcut): New.
> >         * timevar.def (TV_TREE_SWITCH_SHORTCUT): New.
> >         * tree-pass.h (make_pass_tree_switch_shortcut): New.
> >         * tree-switch-shortcut.c: New.
> >
> >
> > 2014-08-12  Steve Ellcey  <sellcey@mips.com>
> >
> >         PR tree-opt/54742
> >         * gcc.dg/pr54742.c: New test.
> >
> 


* Re: [Patch] Switch elimination pass for PR 54742
  2014-08-21  9:41   ` James Greenhalgh
@ 2014-08-21 10:30     ` Richard Biener
  2014-08-25 17:35       ` Jeff Law
  0 siblings, 1 reply; 54+ messages in thread
From: Richard Biener @ 2014-08-21 10:30 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: Steve Ellcey, GCC Patches, Jeff Law, Sebastian Pop

On Thu, Aug 21, 2014 at 11:41 AM, James Greenhalgh
<james.greenhalgh@arm.com> wrote:
> On Thu, Aug 21, 2014 at 09:57:56AM +0100, Richard Biener wrote:
>> On Tue, Aug 19, 2014 at 10:39 PM, Steve Ellcey <sellcey@mips.com> wrote:
>> > Here is an official submission for the switch optimization described in
>> > PR 54742.  I have addressed the formatting/comment issues that were raised
>> > and also added a test case based on comment #27 from PR 54742 and I fixed a
>> > bug I found while doing benchmarking with SPEC2006 (the perl benchmark was
>> > generating an ICE in a routine with multiple switch statements).
>> >
>> > I ran the benchmarking to see if I could find any more tests that are
>> > helped like coremark is and while I found a number of benchmarks in
>> > SPEC 2006 and EEMBC where the optimization is triggered, this optimization
>> > generally didn't affect the performance of those benchmarks.  The biggest
>> > impact I could find was on the perl benchmark in SPEC where I saw around
>> > a 0.4% improvement on a MIPS 74k.  Not huge, but not nothing.
>> >
>> > So, OK to checkin?
>>
>> Without looking at the patch in detail what is the rationale for the
>> pass placement (looks quite early)?  I would have guessed that
>> the pass could benefit from value-range analysis.
>>
>> Jeff, Steve is it possible to trigger the transform by simply
>> "manually forcing" the right "path" jump-threads from
>> inside VRP?  That is, basically integrate the transform part
>> with the existing jump threading framework but do an
>> alternate discovery pass?
>
> This seems like what I tried to do last year with:
>
>   https://gcc.gnu.org/ml/gcc-patches/2013-06/msg01121.html
>
> It turns Jeff's jump-threading code in to a strange franken-pass of bits and
> pieces of detection and optimisation, and would need some substantial
> reworking to fit in with Jeff's changes last Autumn, but if it is more
> likely to be acceptable for trunk then perhaps we could look to revive it.
> It would be nice to reuse the path copy code Jeff added last year, but I
> don't have much intuition as to how feasible that is.
>
> Was this the sort of thing that you were imagining?

Yeah, didn't look too closely though.

Richard.

> Steve, Jeff?
>
> James
>
>>
>> Thanks,
>> Richard.
>>
>> > Steve Ellcey
>> > sellcey@mips.com
>> >
>> >
>> > 2014-08-12  Steve Ellcey  <sellcey@mips.com>
>> >
>> >         PR tree-opt/54742
>> >         * Makefile.in (OBJS): Add tree-switch-shortcut.o.
>> >         * common.opt (ftree-switch-shortcut): New.
>> >         * opts.c (default_options_table): Add OPT_ftree_switch_shortcut.
>> >         * params.def (PARAM_MAX_SWITCH_INSNS): New.
>> >         (PARAM_MAX_SWITCH_PATHS): New.
>> >         * passes.def (pass_tree_switch_shortcut): New.
>> >         * timevar.def (TV_TREE_SWITCH_SHORTCUT): New.
>> >         * tree-pass.h (make_pass_tree_switch_shortcut): New.
>> >         * tree-switch-shortcut.c: New.
>> >
>> >
>> > 2014-08-12  Steve Ellcey  <sellcey@mips.com>
>> >
>> >         PR tree-opt/54742
>> >         * gcc.dg/pr54742.c: New test.
>> >
>>


* Re: [Patch] Switch elimination pass for PR 54742
  2014-08-21  8:53     ` Richard Biener
@ 2014-08-22 20:13       ` Sebastian Pop
  0 siblings, 0 replies; 54+ messages in thread
From: Sebastian Pop @ 2014-08-22 20:13 UTC (permalink / raw)
  To: Richard Biener; +Cc: James Greenhalgh, Steve Ellcey, GCC Patches, Jeff Law

Richard Biener wrote:
> On Wed, Aug 20, 2014 at 10:29 PM, Sebastian Pop <sebpop@gmail.com> wrote:
> > James Greenhalgh wrote:
> >> On Tue, Aug 19, 2014 at 09:39:56PM +0100, Steve Ellcey wrote:
> >> > Here is an official submission for the switch optimization described in
> >> > PR 54742.  I have addressed the formatting/comment issues that were raised
> >> > and also added a test case based on comment #27 from PR 54742 and I fixed a
> >> > bug I found while doing benchmarking with SPEC2006 (the perl benchmark was
> >> > generating an ICE in a routine with multiple switch statements).
> >> >
> >> > I ran the benchmarking to see if I could find any more tests that are
> >> > helped like coremark is and while I found a number of benchmarks in
> >> > SPEC 2006 and EEMBC where the optimization is triggered, this optimization
> >> > generally didn't affect the performance of those benchmarks.  The biggest
> >> > impact I could find was on the perl benchmark in SPEC where I saw around
> >> > a 0.4% improvement on a MIPS 74k.  Not huge, but not nothing.
> >>
> >> For what it is worth, I see a nice (~4%) improvement in Crafty from
> >> SPEC 2000. I haven't investigated too deeply, but at a first glance the
> >> number of branch mispredictions has dropped just over 1%, as you
> >> might hope from this optimisation.
> >>
> >> I can also attest to there being a number of places the optimisation is
> >> triggered (with high enough parameters; I was running with
> >> --param max-switch-paths=1000 --param max-switch-insns=10000), but like
> >> you I don't see much measurable change in execution time.
> >
> > Without change to the default params, I see the switch shortcut having a
> > performance impact on both png and jpeg, compress and decompress mode.
> >
> > I think that's enough to remove the "benchmarketing" label from the switch
> > shortcut transform.
> 
> Did you look at the actual code transformation the pass does to these?
> (what is 'png' and 'jpeg'?)  What's the code size impact?

google("jddctmgr.c") the start_pass function contains a for loop with a switch
stmt that is optimized by Steve's pass.

The png one occurs in google("pngread.c png_image_read_colormap")

There is not much code duplicated in both cases.

Sebastian


* Re: [Patch] Switch elimination pass for PR 54742
  2014-08-21 10:30     ` Richard Biener
@ 2014-08-25 17:35       ` Jeff Law
  2014-09-26 20:14         ` Sebastian Pop
  0 siblings, 1 reply; 54+ messages in thread
From: Jeff Law @ 2014-08-25 17:35 UTC (permalink / raw)
  To: Richard Biener, James Greenhalgh; +Cc: Steve Ellcey, GCC Patches, Sebastian Pop

On 08/21/14 04:30, Richard Biener wrote:
>> It turns Jeff's jump-threading code in to a strange franken-pass of bits and
>> pieces of detection and optimisation, and would need some substantial
>> reworking to fit in with Jeff's changes last Autumn, but if it is more
>> likely to be acceptable for trunk then perhaps we could look to revive it.
>> It would be nice to reuse the path copy code Jeff added last year, but I
>> don't have much intuition as to how feasible that is.
>>
>> Was this the sort of thing that you were imagining?
>
> Yeah, didn't look too closely though.
It'd be pretty ugly I suspect.  But it's probably worth pondering since 
that approach would eliminate the concerns about the cost of detection 
(which is problematical for the jump threader) by using Steve's code for 
that.

On the update side, I suspect most, if not all of the framework is in 
place to handle this kind of update if the right threading paths were 
passed to the updater.  I can probably cobble together that by-hand and 
see what the tree-ssa-threadupdate does with it.  But it'll be a week or 
so before I could look at it.

jeff


* Re: [Patch] Switch elimination pass for PR 54742
  2014-08-25 17:35       ` Jeff Law
@ 2014-09-26 20:14         ` Sebastian Pop
  2014-10-26 21:34           ` [Patch] Improving jump-thread " Sebastian Pop
  0 siblings, 1 reply; 54+ messages in thread
From: Sebastian Pop @ 2014-09-26 20:14 UTC (permalink / raw)
  To: Jeff Law; +Cc: Richard Biener, James Greenhalgh, Steve Ellcey, GCC Patches

[-- Attachment #1: Type: text/plain, Size: 1640 bytes --]

Jeff Law wrote:
> On 08/21/14 04:30, Richard Biener wrote:
> >>It turns Jeff's jump-threading code in to a strange franken-pass of bits and
> >>pieces of detection and optimisation, and would need some substantial
> >>reworking to fit in with Jeff's changes last Autumn, but if it is more
> >>likely to be acceptable for trunk then perhaps we could look to revive it.
> >>It would be nice to reuse the path copy code Jeff added last year, but I
> >>don't have much intuition as to how feasible that is.
> >>
> >>Was this the sort of thing that you were imagining?
> >
> >Yeah, didn't look too closely though.
> It'd be pretty ugly I suspect.  But it's probably worth pondering
> since that approach would eliminate the concerns about the cost of
> detection (which is problematical for the jump threader) by using
> Steve's code for that.
> 
> On the update side, I suspect most, if not all of the framework is
> in place to handle this kind of update if the right threading paths
> were passed to the updater.  I can probably cobble together that
> by-hand and see what the tree-ssa-threadupdate does with it.  But
> it'll be a week or so before I could look at it.

I adapted the patch James has sent last year to use the new update paths
mechanism.  I verified that the attached patch does register all the paths that
need to be threaded.  Thread updater seems to have some problems handling the
attached testcase (a simplified version of the testcase attached to the bug.)

Jeff, could you please have a look at why the jump-thread updater is crashing?

Let me know if you want me to continue looking at the problem.

Thanks,
Sebastian

[-- Attachment #2: 0001-jump-thread-for-PR-54742.patch --]
[-- Type: text/x-diff, Size: 8415 bytes --]

From 1f09b819559865be5a366e11a9c0f9bf495f91bc Mon Sep 17 00:00:00 2001
From: Sebastian Pop <s.pop@samsung.com>
Date: Fri, 26 Sep 2014 14:54:20 -0500
Subject: [PATCH] jump thread for PR 54742

Adapted from a patch from James Greenhalgh.

	* Makefile.in: Add dependence on pointer-set.o.

	* tree-ssa-threadedge.c: Include pointer-set.h.
	(simplify_control_stmt_condition): Restore the original value of cond
	when simplification fails.
	(find_thread_path): New.
	(find_control_statement_thread_paths): New.
	(thread_through_normal_block): Call find_control_statement_thread_paths.

	* testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c: New.
---
 gcc/Makefile.in                                  |   1 +
 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c |  32 +++++
 gcc/tree-ssa-threadedge.c                        | 170 ++++++++++++++++++++++-
 3 files changed, 202 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 6f251a5..ebaed55 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1310,6 +1310,7 @@ OBJS = \
 	opts-global.o \
 	passes.o \
 	plugin.o \
+	pointer-set.o \
 	postreload-gcse.o \
 	postreload.o \
 	predict.o \
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
new file mode 100644
index 0000000..f3ef725
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
@@ -0,0 +1,32 @@
+int sum0, sum1, sum2, sum3;
+int foo(char * s, char** ret)
+{
+  int state=0;
+  char c;
+
+  for (; *s && state != 4; s++)
+    {
+      c = *s;
+      if (c == '*')
+	{
+	  s++;
+	  break;
+	}
+      switch (state) {
+	case 0:
+	  if (c == '+') state = 1;
+	  else if (c != '-') sum0+=c;
+	  break;
+	case 1:
+	  if (c == '+') state = 2;
+	  else if (c == '-') state = 0;
+	  else sum1+=c;
+	  break;
+	default:
+	  break;
+      }
+
+    }
+  *ret = s;
+  return state;
+}
diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c
index 3dee5ba..ee09841 100644
--- a/gcc/tree-ssa-threadedge.c
+++ b/gcc/tree-ssa-threadedge.c
@@ -49,6 +49,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "params.h"
 #include "tree-ssa-threadedge.h"
 #include "builtins.h"
+#include "pointer-set.h"
 
 /* To avoid code explosion due to jump threading, we limit the
    number of statements we are going to copy.  This variable
@@ -628,6 +629,7 @@ simplify_control_stmt_condition (edge e,
      rather than use a relational operator.  These are simpler to handle.  */
   if (TREE_CODE (cond) == SSA_NAME)
     {
+      tree original_lhs = cond;
       cached_lhs = cond;
 
       /* Get the variable's current value from the equivalence chains.
@@ -656,6 +658,12 @@ simplify_control_stmt_condition (edge e,
 	 pass specific callback to try and simplify it further.  */
       if (cached_lhs && ! is_gimple_min_invariant (cached_lhs))
         cached_lhs = (*simplify) (stmt, stmt);
+
+      /* We couldn't find an invariant.  But, callers of this
+	 function may be able to do something useful with the
+	 unmodified destination.  */
+      if (!cached_lhs)
+	cached_lhs = original_lhs;
     }
   else
     cached_lhs = NULL;
@@ -915,6 +923,145 @@ thread_around_empty_blocks (edge taken_edge,
   return false;
 }
 
+/* Return true if there is at least one path from START_BB to END_BB.
+   VISITED_BBS is used to make sure we don't fall into an infinite loop.  */
+
+static bool
+find_thread_path (basic_block start_bb, basic_block end_bb,
+		    vec<basic_block, va_gc> *&path,
+		    struct pointer_set_t *visited_bbs)
+{
+  if (start_bb == end_bb)
+    {
+      vec_safe_push (path, start_bb);
+      return true;
+    }
+
+  if (!pointer_set_insert (visited_bbs, start_bb))
+    {
+      edge e;
+      edge_iterator ei;
+      FOR_EACH_EDGE (e, ei, start_bb->succs)
+	if (find_thread_path (e->dest, end_bb, path, visited_bbs))
+	  {
+	    vec_safe_push (path, start_bb);
+	    return true;
+	  }
+    }
+
+  return false;
+}
+
+/* CTRL_STMT is a COND_EXPR or SWITCH_EXPR statement whose controlling
+   expression is the variable EXPR.  We trace the value of the variable back
+   through any phi nodes looking for places where it gets a constant
+   value and save the path.  */
+
+static void
+find_control_statement_thread_paths (tree expr,
+				     struct pointer_set_t *visited_phis,
+				     vec<basic_block, va_gc> *&path)
+{
+  tree var = SSA_NAME_VAR (expr);
+  gimple def_stmt = SSA_NAME_DEF_STMT (expr);
+  basic_block var_bb = gimple_bb (def_stmt);
+
+  if (var == NULL || var_bb == NULL)
+    return;
+
+  vec<basic_block, va_gc> *next_path;
+  vec_alloc (next_path, n_basic_blocks_for_fn (cfun));
+
+  basic_block last_bb_in_path = path->last ();
+
+  /* Put the path from var_bb to last_bb_in_path into next_path.  */
+  if (var_bb != last_bb_in_path)
+    {
+      edge e;
+      int e_count = 0;
+      edge_iterator ei;
+
+      FOR_EACH_EDGE (e, ei, last_bb_in_path->preds)
+	{
+	  struct pointer_set_t *visited_bbs = pointer_set_create ();
+
+	  if (find_thread_path (var_bb, e->src, next_path, visited_bbs))
+	    e_count = e_count + 1;
+
+	  pointer_set_destroy (visited_bbs);
+
+	  /* If there is more than one path, stop.  */
+	  if (e_count > 1)
+	    {
+	      vec_free (next_path);
+	      return;
+	    }
+	}
+    }
+
+  // Visit PHI nodes once.
+  if (gimple_code (def_stmt) != GIMPLE_PHI
+      || pointer_set_insert (visited_phis, def_stmt)) {
+    vec_free (next_path);
+    return;
+  }
+
+  // Append all the nodes from next_path to path.
+  vec_safe_splice (path, next_path);
+  gcc_assert (path->last () == var_bb);
+
+  // Iterate over the arguments of PHI.
+  unsigned int i;
+  for (i = 0; i < gimple_phi_num_args (def_stmt); i++)
+    {
+      tree arg = gimple_phi_arg_def (def_stmt, i);
+      basic_block bbi = gimple_phi_arg_edge (def_stmt, i)->src;
+
+      // Skip edges pointing outside the current loop.
+      if (!arg || var_bb->loop_father != bbi->loop_father)
+	continue;
+
+      // Add BBI to the path.
+      vec_safe_push (path, bbi);
+
+      if (TREE_CODE (arg) == INTEGER_CST)
+	{
+	  int j, n = path->length ();
+	  vec<jump_thread_edge *> *jump_thread_path
+	    = new vec<jump_thread_edge *> ();
+
+	  for (j = 0; j < n-1; j++)
+	    {
+	      edge e = find_edge ((*path)[n - j - 1],
+				  (*path)[n - j - 2]);
+	      gcc_assert (e);
+	      enum jump_thread_edge_type kind;
+
+	      if (j == 0)
+		kind = EDGE_START_JUMP_THREAD;
+	      else if (single_pred_p (e->dest))
+		kind = EDGE_COPY_SRC_BLOCK;
+	      else
+		kind = EDGE_COPY_SRC_JOINER_BLOCK;
+
+	      jump_thread_edge *x = new jump_thread_edge (e, kind);
+	      jump_thread_path->safe_push (x);
+	    }
+
+	  register_jump_thread (jump_thread_path);
+	}
+      else if (TREE_CODE (arg) == SSA_NAME)
+	find_control_statement_thread_paths (arg, visited_phis, path);
+
+      /* Remove BBI from the path.  */
+      path->pop ();
+    }
+
+  /* Remove all the nodes that we added from next_path.  */
+  vec_safe_truncate (path, (path->length () - next_path->length ()));
+  vec_free (next_path);
+}
+
 /* We are exiting E->src, see if E->dest ends with a conditional
    jump which has a known value when reached via E.
 
@@ -1000,7 +1147,10 @@ thread_through_normal_block (edge e,
       cond = simplify_control_stmt_condition (e, stmt, dummy_cond, simplify,
 					      handle_dominating_asserts);
 
-      if (cond && is_gimple_min_invariant (cond))
+      if (!cond)
+	return 0;
+
+      if (is_gimple_min_invariant (cond))
 	{
 	  edge taken_edge = find_taken_edge (e->dest, cond);
 	  basic_block dest = (taken_edge ? taken_edge->dest : NULL);
@@ -1046,6 +1196,24 @@ thread_through_normal_block (edge e,
 				      backedge_seen_p);
 	  return 1;
 	}
+
+      if (TREE_CODE (cond) != SSA_NAME)
+	return 0;
+
+      /* When COND cannot be simplified, try to find paths from a control
+	 statement back through the PHI nodes which would affect that control
+	 statement.  */
+      vec<basic_block, va_gc> *bb_path;
+      vec_alloc (bb_path, n_basic_blocks_for_fn (cfun));
+      vec_safe_push (bb_path, e->dest);
+      struct pointer_set_t *visited_phis = pointer_set_create ();
+
+      find_control_statement_thread_paths (cond, visited_phis, bb_path);
+
+      pointer_set_destroy (visited_phis);
+      vec_free (bb_path);
+
+      return -1;
     }
   return 0;
 }
-- 
2.1.0.243.g30d45f7


^ permalink raw reply	[flat|nested] 54+ messages in thread

* [Patch] Improving jump-thread pass for PR 54742
  2014-09-26 20:14         ` Sebastian Pop
@ 2014-10-26 21:34           ` Sebastian Pop
  2014-11-11  1:40             ` Sebastian Pop
  2014-11-19 22:35             ` Jeff Law
  0 siblings, 2 replies; 54+ messages in thread
From: Sebastian Pop @ 2014-10-26 21:34 UTC (permalink / raw)
  To: Jeff Law; +Cc: Richard Biener, James Greenhalgh, Steve Ellcey, GCC Patches

[-- Attachment #1: Type: text/plain, Size: 11129 bytes --]

Sebastian Pop wrote:
> Jeff Law wrote:
> > On 08/21/14 04:30, Richard Biener wrote:
> > >>It turns Jeff's jump-threading code in to a strange franken-pass of bits and
> > >>pieces of detection and optimisation, and would need some substantial
> > >>reworking to fit in with Jeff's changes last Autumn, but if it is more
> > >>likely to be acceptable for trunk then perhaps we could look to revive it.
> > >>It would be nice to reuse the path copy code Jeff added last year, but I
> > >>don't have much intuition as to how feasible that is.
> > >>
> > >>Was this the sort of thing that you were imagining?
> > >
> > >Yeah, didn't look too closely though.
> > It'd be pretty ugly I suspect.  But it's probably worth pondering
> > since that approach would eliminate the concerns about the cost of
> > detection (which is problematical for the jump threader) by using
> > Steve's code for that.
> > 
> > On the update side, I suspect most, if not all of the framework is
> > in place to handle this kind of update if the right threading paths
> > were passed to the updater.  I can probably cobble together that
> > by-hand and see what the tree-ssa-threadupdate does with it.  But
> > it'll be a week or so before I could look at it.
> 
> I adapted the patch James has sent last year to use the new update paths

Attached an updated version of the patch.

> mechanism.  I verified that the attached patch does register all the paths that
> need to be threaded.  Thread updater seems to have some problems handling the
> attached testcase (a simplified version of the testcase attached to the bug.)
> 
> Jeff, could you please have a look at why the jump-thread updater is crashing?

I have tried to understand why the code generation part ICEs on coremark: the
first problem I have seen is that tree-ssa-threadupdate.c does not handle
more than one joiner block per path to be threaded, so we would not be able to
jump thread across both the joiner of the if condition and the joiner of the
switch condition, i.e., these paths:

patch:   Registering jump thread: (7, 10) incoming edge;  (10, 25) joiner;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 14) nocopy;
patch:   Registering jump thread: (28, 10) incoming edge;  (10, 25) joiner;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 11) nocopy;
patch:   Registering jump thread: (8, 10) incoming edge;  (10, 25) joiner;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 17) nocopy;
patch:   Registering jump thread: (9, 10) incoming edge;  (10, 25) joiner;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;

Another problem is that we attach the path to be threaded to the ->aux field of
the first edge in the path, so when several paths start with the same edge we
have to cancel all but one of them and cannot keep track of all the paths to be
threaded.

For coremark, we would discover some jump-thread paths from one of the switch
cases over the loop exit condition, either to bb_27 outside the loop, or to bb_4
staying inside the loop.  Then, with the "patch:" change, we would discover jump
threads that thread switch cases directly to switch cases; because these paths
start with the same edges for which we have already assigned a path to e->aux,
we would have to cancel the interesting threads added by the patch:

  Registering jump thread: (12, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy;
  Registering jump thread: (13, 25) incoming edge;  (25, 26) joiner;  (26, 27) nocopy;
  Registering jump thread: (29, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy;
  Registering jump thread: (31, 25) incoming edge;  (25, 26) joiner;  (26, 27) nocopy;
  Registering jump thread: (16, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy;
  Registering jump thread: (15, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy;
  Registering jump thread: (32, 25) incoming edge;  (25, 26) joiner;  (26, 27) nocopy;
  Registering jump thread: (19, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy;
  Registering jump thread: (18, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy;
  Registering jump thread: (22, 25) incoming edge;  (25, 26) joiner;  (26, 27) nocopy;
  Registering jump thread: (21, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy;
  Registering jump thread: (34, 25) incoming edge;  (25, 26) joiner;  (26, 27) nocopy;
  Registering jump thread: (33, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy;
  Registering jump thread: (35, 25) incoming edge;  (25, 26) joiner;  (26, 27) nocopy;
  Registering jump thread: (24, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy;
patch:   Registering jump thread: (12, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 17) nocopy;
patch:   Registering jump thread: (16, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 14) nocopy;
patch:   Registering jump thread: (19, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 17) nocopy;
patch:   Registering jump thread: (22, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
patch:   Registering jump thread: (34, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
patch:   Registering jump thread: (35, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
patch:   Registering jump thread: (29, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 14) nocopy;
patch:   Registering jump thread: (13, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
patch:   Registering jump thread: (15, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 17) nocopy;
patch:   Registering jump thread: (31, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
patch:   Registering jump thread: (18, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 20) nocopy;
patch:   Registering jump thread: (32, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
patch:   Registering jump thread: (21, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 23) nocopy;
patch:   Registering jump thread: (33, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 24) nocopy;
patch:   Registering jump thread: (24, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 24) nocopy;
  Registering jump thread: (6, 36) incoming edge;  (36, 7) normal;
  Cancelling jump thread: (12, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 17) nocopy;
  Cancelling jump thread: (16, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 14) nocopy;
  Cancelling jump thread: (19, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 17) nocopy;
  Cancelling jump thread: (22, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
  Cancelling jump thread: (34, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
  Cancelling jump thread: (35, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
  Cancelling jump thread: (29, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 14) nocopy;
  Cancelling jump thread: (13, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
  Cancelling jump thread: (15, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 17) nocopy;
  Cancelling jump thread: (31, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
  Cancelling jump thread: (18, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 20) nocopy;
  Cancelling jump thread: (32, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
  Cancelling jump thread: (21, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 23) nocopy;
  Cancelling jump thread: (33, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 24) nocopy;
  Cancelling jump thread: (24, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 24) nocopy;

Here is the structure of the CFG with the loops:

(gdb) p debug_loops (2)
loop_0 (header = 0, latch = 1, niter = )
{
  bb_2 (preds = {bb_0 }, succs = {bb_3 bb_27 })
  bb_3 (preds = {bb_2 }, succs = {bb_5 bb_6 })
  bb_5 (preds = {bb_4 bb_3 }, succs = {bb_27 })
  bb_6 (preds = {bb_3 }, succs = {bb_36 })
  bb_27 (preds = {bb_5 bb_25 bb_26 bb_2 }, succs = {bb_1 })
  loop_1 (header = 36, latch = 37, niter = )
  {
    bb_4 (preds = {bb_26 }, succs = {bb_5 bb_37 })
    bb_37 (preds = {bb_4 }, succs = {bb_36 })
    bb_36 (preds = {bb_6 bb_37 }, succs = {bb_25 bb_7 bb_11 bb_20 bb_14 bb_17 bb_23 bb_24 })
    bb_7 (preds = {bb_36 }, succs = {bb_10 bb_28 })
    bb_8 (preds = {bb_28 }, succs = {bb_10 bb_9 })
    bb_9 (preds = {bb_8 }, succs = {bb_10 })
    bb_10 (preds = {bb_7 bb_28 bb_8 bb_9 }, succs = {bb_25 })
    bb_11 (preds = {bb_36 }, succs = {bb_29 bb_30 })
    bb_12 (preds = {bb_30 }, succs = {bb_25 })
    bb_13 (preds = {bb_30 }, succs = {bb_25 })
    bb_14 (preds = {bb_36 }, succs = {bb_15 bb_16 })
    bb_15 (preds = {bb_14 }, succs = {bb_25 })
    bb_16 (preds = {bb_14 }, succs = {bb_25 bb_31 })
    bb_17 (preds = {bb_36 }, succs = {bb_18 bb_19 })
    bb_18 (preds = {bb_17 }, succs = {bb_25 })
    bb_19 (preds = {bb_17 }, succs = {bb_25 bb_32 })
    bb_20 (preds = {bb_36 }, succs = {bb_21 bb_22 })
    bb_21 (preds = {bb_20 }, succs = {bb_25 })
    bb_22 (preds = {bb_20 }, succs = {bb_25 })
    bb_23 (preds = {bb_36 }, succs = {bb_33 bb_34 })
    bb_24 (preds = {bb_36 }, succs = {bb_25 bb_35 })
    bb_25 (preds = {bb_10 bb_12 bb_16 bb_19 bb_22 bb_34 bb_35 bb_36 bb_29 bb_13 bb_15 bb_31 bb_18 bb_32 bb_21 bb_33 bb_24 }, succs = {bb_26 bb_27 })
    bb_26 (preds = {bb_25 }, succs = {bb_4 bb_27 })
    bb_28 (preds = {bb_7 }, succs = {bb_10 bb_8 })
    bb_29 (preds = {bb_11 }, succs = {bb_25 })
    bb_30 (preds = {bb_11 }, succs = {bb_12 bb_13 })
    bb_31 (preds = {bb_16 }, succs = {bb_25 })
    bb_32 (preds = {bb_19 }, succs = {bb_25 })
    bb_33 (preds = {bb_23 }, succs = {bb_25 })
    bb_34 (preds = {bb_23 }, succs = {bb_25 })
    bb_35 (preds = {bb_24 }, succs = {bb_25 })
  }
}

What about removing the use of e->aux in threadupdate.c, to be able to jump
thread across all the recorded paths?

Thanks,
Sebastian

[-- Attachment #2: 0001-jump-thread-for-PR-54742.patch --]
[-- Type: text/x-diff, Size: 8144 bytes --]

From bac0f2a390048652910f77503b21b3e208daeae1 Mon Sep 17 00:00:00 2001
From: Sebastian Pop <s.pop@samsung.com>
Date: Fri, 26 Sep 2014 14:54:20 -0500
Subject: [PATCH] jump thread for PR 54742

Adapted from a patch from James Greenhalgh.

	* tree-ssa-threadedge.c (simplify_control_stmt_condition): Restore the
	original value of cond when simplification fails.
	(find_thread_path): New.
	(find_control_statement_thread_paths): New.
	(thread_through_normal_block): Call find_control_statement_thread_paths.

	* testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c: New.
---
 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c |  32 ++++
 gcc/tree-ssa-threadedge.c                        | 180 ++++++++++++++++++++++-
 gcc/tree-ssa-threadupdate.c                      |   4 +
 3 files changed, 215 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
new file mode 100644
index 0000000..f3ef725
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
@@ -0,0 +1,32 @@
+int sum0, sum1, sum2, sum3;
+int foo(char * s, char** ret)
+{
+  int state=0;
+  char c;
+
+  for (; *s && state != 4; s++)
+    {
+      c = *s;
+      if (c == '*')
+	{
+	  s++;
+	  break;
+	}
+      switch (state) {
+	case 0:
+	  if (c == '+') state = 1;
+	  else if (c != '-') sum0+=c;
+	  break;
+	case 1:
+	  if (c == '+') state = 2;
+	  else if (c == '-') state = 0;
+	  else sum1+=c;
+	  break;
+	default:
+	  break;
+      }
+
+    }
+  *ret = s;
+  return state;
+}
diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c
index 3dee5ba..7b9e5b6 100644
--- a/gcc/tree-ssa-threadedge.c
+++ b/gcc/tree-ssa-threadedge.c
@@ -628,6 +628,7 @@ simplify_control_stmt_condition (edge e,
      rather than use a relational operator.  These are simpler to handle.  */
   if (TREE_CODE (cond) == SSA_NAME)
     {
+      tree original_lhs = cond;
       cached_lhs = cond;
 
       /* Get the variable's current value from the equivalence chains.
@@ -656,6 +657,12 @@ simplify_control_stmt_condition (edge e,
 	 pass specific callback to try and simplify it further.  */
       if (cached_lhs && ! is_gimple_min_invariant (cached_lhs))
         cached_lhs = (*simplify) (stmt, stmt);
+
+      /* We couldn't find an invariant.  But, callers of this
+	 function may be able to do something useful with the
+	 unmodified destination.  */
+      if (!cached_lhs)
+	cached_lhs = original_lhs;
     }
   else
     cached_lhs = NULL;
@@ -915,6 +922,155 @@ thread_around_empty_blocks (edge taken_edge,
   return false;
 }
 
+/* Return true if there is at least one path from START_BB to END_BB.
+   VISITED_BBS is used to make sure we don't fall into an infinite loop.  */
+
+static bool
+find_thread_path (basic_block start_bb, basic_block end_bb,
+		    vec<basic_block, va_gc> *&path,
+		    hash_set<basic_block> *visited_bbs)
+{
+  if (start_bb == end_bb)
+    {
+      vec_safe_push (path, start_bb);
+      return true;
+    }
+
+  if (!visited_bbs->add(start_bb))
+    {
+      edge e;
+      edge_iterator ei;
+      FOR_EACH_EDGE (e, ei, start_bb->succs)
+	if (find_thread_path (e->dest, end_bb, path, visited_bbs))
+	  {
+	    vec_safe_push (path, start_bb);
+	    return true;
+	  }
+    }
+
+  return false;
+}
+
+/* We trace the value of the variable EXPR back through any phi nodes looking
+   for places where it gets a constant value and save the path.  */
+
+static void
+find_control_statement_thread_paths (tree expr,
+				     hash_set<gimple> *visited_phis,
+				     vec<basic_block, va_gc> *&path)
+{
+  tree var = SSA_NAME_VAR (expr);
+  gimple def_stmt = SSA_NAME_DEF_STMT (expr);
+  basic_block var_bb = gimple_bb (def_stmt);
+
+  if (var == NULL || var_bb == NULL)
+    return;
+
+  vec<basic_block, va_gc> *next_path;
+  vec_alloc (next_path, n_basic_blocks_for_fn (cfun));
+
+  basic_block last_bb_in_path = path->last ();
+
+  /* Put the path from var_bb to last_bb_in_path into next_path.  */
+  if (var_bb != last_bb_in_path)
+    {
+      edge e;
+      int e_count = 0;
+      edge_iterator ei;
+
+      FOR_EACH_EDGE (e, ei, last_bb_in_path->preds)
+	{
+	  hash_set<basic_block> *visited_bbs = new hash_set<basic_block>;
+
+	  if (find_thread_path (var_bb, e->src, next_path, visited_bbs))
+	    e_count = e_count + 1;
+
+	  delete visited_bbs;
+
+	  /* If there is more than one path, stop.  */
+	  if (e_count > 1)
+	    {
+	      vec_free (next_path);
+	      return;
+	    }
+	}
+    }
+
+  /* Visit PHI nodes once.  */
+  if (gimple_code (def_stmt) != GIMPLE_PHI
+      || visited_phis->add(def_stmt)) {
+    vec_free (next_path);
+    return;
+  }
+
+  /* Append all the nodes from next_path to path.  */
+  vec_safe_splice (path, next_path);
+  gcc_assert (path->last () == var_bb);
+
+  /* Iterate over the arguments of PHI.  */
+  unsigned int i;
+  for (i = 0; i < gimple_phi_num_args (def_stmt); i++)
+    {
+      tree arg = gimple_phi_arg_def (def_stmt, i);
+      basic_block bbi = gimple_phi_arg_edge (def_stmt, i)->src;
+
+      /* Skip edges pointing outside the current loop.  */
+      if (!arg || var_bb->loop_father != bbi->loop_father)
+	continue;
+
+      /* Add BBI to the path.  */
+      vec_safe_push (path, bbi);
+
+      if (TREE_CODE (arg) == INTEGER_CST)
+	{
+	  int j, n = path->length ();
+	  vec<jump_thread_edge *> *jump_thread_path
+	    = new vec<jump_thread_edge *> ();
+	  int joiners = 0;
+
+	  for (j = 0; j < n - 1; j++)
+	    {
+	      edge e = find_edge ((*path)[n - j - 1],
+				  (*path)[n - j - 2]);
+	      gcc_assert (e);
+	      enum jump_thread_edge_type kind;
+
+	      if (j == 0)
+		kind = EDGE_START_JUMP_THREAD;
+	      else if (single_pred_p (e->src))
+		kind = EDGE_NO_COPY_SRC_BLOCK;
+	      else {
+		kind = EDGE_COPY_SRC_JOINER_BLOCK;
+		++joiners;
+	      }
+
+	      jump_thread_edge *x = new jump_thread_edge (e, kind);
+	      jump_thread_path->safe_push (x);
+	    }
+
+	  /* Add the edge taken when the control variable has value ARG.  */
+	  edge taken_edge = find_taken_edge ((*path)[0], arg);
+	  jump_thread_edge *x
+	    = new jump_thread_edge (taken_edge, EDGE_NO_COPY_SRC_BLOCK);
+	  jump_thread_path->safe_push (x);
+
+	  /* Thread-update does not handle more than two joiners.  A path with
+	     less than 3 nodes should not be jump-threaded.  */
+	  if (joiners < 2 && n > 2)
+	    register_jump_thread (jump_thread_path);
+	}
+      else if (TREE_CODE (arg) == SSA_NAME)
+	find_control_statement_thread_paths (arg, visited_phis, path);
+
+      /* Remove BBI from the path.  */
+      path->pop ();
+    }
+
+  /* Remove all the nodes that we added from next_path.  */
+  vec_safe_truncate (path, (path->length () - next_path->length ()));
+  vec_free (next_path);
+}
+
 /* We are exiting E->src, see if E->dest ends with a conditional
    jump which has a known value when reached via E.
 
@@ -1000,7 +1156,10 @@ thread_through_normal_block (edge e,
       cond = simplify_control_stmt_condition (e, stmt, dummy_cond, simplify,
 					      handle_dominating_asserts);
 
-      if (cond && is_gimple_min_invariant (cond))
+      if (!cond)
+	return 0;
+
+      if (is_gimple_min_invariant (cond))
 	{
 	  edge taken_edge = find_taken_edge (e->dest, cond);
 	  basic_block dest = (taken_edge ? taken_edge->dest : NULL);
@@ -1046,7 +1205,25 @@ thread_through_normal_block (edge e,
 				      backedge_seen_p);
 	  return 1;
 	}
+
+      if (TREE_CODE (cond) != SSA_NAME
+	  || e->dest->loop_father != e->src->loop_father)
+	return 0;
+
+      /* When COND cannot be simplified, try to find paths from a control
+	 statement back through the PHI nodes which would affect that control
+	 statement.  */
+      vec<basic_block, va_gc> *bb_path;
+      vec_alloc (bb_path, n_basic_blocks_for_fn (cfun));
+      vec_safe_push (bb_path, e->dest);
+      hash_set<gimple> *visited_phis = new hash_set<gimple>;
+
+      find_control_statement_thread_paths (cond, visited_phis, bb_path);
+
+      delete visited_phis;
+      vec_free (bb_path);
+
+      return -1;
     }
   return 0;
 }
-- 
2.1.0.243.g30d45f7


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-10-26 21:34           ` [Patch] Improving jump-thread " Sebastian Pop
@ 2014-11-11  1:40             ` Sebastian Pop
  2014-11-17  9:29               ` James Greenhalgh
  2014-11-17 12:47               ` Richard Biener
  2014-11-19 22:35             ` Jeff Law
  1 sibling, 2 replies; 54+ messages in thread
From: Sebastian Pop @ 2014-11-11  1:40 UTC (permalink / raw)
  To: Jeff Law; +Cc: Richard Biener, James Greenhalgh, Steve Ellcey, GCC Patches

[-- Attachment #1: Type: text/plain, Size: 20373 bytes --]

Hi Jeff,

I have adapted the code generation part of James' patch to current trunk; the
resulting patch gets the 30% speedup on coremark and passes GCC bootstrap.

Ok for trunk?

Thanks,
Sebastian

Sebastian Pop wrote:
> Sebastian Pop wrote:
> > Jeff Law wrote:
> > > On 08/21/14 04:30, Richard Biener wrote:
> > > >>It turns Jeff's jump-threading code in to a strange franken-pass of bits and
> > > >>pieces of detection and optimisation, and would need some substantial
> > > >>reworking to fit in with Jeff's changes last Autumn, but if it is more
> > > >>likely to be acceptable for trunk then perhaps we could look to revive it.
> > > >>It would be nice to reuse the path copy code Jeff added last year, but I
> > > >>don't have much intuition as to how feasible that is.
> > > >>
> > > >>Was this the sort of thing that you were imagining?
> > > >
> > > >Yeah, didn't look too closely though.
> > > It'd be pretty ugly I suspect.  But it's probably worth pondering
> > > since that approach would eliminate the concerns about the cost of
> > > detection (which is problematical for the jump threader) by using
> > > Steve's code for that.
> > > 
> > > On the update side, I suspect most, if not all of the framework is
> > > in place to handle this kind of update if the right threading paths
> > > were passed to the updater.  I can probably cobble together that
> > > by-hand and see what the tree-ssa-threadupdate does with it.  But
> > > it'll be a week or so before I could look at it.
> > 
> > I adapted the patch James has sent last year to use the new update paths
> 
> Attached an updated version of the patch.
> 
> > mechanism.  I verified that the attached patch does register all the paths that
> > need to be threaded.  Thread updater seems to have some problems handling the
> > attached testcase (a simplified version of the testcase attached to the bug.)
> > 
> > Jeff, could you please have a look at why the jump-thread updater is crashing?
> 
> I have tried to understand why the code generation part ICEs on coremark: the
> first problem that I have seen is that tree-ssa-threadupdate.c does not handle
> more than a joiner block per path to be threaded, so we would not be able to
> jump thread accross the joiners of the if condition and the joiner of the switch
> condition: i.e., these paths
> 
> patch:   Registering jump thread: (7, 10) incoming edge;  (10, 25) joiner;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 14) nocopy;
> patch:   Registering jump thread: (28, 10) incoming edge;  (10, 25) joiner;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 11) nocopy;
> patch:   Registering jump thread: (8, 10) incoming edge;  (10, 25) joiner;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 17) nocopy;
> patch:   Registering jump thread: (9, 10) incoming edge;  (10, 25) joiner;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
> 
> Another problem is that we attach the path to be threaded to the ->aux field of
> the first edge in the path, such that we would have to cancel some of the paths
> because we cannot keep track of all the paths to be threaded.
> 
> For coremark, we would discover some jump-thread paths from one of the switch
> cases over the loop exit condition, either to bb_27 outside the loop, or to bb_4
> staying inside the loop.  Then with the "patch:" we would discover jump threads
> that would thread switch cases to switch cases, and because these paths start
> with the same edges for which we have already assigned a path to e->aux, we
> would have to cancel the interesting threads added by the patch:
> 
>   Registering jump thread: (12, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy;
>   Registering jump thread: (13, 25) incoming edge;  (25, 26) joiner;  (26, 27) nocopy;
>   Registering jump thread: (29, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy;
>   Registering jump thread: (31, 25) incoming edge;  (25, 26) joiner;  (26, 27) nocopy;
>   Registering jump thread: (16, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy;
>   Registering jump thread: (15, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy;
>   Registering jump thread: (32, 25) incoming edge;  (25, 26) joiner;  (26, 27) nocopy;
>   Registering jump thread: (19, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy;
>   Registering jump thread: (18, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy;
>   Registering jump thread: (22, 25) incoming edge;  (25, 26) joiner;  (26, 27) nocopy;
>   Registering jump thread: (21, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy;
>   Registering jump thread: (34, 25) incoming edge;  (25, 26) joiner;  (26, 27) nocopy;
>   Registering jump thread: (33, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy;
>   Registering jump thread: (35, 25) incoming edge;  (25, 26) joiner;  (26, 27) nocopy;
>   Registering jump thread: (24, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy;
> patch:   Registering jump thread: (12, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 17) nocopy;
> patch:   Registering jump thread: (16, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 14) nocopy;
> patch:   Registering jump thread: (19, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 17) nocopy;
> patch:   Registering jump thread: (22, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
> patch:   Registering jump thread: (34, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
> patch:   Registering jump thread: (35, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
> patch:   Registering jump thread: (29, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 14) nocopy;
> patch:   Registering jump thread: (13, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
> patch:   Registering jump thread: (15, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 17) nocopy;
> patch:   Registering jump thread: (31, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
> patch:   Registering jump thread: (18, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 20) nocopy;
> patch:   Registering jump thread: (32, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
> patch:   Registering jump thread: (21, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 23) nocopy;
> patch:   Registering jump thread: (33, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 24) nocopy;
> patch:   Registering jump thread: (24, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 24) nocopy;
>   Registering jump thread: (6, 36) incoming edge;  (36, 7) normal;
>   Cancelling jump thread: (12, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 17) nocopy;
>   Cancelling jump thread: (16, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 14) nocopy;
>   Cancelling jump thread: (19, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 17) nocopy;
>   Cancelling jump thread: (22, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
>   Cancelling jump thread: (34, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
>   Cancelling jump thread: (35, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
>   Cancelling jump thread: (29, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 14) nocopy;
>   Cancelling jump thread: (13, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
>   Cancelling jump thread: (15, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 17) nocopy;
>   Cancelling jump thread: (31, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
>   Cancelling jump thread: (18, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 20) nocopy;
>   Cancelling jump thread: (32, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
>   Cancelling jump thread: (21, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 23) nocopy;
>   Cancelling jump thread: (33, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 24) nocopy;
>   Cancelling jump thread: (24, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 24) nocopy;
> 
> Here is the structure of the CFG with the loops:
> 
> (gdb) p debug_loops (2)
> loop_0 (header = 0, latch = 1, niter = )
> {
>   bb_2 (preds = {bb_0 }, succs = {bb_3 bb_27 })
>   bb_3 (preds = {bb_2 }, succs = {bb_5 bb_6 })
>   bb_5 (preds = {bb_4 bb_3 }, succs = {bb_27 })
>   bb_6 (preds = {bb_3 }, succs = {bb_36 })
>   bb_27 (preds = {bb_5 bb_25 bb_26 bb_2 }, succs = {bb_1 })
>   loop_1 (header = 36, latch = 37, niter = )
>   {
>     bb_4 (preds = {bb_26 }, succs = {bb_5 bb_37 })
>     bb_37 (preds = {bb_4 }, succs = {bb_36 })
>     bb_36 (preds = {bb_6 bb_37 }, succs = {bb_25 bb_7 bb_11 bb_20 bb_14 bb_17 bb_23 bb_24 })
>     bb_7 (preds = {bb_36 }, succs = {bb_10 bb_28 })
>     bb_8 (preds = {bb_28 }, succs = {bb_10 bb_9 })
>     bb_9 (preds = {bb_8 }, succs = {bb_10 })
>     bb_10 (preds = {bb_7 bb_28 bb_8 bb_9 }, succs = {bb_25 })
>     bb_11 (preds = {bb_36 }, succs = {bb_29 bb_30 })
>     bb_12 (preds = {bb_30 }, succs = {bb_25 })
>     bb_13 (preds = {bb_30 }, succs = {bb_25 })
>     bb_14 (preds = {bb_36 }, succs = {bb_15 bb_16 })
>     bb_15 (preds = {bb_14 }, succs = {bb_25 })
>     bb_16 (preds = {bb_14 }, succs = {bb_25 bb_31 })
>     bb_17 (preds = {bb_36 }, succs = {bb_18 bb_19 })
>     bb_18 (preds = {bb_17 }, succs = {bb_25 })
>     bb_19 (preds = {bb_17 }, succs = {bb_25 bb_32 })
>     bb_20 (preds = {bb_36 }, succs = {bb_21 bb_22 })
>     bb_21 (preds = {bb_20 }, succs = {bb_25 })
>     bb_22 (preds = {bb_20 }, succs = {bb_25 })
>     bb_23 (preds = {bb_36 }, succs = {bb_33 bb_34 })
>     bb_24 (preds = {bb_36 }, succs = {bb_25 bb_35 })
>     bb_25 (preds = {bb_10 bb_12 bb_16 bb_19 bb_22 bb_34 bb_35 bb_36 bb_29 bb_13 bb_15 bb_31 bb_18 bb_32 bb_21 bb_33 bb_24 }, succs = {bb_26 bb_27 })
>     bb_26 (preds = {bb_25 }, succs = {bb_4 bb_27 })
>     bb_28 (preds = {bb_7 }, succs = {bb_10 bb_8 })
>     bb_29 (preds = {bb_11 }, succs = {bb_25 })
>     bb_30 (preds = {bb_11 }, succs = {bb_12 bb_13 })
>     bb_31 (preds = {bb_16 }, succs = {bb_25 })
>     bb_32 (preds = {bb_19 }, succs = {bb_25 })
>     bb_33 (preds = {bb_23 }, succs = {bb_25 })
>     bb_34 (preds = {bb_23 }, succs = {bb_25 })
>     bb_35 (preds = {bb_24 }, succs = {bb_25 })
>   }
> }
> 
> What about removing the use of e->aux in threadupdate.c, to be able to jump
> thread across all the recorded paths?
> 
> Thanks,
> Sebastian

> From bac0f2a390048652910f77503b21b3e208daeae1 Mon Sep 17 00:00:00 2001
> From: Sebastian Pop <s.pop@samsung.com>
> Date: Fri, 26 Sep 2014 14:54:20 -0500
> Subject: [PATCH] jump thread for PR 54742
> 
> Adapted from a patch from James Greenhalgh.
> 
> 	* tree-ssa-threadedge.c (simplify_control_stmt_condition): Restore the
> 	original value of cond when simplification fails.
> 	(find_thread_path): New.
> 	(find_control_statement_thread_paths): New.
> 	(thread_through_normal_block): Call find_control_statement_thread_paths.
> 
> 	* testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c: New.
> ---
>  gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c |  32 ++++
>  gcc/tree-ssa-threadedge.c                        | 180 ++++++++++++++++++++++-
>  gcc/tree-ssa-threadupdate.c                      |   4 +
>  3 files changed, 215 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
> 
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
> new file mode 100644
> index 0000000..f3ef725
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
> @@ -0,0 +1,32 @@
> +int sum0, sum1, sum2, sum3;
> +int foo(char * s, char** ret)
> +{
> +  int state=0;
> +  char c;
> +
> +  for (; *s && state != 4; s++)
> +    {
> +      c = *s;
> +      if (c == '*')
> +	{
> +	  s++;
> +	  break;
> +	}
> +      switch (state) {
> +	case 0:
> +	  if (c == '+') state = 1;
> +	  else if (c != '-') sum0+=c;
> +	  break;
> +	case 1:
> +	  if (c == '+') state = 2;
> +	  else if (c == '-') state = 0;
> +	  else sum1+=c;
> +	  break;
> +	default:
> +	  break;
> +      }
> +
> +    }
> +  *ret = s;
> +  return state;
> +}
> diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c
> index 3dee5ba..7b9e5b6 100644
> --- a/gcc/tree-ssa-threadedge.c
> +++ b/gcc/tree-ssa-threadedge.c
> @@ -628,6 +628,7 @@ simplify_control_stmt_condition (edge e,
>       rather than use a relational operator.  These are simpler to handle.  */
>    if (TREE_CODE (cond) == SSA_NAME)
>      {
> +      tree original_lhs = cond;
>        cached_lhs = cond;
>  
>        /* Get the variable's current value from the equivalence chains.
> @@ -656,6 +657,12 @@ simplify_control_stmt_condition (edge e,
>  	 pass specific callback to try and simplify it further.  */
>        if (cached_lhs && ! is_gimple_min_invariant (cached_lhs))
>          cached_lhs = (*simplify) (stmt, stmt);
> +
> +      /* We couldn't find an invariant.  But, callers of this
> +	 function may be able to do something useful with the
> +	 unmodified destination.  */
> +      if (!cached_lhs)
> +	cached_lhs = original_lhs;
>      }
>    else
>      cached_lhs = NULL;
> @@ -915,6 +922,155 @@ thread_around_empty_blocks (edge taken_edge,
>    return false;
>  }
>  
> +/* Return true if there is at least one path from START_BB to END_BB.
> +   VISITED_BBS is used to make sure we don't fall into an infinite loop.  */
> +
> +static bool
> +find_thread_path (basic_block start_bb, basic_block end_bb,
> +		    vec<basic_block, va_gc> *&path,
> +		    hash_set<basic_block> *visited_bbs)
> +{
> +  if (start_bb == end_bb)
> +    {
> +      vec_safe_push (path, start_bb);
> +      return true;
> +    }
> +
> +  if (!visited_bbs->add(start_bb))
> +    {
> +      edge e;
> +      edge_iterator ei;
> +      FOR_EACH_EDGE (e, ei, start_bb->succs)
> +	if (find_thread_path (e->dest, end_bb, path, visited_bbs))
> +	  {
> +	    vec_safe_push (path, start_bb);
> +	    return true;
> +	  }
> +    }
> +
> +  return false;
> +}
> +
> +/* We trace the value of the variable EXPR back through any phi nodes looking
> +   for places where it gets a constant value and save the path.  */
> +
> +static void
> +find_control_statement_thread_paths (tree expr,
> +				     hash_set<gimple> *visited_phis,
> +				     vec<basic_block, va_gc> *&path)
> +{
> +  tree var = SSA_NAME_VAR (expr);
> +  gimple def_stmt = SSA_NAME_DEF_STMT (expr);
> +  basic_block var_bb = gimple_bb (def_stmt);
> +
> +  if (var == NULL || var_bb == NULL)
> +    return;
> +
> +  vec<basic_block, va_gc> *next_path;
> +  vec_alloc (next_path, n_basic_blocks_for_fn (cfun));
> +
> +  basic_block last_bb_in_path = path->last ();
> +
> +  /* Put the path from var_bb to last_bb_in_path into next_path.  */
> +  if (var_bb != last_bb_in_path)
> +    {
> +      edge e;
> +      int e_count = 0;
> +      edge_iterator ei;
> +
> +      FOR_EACH_EDGE (e, ei, last_bb_in_path->preds)
> +	{
> +	  hash_set<basic_block> *visited_bbs = new hash_set<basic_block>;
> +
> +	  if (find_thread_path (var_bb, e->src, next_path, visited_bbs))
> +	    e_count = e_count + 1;
> +
> +	  delete visited_bbs;
> +
> +	  /* If there is more than one path, stop.  */
> +	  if (e_count > 1)
> +	    {
> +	      vec_free (next_path);
> +	      return;
> +	    }
> +	}
> +    }
> +
> +  /* Visit PHI nodes once.  */
> +  if (gimple_code (def_stmt) != GIMPLE_PHI
> +      || visited_phis->add(def_stmt)) {
> +    vec_free (next_path);
> +    return;
> +  }
> +
> +  /* Append all the nodes from next_path to path.  */
> +  vec_safe_splice (path, next_path);
> +  gcc_assert (path->last () == var_bb);
> +
> +  /* Iterate over the arguments of PHI.  */
> +  unsigned int i;
> +  for (i = 0; i < gimple_phi_num_args (def_stmt); i++)
> +    {
> +      tree arg = gimple_phi_arg_def (def_stmt, i);
> +      basic_block bbi = gimple_phi_arg_edge (def_stmt, i)->src;
> +
> +      /* Skip edges pointing outside the current loop.  */
> +      if (!arg || var_bb->loop_father != bbi->loop_father)
> +	continue;
> +
> +      /* Add BBI to the path.  */
> +      vec_safe_push (path, bbi);
> +
> +      if (TREE_CODE (arg) == INTEGER_CST)
> +	{
> +	  int j, n = path->length ();
> +	  vec<jump_thread_edge *> *jump_thread_path
> +	    = new vec<jump_thread_edge *> ();
> +	  int joiners = 0;
> +
> +	  for (j = 0; j < n - 1; j++)
> +	    {
> +	      edge e = find_edge ((*path)[n - j - 1],
> +				  (*path)[n - j - 2]);
> +	      gcc_assert (e);
> +	      enum jump_thread_edge_type kind;
> +
> +	      if (j == 0)
> +		kind = EDGE_START_JUMP_THREAD;
> +	      else if (single_pred_p (e->src))
> +		kind = EDGE_NO_COPY_SRC_BLOCK;
> +	      else {
> +		kind = EDGE_COPY_SRC_JOINER_BLOCK;
> +		++joiners;
> +	      }
> +
> +	      jump_thread_edge *x = new jump_thread_edge (e, kind);
> +	      jump_thread_path->safe_push (x);
> +	    }
> +
> +	  /* Add the edge taken when the control variable has value ARG.  */
> +	  edge taken_edge = find_taken_edge ((*path)[0], arg);
> +	  jump_thread_edge *x
> +	    = new jump_thread_edge (taken_edge, EDGE_NO_COPY_SRC_BLOCK);
> +	  jump_thread_path->safe_push (x);
> +
> +	  /* Thread-update does not handle more than one joiner block per
> +	     path.  A path with fewer than 3 nodes should not be jump-threaded.  */
> +	  if (joiners < 2 && n > 2)
> +	    register_jump_thread (jump_thread_path);
> +	}
> +      else if (TREE_CODE (arg) == SSA_NAME)
> +	find_control_statement_thread_paths (arg, visited_phis, path);
> +
> +      /* Remove BBI from the path.  */
> +      path->pop ();
> +    }
> +
> +  /* Remove all the nodes that we added from next_path.  */
> +  vec_safe_truncate (path, (path->length () - next_path->length ()));
> +  vec_free (next_path);
> +}
> +
>  /* We are exiting E->src, see if E->dest ends with a conditional
>     jump which has a known value when reached via E.
>  
> @@ -1000,7 +1156,10 @@ thread_through_normal_block (edge e,
>        cond = simplify_control_stmt_condition (e, stmt, dummy_cond, simplify,
>  					      handle_dominating_asserts);
>  
> -      if (cond && is_gimple_min_invariant (cond))
> +      if (!cond)
> +	return 0;
> +
> +      if (is_gimple_min_invariant (cond))
>  	{
>  	  edge taken_edge = find_taken_edge (e->dest, cond);
>  	  basic_block dest = (taken_edge ? taken_edge->dest : NULL);
> @@ -1046,7 +1205,25 @@ thread_through_normal_block (edge e,
>  				      backedge_seen_p);
>  	  return 1;
>  	}
> +
> +      if (TREE_CODE (cond) != SSA_NAME
> +	  || e->dest->loop_father != e->src->loop_father)
> +	return 0;
> +
> +      /* When COND cannot be simplified, try to find paths from a control
> +	 statement back through the PHI nodes which would affect that control
> +	 statement.  */
> +      vec<basic_block, va_gc> *bb_path;
> +      vec_alloc (bb_path, n_basic_blocks_for_fn (cfun));
> +      vec_safe_push (bb_path, e->dest);
> +      hash_set<gimple> *visited_phis = new hash_set<gimple>;
> +
> +      find_control_statement_thread_paths (cond, visited_phis, bb_path);
> +
> +      delete visited_phis;
> +      vec_free (bb_path);
> +
> +      return -1;
>      }
>    return 0;
>  }
> -- 
> 2.1.0.243.g30d45f7
> 


[-- Attachment #2: 0001-jump-thread-for-PR-54742.patch --]
[-- Type: text/x-diff, Size: 15132 bytes --]

From 23b1ac8fa92e9e4cd05edb2967aba564126f75a1 Mon Sep 17 00:00:00 2001
From: Sebastian Pop <s.pop@samsung.com>
Date: Fri, 26 Sep 2014 14:54:20 -0500
Subject: [PATCH] jump thread for PR 54742

Adapted from a patch from James Greenhalgh.

	* params.def (max-thread-path-insns, max-thread-length,
	max-thread-paths): New.

	* testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c: New.

	* tree-cfg.c (gimple_duplicate_sese_region): Save and restore loop
	header and latch.

	* tree-ssa-threadedge.c (simplify_control_stmt_condition): Restore the
	original value of cond when simplification fails.
	(fsm_find_thread_path): New.
	(fsm_find_control_statement_thread_paths): New.
	(thread_through_normal_block): Call fsm_find_control_statement_thread_paths.

	* tree-ssa-threadupdate.c (dump_jump_thread_path): Pretty print
	EDGE_START_FSM_THREAD.
	(thread_through_all_blocks): Generate code for EDGE_START_FSM_THREAD edges
	by calling gimple_duplicate_sese_region.

	* tree-ssa-threadupdate.h (jump_thread_edge_type): Add EDGE_START_FSM_THREAD.
---
 gcc/params.def                                   |  19 +++
 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c |  32 ++++
 gcc/tree-cfg.c                                   |  21 ++-
 gcc/tree-ssa-threadedge.c                        | 200 ++++++++++++++++++++++-
 gcc/tree-ssa-threadupdate.c                      |  52 +++++-
 gcc/tree-ssa-threadupdate.h                      |   1 +
 6 files changed, 320 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c

diff --git a/gcc/params.def b/gcc/params.def
index d2d2add..749f962 100644
--- a/gcc/params.def
+++ b/gcc/params.def
@@ -123,6 +123,25 @@ DEFPARAM (PARAM_PARTIAL_INLINING_ENTRY_PROBABILITY,
 	  "Maximum probability of the entry BB of split region (in percent relative to entry BB of the function) to make partial inlining happen",
 	  70, 0, 0)
 
+/* Maximum number of instructions to copy when duplicating blocks
+   on a jump thread path.  */
+DEFPARAM (PARAM_MAX_THREAD_PATH_INSNS,
+	  "max-thread-path-insns",
+	  "Maximum number of instructions to copy when duplicating blocks on a jump thread path",
+	  100, 1, 999999)
+
+/* Maximum length of a jump thread path.  */
+DEFPARAM (PARAM_MAX_THREAD_LENGTH,
+	  "max-thread-length",
+	  "Maximum number of basic blocks on a jump thread path",
+	  10, 1, 999999)
+
+/* Maximum number of jump thread paths to duplicate.  */
+DEFPARAM (PARAM_MAX_THREAD_PATHS,
+	  "max-thread-paths",
+	  "Maximum number of new jump thread paths to create",
+	  50, 1, 999999)
+
 /* Limit the number of expansions created by the variable expansion
    optimization to avoid register pressure.  */
 DEFPARAM (PARAM_MAX_VARIABLE_EXPANSIONS,
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
new file mode 100644
index 0000000..f3ef725
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
@@ -0,0 +1,32 @@
+int sum0, sum1, sum2, sum3;
+int foo(char * s, char** ret)
+{
+  int state=0;
+  char c;
+
+  for (; *s && state != 4; s++)
+    {
+      c = *s;
+      if (c == '*')
+	{
+	  s++;
+	  break;
+	}
+      switch (state) {
+	case 0:
+	  if (c == '+') state = 1;
+	  else if (c != '-') sum0+=c;
+	  break;
+	case 1:
+	  if (c == '+') state = 2;
+	  else if (c == '-') state = 0;
+	  else sum1+=c;
+	  break;
+	default:
+	  break;
+      }
+
+    }
+  *ret = s;
+  return state;
+}
diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
index ee10bc6..565cfe3 100644
--- a/gcc/tree-cfg.c
+++ b/gcc/tree-cfg.c
@@ -5949,10 +5949,12 @@ gimple_duplicate_sese_region (edge entry, edge exit,
 {
   unsigned i;
   bool free_region_copy = false, copying_header = false;
+  bool save_loop_details = false;
   struct loop *loop = entry->dest->loop_father;
   edge exit_copy;
   vec<basic_block> doms;
   edge redirected;
+  int memo_loop_header_no = 0, memo_loop_latch_no = 0;
   int total_freq = 0, entry_freq = 0;
   gcov_type total_count = 0, entry_count = 0;
 
@@ -5970,9 +5972,15 @@ gimple_duplicate_sese_region (edge entry, edge exit,
       if (region[i]->loop_father != loop)
 	return false;
 
-      if (region[i] != entry->dest
-	  && region[i] == loop->header)
-	return false;
+      /* If we are copying an abnormally shaped region, keep track of
+	 which block will become our loop header.  */
+      if ((region[i] != entry->dest && region[i] == loop->header)
+	  || (region[i] != entry->src && region[i] == loop->latch))
+	{
+	  save_loop_details = true;
+	  memo_loop_latch_no = i;
+	  memo_loop_header_no = i;
+	}
     }
 
   /* In case the function is used for loop header copying (which is the primary
@@ -6055,6 +6063,13 @@ gimple_duplicate_sese_region (edge entry, edge exit,
       loop->latch = exit->src;
     }
 
+  /* Restore loop details if we were asked to save them.  */
+  if (save_loop_details)
+    {
+      loop->header = region_copy[memo_loop_header_no];
+      loop->latch = region_copy[memo_loop_latch_no];
+    }
+
   /* Redirect the entry and add the phi node arguments.  */
   redirected = redirect_edge_and_branch (entry, get_bb_copy (entry->dest));
   gcc_assert (redirected != NULL);
diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c
index d5b9696..de9b3fe 100644
--- a/gcc/tree-ssa-threadedge.c
+++ b/gcc/tree-ssa-threadedge.c
@@ -56,6 +56,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "params.h"
 #include "tree-ssa-threadedge.h"
 #include "builtins.h"
+#include "cfg.h"
+#include "cfganal.h"
 
 /* To avoid code explosion due to jump threading, we limit the
    number of statements we are going to copy.  This variable
@@ -660,6 +662,7 @@ simplify_control_stmt_condition (edge e,
      rather than use a relational operator.  These are simpler to handle.  */
   if (TREE_CODE (cond) == SSA_NAME)
     {
+      tree original_lhs = cond;
       cached_lhs = cond;
 
       /* Get the variable's current value from the equivalence chains.
@@ -688,6 +691,12 @@ simplify_control_stmt_condition (edge e,
 	 pass specific callback to try and simplify it further.  */
       if (cached_lhs && ! is_gimple_min_invariant (cached_lhs))
         cached_lhs = (*simplify) (stmt, stmt);
+
+      /* We couldn't find an invariant.  But, callers of this
+	 function may be able to do something useful with the
+	 unmodified destination.  */
+      if (!cached_lhs)
+	cached_lhs = original_lhs;
     }
   else
     cached_lhs = NULL;
@@ -947,6 +956,172 @@ thread_around_empty_blocks (edge taken_edge,
   return false;
 }
 
+/* Return true if there is at least one path from START_BB to END_BB.
+   VISITED_BBS is used to make sure we don't fall into an infinite loop.  */
+
+static bool
+fsm_find_thread_path (basic_block start_bb, basic_block end_bb,
+		      vec<basic_block, va_gc> *&path,
+		      hash_set<basic_block> *visited_bbs, int n_insns)
+{
+  if (start_bb == end_bb)
+    {
+      vec_safe_push (path, start_bb);
+      return true;
+    }
+
+  if (!visited_bbs->add(start_bb))
+    {
+      edge e;
+      edge_iterator ei;
+      FOR_EACH_EDGE (e, ei, start_bb->succs)
+	if (fsm_find_thread_path (e->dest, end_bb, path, visited_bbs, n_insns))
+	  {
+	    vec_safe_push (path, start_bb);
+	    return true;
+	  }
+    }
+
+  return false;
+}
+
+static int max_threaded_paths;
+
+/* We trace the value of the variable EXPR back through any phi nodes looking
+   for places where it gets a constant value and save the path.  Stop after
+   having recorded MAX_PATHS jump threading paths.  */
+
+static void
+fsm_find_control_statement_thread_paths (tree expr,
+					 hash_set<gimple> *visited_phis,
+					 vec<basic_block, va_gc> *&path)
+{
+  tree var = SSA_NAME_VAR (expr);
+  gimple def_stmt = SSA_NAME_DEF_STMT (expr);
+  basic_block var_bb = gimple_bb (def_stmt);
+
+  if (var == NULL || var_bb == NULL)
+    return;
+
+  vec<basic_block, va_gc> *next_path;
+  vec_alloc (next_path, n_basic_blocks_for_fn (cfun));
+
+  basic_block last_bb_in_path = path->last ();
+
+  /* Put the path from var_bb to last_bb_in_path into next_path.  */
+  if (var_bb != last_bb_in_path)
+    {
+      edge e;
+      int e_count = 0;
+      edge_iterator ei;
+
+      FOR_EACH_EDGE (e, ei, last_bb_in_path->preds)
+	{
+	  hash_set<basic_block> *visited_bbs = new hash_set<basic_block>;
+
+	  if (fsm_find_thread_path (var_bb, e->src, next_path, visited_bbs, 0))
+	    ++e_count;
+
+	  delete visited_bbs;
+
+	  /* If there is more than one path, stop.  */
+	  if (e_count > 1)
+	    {
+	      vec_free (next_path);
+	      return;
+	    }
+	}
+    }
+
+  /* Visit PHI nodes once.  */
+  if (gimple_code (def_stmt) != GIMPLE_PHI
+      || visited_phis->add(def_stmt))
+    {
+      vec_free (next_path);
+      return;
+    }
+
+  /* Append all the nodes from next_path to path.  */
+  vec_safe_splice (path, next_path);
+  gcc_assert (path->last () == var_bb);
+
+  /* Iterate over the arguments of PHI.  */
+  unsigned int i;
+  for (i = 0; i < gimple_phi_num_args (def_stmt); i++)
+    {
+      tree arg = gimple_phi_arg_def (def_stmt, i);
+      basic_block bbi = gimple_phi_arg_edge (def_stmt, i)->src;
+
+      /* Skip edges pointing outside the current loop.  */
+      if (!arg || var_bb->loop_father != bbi->loop_father)
+	continue;
+
+      /* Add BBI to the path.  */
+      vec_safe_push (path, bbi);
+
+      if (TREE_CODE (arg) == INTEGER_CST)
+	{
+	  int j, n = path->length ();
+	  vec<jump_thread_edge *> *jump_thread_path
+	    = new vec<jump_thread_edge *> ();
+	  int joiners = 0;
+
+	  for (j = 0; j < n - 1; j++)
+	    {
+	      edge e = find_edge ((*path)[n - j - 1],
+				  (*path)[n - j - 2]);
+	      gcc_assert (e);
+	      enum jump_thread_edge_type kind;
+
+	      if (j == 0)
+		kind = EDGE_START_FSM_THREAD;
+	      else if (single_pred_p (e->src))
+		kind = EDGE_NO_COPY_SRC_BLOCK;
+	      else {
+		kind = EDGE_COPY_SRC_JOINER_BLOCK;
+		++joiners;
+	      }
+
+	      jump_thread_edge *x = new jump_thread_edge (e, kind);
+	      jump_thread_path->safe_push (x);
+	    }
+
+	  /* Add the edge taken when the control variable has value ARG.  */
+	  edge taken_edge = find_taken_edge ((*path)[0], arg);
+	  jump_thread_edge *x
+	    = new jump_thread_edge (taken_edge, EDGE_NO_COPY_SRC_BLOCK);
+	  jump_thread_path->safe_push (x);
+
+	  /* A path with less than 3 nodes should not be jump-threaded.  */
+	  if (n > 2 && n < PARAM_VALUE (PARAM_MAX_THREAD_LENGTH)
+	      && max_threaded_paths > 0)
+	    {
+	      int n_insns = 0;
+	      gimple_stmt_iterator gsi;
+
+	      for (j = 1; j < n - 1; j++)
+		for (gsi = gsi_start_bb ((*path)[j]); !gsi_end_p (gsi); gsi_next (&gsi))
+		  ++n_insns;
+
+	      if (n_insns < PARAM_VALUE (PARAM_MAX_THREAD_PATH_INSNS))
+		{
+		  register_jump_thread (jump_thread_path);
+		  --max_threaded_paths;
+		}
+	    }
+	}
+      else if (TREE_CODE (arg) == SSA_NAME)
+	fsm_find_control_statement_thread_paths (arg, visited_phis, path);
+
+      /* Remove BBI from the path.  */
+      path->pop ();
+    }
+
+  /* Remove all the nodes that we added from next_path.  */
+  vec_safe_truncate (path, (path->length () - next_path->length ()));
+  vec_free (next_path);
+}
+
 /* We are exiting E->src, see if E->dest ends with a conditional
    jump which has a known value when reached via E.
 
@@ -1032,7 +1207,10 @@ thread_through_normal_block (edge e,
       cond = simplify_control_stmt_condition (e, stmt, dummy_cond, simplify,
 					      handle_dominating_asserts);
 
-      if (cond && is_gimple_min_invariant (cond))
+      if (!cond)
+	return 0;
+
+      if (is_gimple_min_invariant (cond))
 	{
 	  edge taken_edge = find_taken_edge (e->dest, cond);
 	  basic_block dest = (taken_edge ? taken_edge->dest : NULL);
@@ -1078,6 +1256,26 @@ thread_through_normal_block (edge e,
 				      backedge_seen_p);
 	  return 1;
 	}
+
+      if (TREE_CODE (cond) != SSA_NAME
+	  || e->dest->loop_father != e->src->loop_father)
+	return 0;
+
+      /* When COND cannot be simplified, try to find paths from a control
+	 statement back through the PHI nodes which would affect that control
+	 statement.  */
+      vec<basic_block, va_gc> *bb_path;
+      vec_alloc (bb_path, n_basic_blocks_for_fn (cfun));
+      vec_safe_push (bb_path, e->dest);
+      hash_set<gimple> *visited_phis = new hash_set<gimple>;
+
+      max_threaded_paths = PARAM_VALUE (PARAM_MAX_THREAD_PATHS);
+      fsm_find_control_statement_thread_paths (cond, visited_phis, bb_path);
+
+      delete visited_phis;
+      vec_free (bb_path);
+
+      return -1;
     }
   return 0;
 }
diff --git a/gcc/tree-ssa-threadupdate.c b/gcc/tree-ssa-threadupdate.c
index 151ed83..5847078 100644
--- a/gcc/tree-ssa-threadupdate.c
+++ b/gcc/tree-ssa-threadupdate.c
@@ -167,8 +167,9 @@ dump_jump_thread_path (FILE *dump_file, vec<jump_thread_edge *> path,
 		       bool registering)
 {
   fprintf (dump_file,
-	   "  %s jump thread: (%d, %d) incoming edge; ",
+	   "  %s%s jump thread: (%d, %d) incoming edge; ",
 	   (registering ? "Registering" : "Cancelling"),
+	   (path[0]->type == EDGE_START_FSM_THREAD ? " FSM": ""),
 	   path[0]->e->src->index, path[0]->e->dest->index);
 
   for (unsigned int i = 1; i < path.length (); i++)
@@ -2343,6 +2344,55 @@ thread_through_all_blocks (bool may_peel_loop_headers)
   threaded_blocks = BITMAP_ALLOC (NULL);
   memset (&thread_stats, 0, sizeof (thread_stats));
 
+  for (i = 0; i < paths.length (); )
+    {
+      vec<jump_thread_edge *> *path = paths[i];
+      edge entry = (*path)[0]->e;
+
+      if ((*path)[0]->type != EDGE_START_FSM_THREAD
+	  /* Do not jump-thread twice from the same block.  */
+	  || bitmap_bit_p (threaded_blocks, entry->src->index)) {
+	i++;
+	continue;
+      }
+
+      unsigned len = path->length ();
+      edge exit = (*path)[len - 1]->e;
+      basic_block *region = XNEWVEC (basic_block, len - 1);
+
+      for (unsigned int j = 0; j < len - 1; j++)
+	region[j] = (*path)[j]->e->dest;
+
+      bool success = gimple_duplicate_sese_region (entry, exit, region,
+						   len - 1, NULL, 0);
+      delete_jump_thread_path (path);
+      paths.unordered_remove (i);
+
+      if (success) {
+	/* We do not update dominance info.  */
+	free_dominance_info (CDI_DOMINATORS);
+
+	bitmap_set_bit (threaded_blocks, entry->src->index);
+      }
+    }
+
+  for (i = 0; i < paths.length (); )
+    {
+      vec<jump_thread_edge *> *path = paths[i];
+      edge entry = (*path)[0]->e;
+
+      /* Do not jump-thread twice from the same block.  */
+      if (bitmap_bit_p (threaded_blocks, entry->src->index))
+	{
+	  delete_jump_thread_path (path);
+	  paths.unordered_remove (i);
+	}
+      else
+	i++;
+    }
+
+  bitmap_clear (threaded_blocks);
+
   mark_threaded_blocks (threaded_blocks);
 
   initialize_original_copy_tables ();
diff --git a/gcc/tree-ssa-threadupdate.h b/gcc/tree-ssa-threadupdate.h
index 426aca5..42c3a9e 100644
--- a/gcc/tree-ssa-threadupdate.h
+++ b/gcc/tree-ssa-threadupdate.h
@@ -26,6 +26,7 @@ extern bool thread_through_all_blocks (bool);
 enum jump_thread_edge_type
 {
   EDGE_START_JUMP_THREAD,
+  EDGE_START_FSM_THREAD,
   EDGE_COPY_SRC_BLOCK,
   EDGE_COPY_SRC_JOINER_BLOCK,
   EDGE_NO_COPY_SRC_BLOCK
-- 
2.1.0.243.g30d45f7



* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-11-11  1:40             ` Sebastian Pop
@ 2014-11-17  9:29               ` James Greenhalgh
  2014-11-18 19:36                 ` Steve Ellcey
  2014-11-17 12:47               ` Richard Biener
  1 sibling, 1 reply; 54+ messages in thread
From: James Greenhalgh @ 2014-11-17  9:29 UTC (permalink / raw)
  To: Sebastian Pop; +Cc: Jeff Law, Richard Biener, Steve Ellcey, GCC Patches

On Tue, Nov 11, 2014 at 01:14:04AM +0000, Sebastian Pop wrote:
> Hi Jeff,
> 
> I have adapted the code generation part from James' patch to current trunk, and
> the resulting patch gets the 30% speedup on coremark and passes GCC bootstrap.

For what it is worth, I've bootstrapped and tested this patch on
aarch64-none-linux-gnu and arm-none-linux-gnueabi with no issues, and
both targets get the expected speedup in the interesting benchmark.
I've also thrown some of the larger popular benchmark suites at it, and
they've compiled and run without any issues or miscompares.

I'm happy to help out with the testing and bug-triaging effort once this
patch goes in.

Some very shallow comments: you should document the new parameters
in doc/invoke.texi, and you ought to run contrib/check_GNU_style.sh
on the patch and clean up the coding-style issues it highlights.

Thanks,
James Greenhalgh

> diff --git a/gcc/params.def b/gcc/params.def
> index d2d2add..749f962 100644
> --- a/gcc/params.def
> +++ b/gcc/params.def
> @@ -123,6 +123,25 @@ DEFPARAM (PARAM_PARTIAL_INLINING_ENTRY_PROBABILITY,
>  	  "Maximum probability of the entry BB of split region (in percent relative to entry BB of the function) to make partial inlining happen",
>  	  70, 0, 0)
>  
> +/* Maximum number of instructions to copy when duplicating blocks
> +   on a jump thread path.  */
> +DEFPARAM (PARAM_MAX_THREAD_PATH_INSNS,
> +	  "max-thread-path-insns",
> +	  "Maximum number of instructions to copy when duplicating blocks on a jump thread path",
> +	  100, 1, 999999)
> +
> +/* Maximum length of a jump thread path.  */
> +DEFPARAM (PARAM_MAX_THREAD_LENGTH,
> +	  "max-thread-length",
> +	  "Maximum number of basic blocks on a jump thread path",
> +	  10, 1, 999999)
> +
> +/* Maximum number of jump thread paths to duplicate.  */
> +DEFPARAM (PARAM_MAX_THREAD_PATHS,
> +	  "max-thread-paths",
> +	  "Maximum number of new jump thread paths to create",
> +	  50, 1, 999999)
> +
>  /* Limit the number of expansions created by the variable expansion
>     optimization to avoid register pressure.  */
>  DEFPARAM (PARAM_MAX_VARIABLE_EXPANSIONS,
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
> new file mode 100644
> index 0000000..f3ef725
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
> @@ -0,0 +1,32 @@
> +int sum0, sum1, sum2, sum3;
> +int foo(char * s, char** ret)
> +{
> +  int state=0;
> +  char c;
> +
> +  for (; *s && state != 4; s++)
> +    {
> +      c = *s;
> +      if (c == '*')
> +	{
> +	  s++;
> +	  break;
> +	}
> +      switch (state) {
> +	case 0:
> +	  if (c == '+') state = 1;
> +	  else if (c != '-') sum0+=c;
> +	  break;
> +	case 1:
> +	  if (c == '+') state = 2;
> +	  else if (c == '-') state = 0;
> +	  else sum1+=c;
> +	  break;
> +	default:
> +	  break;
> +      }
> +
> +    }
> +  *ret = s;
> +  return state;
> +}
> diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
> index ee10bc6..565cfe3 100644
> --- a/gcc/tree-cfg.c
> +++ b/gcc/tree-cfg.c
> @@ -5949,10 +5949,12 @@ gimple_duplicate_sese_region (edge entry, edge exit,
>  {
>    unsigned i;
>    bool free_region_copy = false, copying_header = false;
> +  bool save_loop_details = false;
>    struct loop *loop = entry->dest->loop_father;
>    edge exit_copy;
>    vec<basic_block> doms;
>    edge redirected;
> +  int memo_loop_header_no = 0, memo_loop_latch_no = 0;
>    int total_freq = 0, entry_freq = 0;
>    gcov_type total_count = 0, entry_count = 0;
>  
> @@ -5970,9 +5972,15 @@ gimple_duplicate_sese_region (edge entry, edge exit,
>        if (region[i]->loop_father != loop)
>  	return false;
>  
> -      if (region[i] != entry->dest
> -	  && region[i] == loop->header)
> -	return false;
> +      /* If we are copying an abnormally shaped region, keep track of
> +	 which block will become our loop header.  */
> +      if ((region[i] != entry->dest && region[i] == loop->header)
> +	  || (region[i] != entry->src && region[i] == loop->latch))
> +	{
> +	  save_loop_details = true;
> +	  memo_loop_latch_no = i;
> +	  memo_loop_header_no = i;
> +	}
>      }
>  
>    /* In case the function is used for loop header copying (which is the primary
> @@ -6055,6 +6063,13 @@ gimple_duplicate_sese_region (edge entry, edge exit,
>        loop->latch = exit->src;
>      }
>  
> +  /* Restore loop details if we were asked to save them.  */
> +  if (save_loop_details)
> +    {
> +      loop->header = region_copy[memo_loop_header_no];
> +      loop->latch = region_copy[memo_loop_latch_no];
> +    }
> +
>    /* Redirect the entry and add the phi node arguments.  */
>    redirected = redirect_edge_and_branch (entry, get_bb_copy (entry->dest));
>    gcc_assert (redirected != NULL);
> diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c
> index d5b9696..de9b3fe 100644
> --- a/gcc/tree-ssa-threadedge.c
> +++ b/gcc/tree-ssa-threadedge.c
> @@ -56,6 +56,8 @@ along with GCC; see the file COPYING3.  If not see
>  #include "params.h"
>  #include "tree-ssa-threadedge.h"
>  #include "builtins.h"
> +#include "cfg.h"
> +#include "cfganal.h"
>  
>  /* To avoid code explosion due to jump threading, we limit the
>     number of statements we are going to copy.  This variable
> @@ -660,6 +662,7 @@ simplify_control_stmt_condition (edge e,
>       rather than use a relational operator.  These are simpler to handle.  */
>    if (TREE_CODE (cond) == SSA_NAME)
>      {
> +      tree original_lhs = cond;
>        cached_lhs = cond;
>  
>        /* Get the variable's current value from the equivalence chains.
> @@ -688,6 +691,12 @@ simplify_control_stmt_condition (edge e,
>  	 pass specific callback to try and simplify it further.  */
>        if (cached_lhs && ! is_gimple_min_invariant (cached_lhs))
>          cached_lhs = (*simplify) (stmt, stmt);
> +
> +      /* We couldn't find an invariant.  But, callers of this
> +	 function may be able to do something useful with the
> +	 unmodified destination.  */
> +      if (!cached_lhs)
> +	cached_lhs = original_lhs;
>      }
>    else
>      cached_lhs = NULL;
> @@ -947,6 +956,172 @@ thread_around_empty_blocks (edge taken_edge,
>    return false;
>  }
>  
> +/* Return true if there is at least one path from START_BB to END_BB.
> +   VISITED_BBS is used to make sure we don't fall into an infinite loop.  */
> +
> +static bool
> +fsm_find_thread_path (basic_block start_bb, basic_block end_bb,
> +		      vec<basic_block, va_gc> *&path,
> +		      hash_set<basic_block> *visited_bbs, int n_insns)
> +{
> +  if (start_bb == end_bb)
> +    {
> +      vec_safe_push (path, start_bb);
> +      return true;
> +    }
> +
> +  if (!visited_bbs->add(start_bb))
> +    {
> +      edge e;
> +      edge_iterator ei;
> +      FOR_EACH_EDGE (e, ei, start_bb->succs)
> +	if (fsm_find_thread_path (e->dest, end_bb, path, visited_bbs, n_insns))
> +	  {
> +	    vec_safe_push (path, start_bb);
> +	    return true;
> +	  }
> +    }
> +
> +  return false;
> +}
> +
> +static int max_threaded_paths;
> +
> +/* We trace the value of the variable EXPR back through any phi nodes looking
> +   for places where it gets a constant value and save the path.  Stop after
> +   having recorded MAX_PATHS jump threading paths.  */
> +
> +static void
> +fsm_find_control_statement_thread_paths (tree expr,
> +					 hash_set<gimple> *visited_phis,
> +					 vec<basic_block, va_gc> *&path)
> +{
> +  tree var = SSA_NAME_VAR (expr);
> +  gimple def_stmt = SSA_NAME_DEF_STMT (expr);
> +  basic_block var_bb = gimple_bb (def_stmt);
> +
> +  if (var == NULL || var_bb == NULL)
> +    return;
> +
> +  vec<basic_block, va_gc> *next_path;
> +  vec_alloc (next_path, n_basic_blocks_for_fn (cfun));
> +
> +  basic_block last_bb_in_path = path->last ();
> +
> +  /* Put the path from var_bb to last_bb_in_path into next_path.  */
> +  if (var_bb != last_bb_in_path)
> +    {
> +      edge e;
> +      int e_count = 0;
> +      edge_iterator ei;
> +
> +      FOR_EACH_EDGE (e, ei, last_bb_in_path->preds)
> +	{
> +	  hash_set<basic_block> *visited_bbs = new hash_set<basic_block>;
> +
> +	  if (fsm_find_thread_path (var_bb, e->src, next_path, visited_bbs, 0))
> +	    ++e_count;
> +
> +	  delete visited_bbs;
> +
> +	  /* If there is more than one path, stop.  */
> +	  if (e_count > 1)
> +	    {
> +	      vec_free (next_path);
> +	      return;
> +	    }
> +	}
> +    }
> +
> +  /* Visit PHI nodes once.  */
> +  if (gimple_code (def_stmt) != GIMPLE_PHI
> +      || visited_phis->add(def_stmt))
> +    {
> +      vec_free (next_path);
> +      return;
> +    }
> +
> +  /* Append all the nodes from next_path to path.  */
> +  vec_safe_splice (path, next_path);
> +  gcc_assert (path->last () == var_bb);
> +
> +  /* Iterate over the arguments of PHI.  */
> +  unsigned int i;
> +  for (i = 0; i < gimple_phi_num_args (def_stmt); i++)
> +    {
> +      tree arg = gimple_phi_arg_def (def_stmt, i);
> +      basic_block bbi = gimple_phi_arg_edge (def_stmt, i)->src;
> +
> +      /* Skip edges pointing outside the current loop.  */
> +      if (!arg || var_bb->loop_father != bbi->loop_father)
> +	continue;
> +
> +      /* Add BBI to the path.  */
> +      vec_safe_push (path, bbi);
> +
> +      if (TREE_CODE (arg) == INTEGER_CST)
> +	{
> +	  int j, n = path->length ();
> +	  vec<jump_thread_edge *> *jump_thread_path
> +	    = new vec<jump_thread_edge *> ();
> +	  int joiners = 0;
> +
> +	  for (j = 0; j < n - 1; j++)
> +	    {
> +	      edge e = find_edge ((*path)[n - j - 1],
> +				  (*path)[n - j - 2]);
> +	      gcc_assert (e);
> +	      enum jump_thread_edge_type kind;
> +
> +	      if (j == 0)
> +		kind = EDGE_START_FSM_THREAD;
> +	      else if (single_pred_p (e->src))
> +		kind = EDGE_NO_COPY_SRC_BLOCK;
> +	      else {
> +		kind = EDGE_COPY_SRC_JOINER_BLOCK;
> +		++joiners;
> +	      }
> +
> +	      jump_thread_edge *x = new jump_thread_edge (e, kind);
> +	      jump_thread_path->safe_push (x);
> +	    }
> +
> +	  /* Add the edge taken when the control variable has value ARG.  */
> +	  edge taken_edge = find_taken_edge ((*path)[0], arg);
> +	  jump_thread_edge *x
> +	    = new jump_thread_edge (taken_edge, EDGE_NO_COPY_SRC_BLOCK);
> +	  jump_thread_path->safe_push (x);
> +
> +	  /* A path with less than 3 nodes should not be jump-threaded.  */
> +	  if (n > 2 && n < PARAM_VALUE (PARAM_MAX_THREAD_LENGTH)
> +	      && max_threaded_paths > 0)
> +	    {
> +	      int n_insns = 0;
> +	      gimple_stmt_iterator gsi;
> +
> +	      for (j = 1; j < n - 1; j++)
> +		for (gsi = gsi_start_bb ((*path)[j]); !gsi_end_p (gsi); gsi_next (&gsi))
> +		  ++n_insns;
> +
> +	      if (n_insns < PARAM_VALUE (PARAM_MAX_THREAD_PATH_INSNS))
> +		{
> +		  register_jump_thread (jump_thread_path);
> +		  --max_threaded_paths;
> +		}
> +	    }
> +	}
> +      else if (TREE_CODE (arg) == SSA_NAME)
> +	fsm_find_control_statement_thread_paths (arg, visited_phis, path);
> +
> +      /* Remove BBI from the path.  */
> +      path->pop ();
> +    }
> +
> +  /* Remove all the nodes that we added from next_path.  */
> +  vec_safe_truncate (path, (path->length () - next_path->length ()));
> +  vec_free (next_path);
> +}
> +
>  /* We are exiting E->src, see if E->dest ends with a conditional
>     jump which has a known value when reached via E.
>  
> @@ -1032,7 +1207,10 @@ thread_through_normal_block (edge e,
>        cond = simplify_control_stmt_condition (e, stmt, dummy_cond, simplify,
>  					      handle_dominating_asserts);
>  
> -      if (cond && is_gimple_min_invariant (cond))
> +      if (!cond)
> +	return 0;
> +
> +      if (is_gimple_min_invariant (cond))
>  	{
>  	  edge taken_edge = find_taken_edge (e->dest, cond);
>  	  basic_block dest = (taken_edge ? taken_edge->dest : NULL);
> @@ -1078,6 +1256,26 @@ thread_through_normal_block (edge e,
>  				      backedge_seen_p);
>  	  return 1;
>  	}
> +
> +      if (TREE_CODE (cond) != SSA_NAME
> +	  || e->dest->loop_father != e->src->loop_father)
> +	return 0;
> +
> +      /* When COND cannot be simplified, try to find paths from a control
> +	 statement back through the PHI nodes which would affect that control
> +	 statement.  */
> +      vec<basic_block, va_gc> *bb_path;
> +      vec_alloc (bb_path, n_basic_blocks_for_fn (cfun));
> +      vec_safe_push (bb_path, e->dest);
> +      hash_set<gimple> *visited_phis = new hash_set<gimple>;
> +
> +      max_threaded_paths = PARAM_VALUE (PARAM_MAX_THREAD_PATHS);
> +      fsm_find_control_statement_thread_paths (cond, visited_phis, bb_path);
> +
> +      delete visited_phis;
> +      vec_free (bb_path);
> +
> +      return -1;
>      }
>    return 0;
>  }
> diff --git a/gcc/tree-ssa-threadupdate.c b/gcc/tree-ssa-threadupdate.c
> index 151ed83..5847078 100644
> --- a/gcc/tree-ssa-threadupdate.c
> +++ b/gcc/tree-ssa-threadupdate.c
> @@ -167,8 +167,9 @@ dump_jump_thread_path (FILE *dump_file, vec<jump_thread_edge *> path,
>  		       bool registering)
>  {
>    fprintf (dump_file,
> -	   "  %s jump thread: (%d, %d) incoming edge; ",
> +	   "  %s%s jump thread: (%d, %d) incoming edge; ",
>  	   (registering ? "Registering" : "Cancelling"),
> +	   (path[0]->type == EDGE_START_FSM_THREAD ? " FSM": ""),
>  	   path[0]->e->src->index, path[0]->e->dest->index);
>  
>    for (unsigned int i = 1; i < path.length (); i++)
> @@ -2343,6 +2344,55 @@ thread_through_all_blocks (bool may_peel_loop_headers)
>    threaded_blocks = BITMAP_ALLOC (NULL);
>    memset (&thread_stats, 0, sizeof (thread_stats));
>  
> +  for (i = 0; i < paths.length (); )
> +    {
> +      vec<jump_thread_edge *> *path = paths[i];
> +      edge entry = (*path)[0]->e;
> +
> +      if ((*path)[0]->type != EDGE_START_FSM_THREAD
> +	  /* Do not jump-thread twice from the same block.  */
> +	  || bitmap_bit_p (threaded_blocks, entry->src->index)) {
> +	i++;
> +	continue;
> +      }
> +
> +      unsigned len = path->length ();
> +      edge exit = (*path)[len - 1]->e;
> +      basic_block *region = XNEWVEC (basic_block, len - 1);
> +
> +      for (unsigned int j = 0; j < len - 1; j++)
> +	region[j] = (*path)[j]->e->dest;
> +
> +      bool success = gimple_duplicate_sese_region (entry, exit, region,
> +						   len - 1, NULL, 0);
> +      delete_jump_thread_path (path);
> +      paths.unordered_remove (i);
> +
> +      if (success) {
> +	/* We do not update dominance info.  */
> +	free_dominance_info (CDI_DOMINATORS);
> +
> +	bitmap_set_bit (threaded_blocks, entry->src->index);
> +      }
> +    }
> +
> +  for (i = 0; i < paths.length (); )
> +    {
> +      vec<jump_thread_edge *> *path = paths[i];
> +      edge entry = (*path)[0]->e;
> +
> +      /* Do not jump-thread twice from the same block.  */
> +      if (bitmap_bit_p (threaded_blocks, entry->src->index))
> +	{
> +	  delete_jump_thread_path (path);
> +	  paths.unordered_remove (i);
> +	}
> +      else
> +	i++;
> +    }
> +
> +  bitmap_clear (threaded_blocks);
> +
>    mark_threaded_blocks (threaded_blocks);
>  
>    initialize_original_copy_tables ();
> diff --git a/gcc/tree-ssa-threadupdate.h b/gcc/tree-ssa-threadupdate.h
> index 426aca5..42c3a9e 100644
> --- a/gcc/tree-ssa-threadupdate.h
> +++ b/gcc/tree-ssa-threadupdate.h
> @@ -26,6 +26,7 @@ extern bool thread_through_all_blocks (bool);
>  enum jump_thread_edge_type
>  {
>    EDGE_START_JUMP_THREAD,
> +  EDGE_START_FSM_THREAD,
>    EDGE_COPY_SRC_BLOCK,
>    EDGE_COPY_SRC_JOINER_BLOCK,
>    EDGE_NO_COPY_SRC_BLOCK
> -- 
> 2.1.0.243.g30d45f7


* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-11-11  1:40             ` Sebastian Pop
  2014-11-17  9:29               ` James Greenhalgh
@ 2014-11-17 12:47               ` Richard Biener
  2014-11-18 22:29                 ` Sebastian Pop
  1 sibling, 1 reply; 54+ messages in thread
From: Richard Biener @ 2014-11-17 12:47 UTC (permalink / raw)
  To: Sebastian Pop; +Cc: Jeff Law, James Greenhalgh, Steve Ellcey, GCC Patches

On Tue, Nov 11, 2014 at 2:14 AM, Sebastian Pop <sebpop@gmail.com> wrote:
> Hi Jeff,
>
> I have adapted the code generation part from James' patch to current trunk, and
> the resulting patch gets the 30% speedup on coremark and passes GCC bootstrap.
>
> Ok for trunk?

In addition to the missing documentation for the parameters:

+      /* If we are copying an abnormally shaped region, keep track of
+        which block will become our loop header.  */
+      if ((region[i] != entry->dest && region[i] == loop->header)
+         || (region[i] != entry->src && region[i] == loop->latch))
+       {
+         save_loop_details = true;
+         memo_loop_latch_no = i;
+         memo_loop_header_no = i;
+       }

this looks bogus, as you overwrite both latch and header.  I wonder what
you tried to fix with this, as "abnormally shaped" isn't something we
support given the check before (all blocks must belong to the same loop,
and thus entry is always the loop header, or there is no loop).
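
The overwrite can be seen in a few lines of standalone C; this is a sketch,
not GCC code, and the block numbers, helper names, and the "separate
conditions" fix are all invented for illustration:

```c
#include <assert.h>

/* The posted hunk sets both memo indices whenever either the header or
   the latch test fires, so a later latch match clobbers the saved
   header index.  Tracking the two indices under separate conditions
   preserves both.  */

enum { HEADER = 3, OTHER = 5, LATCH = 7 };

/* Mirrors the posted hunk: one combined condition, both indices set.  */
void
track_as_posted (int bb, int i, int *hdr, int *latch)
{
  if (bb == HEADER || bb == LATCH)
    {
      *hdr = i;		/* also runs when only the latch matched */
      *latch = i;
    }
}

/* A presumably intended alternative: separate conditions per index.  */
void
track_separately (int bb, int i, int *hdr, int *latch)
{
  if (bb == HEADER)
    *hdr = i;
  if (bb == LATCH)
    *latch = i;
}

/* Walk a three-block region and return the header index the given
   tracker ends up with.  */
int
header_index (void (*track) (int, int, int *, int *))
{
  int region[3] = { HEADER, OTHER, LATCH };
  int hdr = -1, latch = -1;
  for (int i = 0; i < 3; i++)
    track (region[i], i, &hdr, &latch);
  assert (latch == 2);	/* both trackers record the latch correctly */
  return hdr;
}
```

With the combined condition the header index ends up pointing at the
latch's slot; with separate conditions it stays at the header's slot.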

I'll leave the rest to Jeff, but the overall structure looks good to me.

Thanks,
Richard.



> Thanks,
> Sebastian
>
> Sebastian Pop wrote:
>> Sebastian Pop wrote:
>> > Jeff Law wrote:
>> > > On 08/21/14 04:30, Richard Biener wrote:
>> > > >>It turns Jeff's jump-threading code into a strange franken-pass of bits and
>> > > >>pieces of detection and optimisation, and would need some substantial
>> > > >>reworking to fit in with Jeff's changes last Autumn, but if it is more
>> > > >>likely to be acceptable for trunk then perhaps we could look to revive it.
>> > > >>It would be nice to reuse the path copy code Jeff added last year, but I
>> > > >>don't have much intuition as to how feasible that is.
>> > > >>
>> > > >>Was this the sort of thing that you were imagining?
>> > > >
>> > > >Yeah, didn't look too closely though.
>> > > It'd be pretty ugly, I suspect.  But it's probably worth pondering,
>> > > since that approach would eliminate the concerns about the cost of
>> > > detection (which is problematic for the jump threader) by using
>> > > Steve's code for that.
>> > >
>> > > On the update side, I suspect most, if not all, of the framework is
>> > > in place to handle this kind of update if the right threading paths
>> > > were passed to the updater.  I can probably cobble that together
>> > > by hand and see what tree-ssa-threadupdate does with it.  But
>> > > it'll be a week or so before I can look at it.
>> >
>> > I adapted the patch James sent last year to use the new update paths
>>
>> Attached an updated version of the patch.
>>
>> > mechanism.  I verified that the attached patch does register all the paths that
>> > need to be threaded.  The thread updater seems to have some problems handling
>> > the attached testcase (a simplified version of the testcase attached to the bug).
>> >
>> > Jeff, could you please have a look at why the jump-thread updater is crashing?
>>
>> I have tried to understand why the code generation part ICEs on coremark: the
>> first problem that I have seen is that tree-ssa-threadupdate.c does not handle
>> more than one joiner block per path to be threaded, so we would not be able to
>> jump-thread across the joiners of the if condition and the joiner of the switch
>> condition, i.e., these paths:
>>
>> patch:   Registering jump thread: (7, 10) incoming edge;  (10, 25) joiner;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 14) nocopy;
>> patch:   Registering jump thread: (28, 10) incoming edge;  (10, 25) joiner;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 11) nocopy;
>> patch:   Registering jump thread: (8, 10) incoming edge;  (10, 25) joiner;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 17) nocopy;
>> patch:   Registering jump thread: (9, 10) incoming edge;  (10, 25) joiner;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
>>
>> Another problem is that we attach the path to be threaded to the ->aux field of
>> the first edge in the path, so we would have to cancel some of the paths
>> because we cannot keep track of all the paths to be threaded.
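
The single-slot limitation described above can be sketched in a few lines of
C; the structure and function names below are invented for illustration and
are not the GCC types:

```c
#include <assert.h>
#include <stddef.h>

/* Each edge carries one aux pointer, so at most one jump-thread path
   can be attached per incoming first edge; a second path starting with
   the same edge has to be dropped (cancelled), even if it is the more
   interesting one.  */

struct path   { int id; };
struct edge_s { struct path *aux; };

/* Returns 1 if P was attached to E, 0 if E already carries a path and
   P must be cancelled.  */
int
register_path (struct edge_s *e, struct path *p)
{
  if (e->aux != NULL)
    return 0;		/* slot already taken: cancel this thread */
  e->aux = p;
  return 1;
}
```

A first-come-first-served slot like this is why paths that share their
first edge cannot all be recorded; keeping a list of paths per edge, or
dropping the use of e->aux altogether, would remove the need to cancel.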
>>
>> For coremark, we would discover some jump-thread paths from one of the switch
>> cases over the loop exit condition, either to bb_27 outside the loop, or to bb_4
>> staying inside the loop.  Then with the "patch:" we would discover jump threads
>> that would thread switch cases to switch cases, and because these paths start
>> with the same edges for which we have already assigned a path to e->aux, we
>> would have to cancel the interesting threads added by the patch:
>>
>>   Registering jump thread: (12, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy;
>>   Registering jump thread: (13, 25) incoming edge;  (25, 26) joiner;  (26, 27) nocopy;
>>   Registering jump thread: (29, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy;
>>   Registering jump thread: (31, 25) incoming edge;  (25, 26) joiner;  (26, 27) nocopy;
>>   Registering jump thread: (16, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy;
>>   Registering jump thread: (15, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy;
>>   Registering jump thread: (32, 25) incoming edge;  (25, 26) joiner;  (26, 27) nocopy;
>>   Registering jump thread: (19, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy;
>>   Registering jump thread: (18, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy;
>>   Registering jump thread: (22, 25) incoming edge;  (25, 26) joiner;  (26, 27) nocopy;
>>   Registering jump thread: (21, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy;
>>   Registering jump thread: (34, 25) incoming edge;  (25, 26) joiner;  (26, 27) nocopy;
>>   Registering jump thread: (33, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy;
>>   Registering jump thread: (35, 25) incoming edge;  (25, 26) joiner;  (26, 27) nocopy;
>>   Registering jump thread: (24, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy;
>> patch:   Registering jump thread: (12, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 17) nocopy;
>> patch:   Registering jump thread: (16, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 14) nocopy;
>> patch:   Registering jump thread: (19, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 17) nocopy;
>> patch:   Registering jump thread: (22, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
>> patch:   Registering jump thread: (34, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
>> patch:   Registering jump thread: (35, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
>> patch:   Registering jump thread: (29, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 14) nocopy;
>> patch:   Registering jump thread: (13, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
>> patch:   Registering jump thread: (15, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 17) nocopy;
>> patch:   Registering jump thread: (31, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
>> patch:   Registering jump thread: (18, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 20) nocopy;
>> patch:   Registering jump thread: (32, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
>> patch:   Registering jump thread: (21, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 23) nocopy;
>> patch:   Registering jump thread: (33, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 24) nocopy;
>> patch:   Registering jump thread: (24, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 24) nocopy;
>>   Registering jump thread: (6, 36) incoming edge;  (36, 7) normal;
>>   Cancelling jump thread: (12, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 17) nocopy;
>>   Cancelling jump thread: (16, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 14) nocopy;
>>   Cancelling jump thread: (19, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 17) nocopy;
>>   Cancelling jump thread: (22, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
>>   Cancelling jump thread: (34, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
>>   Cancelling jump thread: (35, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
>>   Cancelling jump thread: (29, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 14) nocopy;
>>   Cancelling jump thread: (13, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
>>   Cancelling jump thread: (15, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 17) nocopy;
>>   Cancelling jump thread: (31, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
>>   Cancelling jump thread: (18, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 20) nocopy;
>>   Cancelling jump thread: (32, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 25) nocopy;
>>   Cancelling jump thread: (21, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 23) nocopy;
>>   Cancelling jump thread: (33, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 24) nocopy;
>>   Cancelling jump thread: (24, 25) incoming edge;  (25, 26) joiner;  (26, 4) nocopy; (4, 37) nocopy; (37, 36) nocopy; (36, 24) nocopy;
>>
>> Here is the structure of the CFG with the loops:
>>
>> (gdb) p debug_loops (2)
>> loop_0 (header = 0, latch = 1, niter = )
>> {
>>   bb_2 (preds = {bb_0 }, succs = {bb_3 bb_27 })
>>   bb_3 (preds = {bb_2 }, succs = {bb_5 bb_6 })
>>   bb_5 (preds = {bb_4 bb_3 }, succs = {bb_27 })
>>   bb_6 (preds = {bb_3 }, succs = {bb_36 })
>>   bb_27 (preds = {bb_5 bb_25 bb_26 bb_2 }, succs = {bb_1 })
>>   loop_1 (header = 36, latch = 37, niter = )
>>   {
>>     bb_4 (preds = {bb_26 }, succs = {bb_5 bb_37 })
>>     bb_37 (preds = {bb_4 }, succs = {bb_36 })
>>     bb_36 (preds = {bb_6 bb_37 }, succs = {bb_25 bb_7 bb_11 bb_20 bb_14 bb_17 bb_23 bb_24 })
>>     bb_7 (preds = {bb_36 }, succs = {bb_10 bb_28 })
>>     bb_8 (preds = {bb_28 }, succs = {bb_10 bb_9 })
>>     bb_9 (preds = {bb_8 }, succs = {bb_10 })
>>     bb_10 (preds = {bb_7 bb_28 bb_8 bb_9 }, succs = {bb_25 })
>>     bb_11 (preds = {bb_36 }, succs = {bb_29 bb_30 })
>>     bb_12 (preds = {bb_30 }, succs = {bb_25 })
>>     bb_13 (preds = {bb_30 }, succs = {bb_25 })
>>     bb_14 (preds = {bb_36 }, succs = {bb_15 bb_16 })
>>     bb_15 (preds = {bb_14 }, succs = {bb_25 })
>>     bb_16 (preds = {bb_14 }, succs = {bb_25 bb_31 })
>>     bb_17 (preds = {bb_36 }, succs = {bb_18 bb_19 })
>>     bb_18 (preds = {bb_17 }, succs = {bb_25 })
>>     bb_19 (preds = {bb_17 }, succs = {bb_25 bb_32 })
>>     bb_20 (preds = {bb_36 }, succs = {bb_21 bb_22 })
>>     bb_21 (preds = {bb_20 }, succs = {bb_25 })
>>     bb_22 (preds = {bb_20 }, succs = {bb_25 })
>>     bb_23 (preds = {bb_36 }, succs = {bb_33 bb_34 })
>>     bb_24 (preds = {bb_36 }, succs = {bb_25 bb_35 })
>>     bb_25 (preds = {bb_10 bb_12 bb_16 bb_19 bb_22 bb_34 bb_35 bb_36 bb_29 bb_13 bb_15 bb_31 bb_18 bb_32 bb_21 bb_33 bb_24 }, succs = {bb_26 bb_27 })
>>     bb_26 (preds = {bb_25 }, succs = {bb_4 bb_27 })
>>     bb_28 (preds = {bb_7 }, succs = {bb_10 bb_8 })
>>     bb_29 (preds = {bb_11 }, succs = {bb_25 })
>>     bb_30 (preds = {bb_11 }, succs = {bb_12 bb_13 })
>>     bb_31 (preds = {bb_16 }, succs = {bb_25 })
>>     bb_32 (preds = {bb_19 }, succs = {bb_25 })
>>     bb_33 (preds = {bb_23 }, succs = {bb_25 })
>>     bb_34 (preds = {bb_23 }, succs = {bb_25 })
>>     bb_35 (preds = {bb_24 }, succs = {bb_25 })
>>   }
>> }
>>
>> What about removing the use of e->aux in threadupdate.c, to be able to jump
>> thread across all the recorded paths?
>>
>> Thanks,
>> Sebastian
>
>> From bac0f2a390048652910f77503b21b3e208daeae1 Mon Sep 17 00:00:00 2001
>> From: Sebastian Pop <s.pop@samsung.com>
>> Date: Fri, 26 Sep 2014 14:54:20 -0500
>> Subject: [PATCH] jump thread for PR 54742
>>
>> Adapted from a patch from James Greenhalgh.
>>
>>       * tree-ssa-threadedge.c (simplify_control_stmt_condition): Restore the
>>       original value of cond when simplification fails.
>>       (find_thread_path): New.
>>       (find_control_statement_thread_paths): New.
>>       (thread_through_normal_block): Call find_control_statement_thread_paths.
>>
>>       * testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c: New.
>> ---
>>  gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c |  32 ++++
>>  gcc/tree-ssa-threadedge.c                        | 180 ++++++++++++++++++++++-
>>  gcc/tree-ssa-threadupdate.c                      |   4 +
>>  3 files changed, 215 insertions(+), 1 deletion(-)
>>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
>>
>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
>> new file mode 100644
>> index 0000000..f3ef725
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
>> @@ -0,0 +1,32 @@
>> +int sum0, sum1, sum2, sum3;
>> +int foo(char * s, char** ret)
>> +{
>> +  int state=0;
>> +  char c;
>> +
>> +  for (; *s && state != 4; s++)
>> +    {
>> +      c = *s;
>> +      if (c == '*')
>> +     {
>> +       s++;
>> +       break;
>> +     }
>> +      switch (state) {
>> +     case 0:
>> +       if (c == '+') state = 1;
>> +       else if (c != '-') sum0+=c;
>> +       break;
>> +     case 1:
>> +       if (c == '+') state = 2;
>> +       else if (c == '-') state = 0;
>> +       else sum1+=c;
>> +       break;
>> +     default:
>> +       break;
>> +      }
>> +
>> +    }
>> +  *ret = s;
>> +  return state;
>> +}
>> diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c
>> index 3dee5ba..7b9e5b6 100644
>> --- a/gcc/tree-ssa-threadedge.c
>> +++ b/gcc/tree-ssa-threadedge.c
>> @@ -628,6 +628,7 @@ simplify_control_stmt_condition (edge e,
>>       rather than use a relational operator.  These are simpler to handle.  */
>>    if (TREE_CODE (cond) == SSA_NAME)
>>      {
>> +      tree original_lhs = cond;
>>        cached_lhs = cond;
>>
>>        /* Get the variable's current value from the equivalence chains.
>> @@ -656,6 +657,12 @@ simplify_control_stmt_condition (edge e,
>>        pass specific callback to try and simplify it further.  */
>>        if (cached_lhs && ! is_gimple_min_invariant (cached_lhs))
>>          cached_lhs = (*simplify) (stmt, stmt);
>> +
>> +      /* We couldn't find an invariant.  But, callers of this
>> +      function may be able to do something useful with the
>> +      unmodified destination.  */
>> +      if (!cached_lhs)
>> +     cached_lhs = original_lhs;
>>      }
>>    else
>>      cached_lhs = NULL;
>> @@ -915,6 +922,155 @@ thread_around_empty_blocks (edge taken_edge,
>>    return false;
>>  }
>>
>> +/* Return true if there is at least one path from START_BB to END_BB.
>> +   VISITED_BBS is used to make sure we don't fall into an infinite loop.  */
>> +
>> +static bool
>> +find_thread_path (basic_block start_bb, basic_block end_bb,
>> +                 vec<basic_block, va_gc> *&path,
>> +                 hash_set<basic_block> *visited_bbs)
>> +{
>> +  if (start_bb == end_bb)
>> +    {
>> +      vec_safe_push (path, start_bb);
>> +      return true;
>> +    }
>> +
>> +  if (!visited_bbs->add(start_bb))
>> +    {
>> +      edge e;
>> +      edge_iterator ei;
>> +      FOR_EACH_EDGE (e, ei, start_bb->succs)
>> +     if (find_thread_path (e->dest, end_bb, path, visited_bbs))
>> +       {
>> +         vec_safe_push (path, start_bb);
>> +         return true;
>> +       }
>> +    }
>> +
>> +  return false;
>> +}
>> +
>> +/* We trace the value of the variable EXPR back through any phi nodes looking
>> +   for places where it gets a constant value and save the path.  */
>> +
>> +static void
>> +find_control_statement_thread_paths (tree expr,
>> +                                  hash_set<gimple> *visited_phis,
>> +                                  vec<basic_block, va_gc> *&path)
>> +{
>> +  tree var = SSA_NAME_VAR (expr);
>> +  gimple def_stmt = SSA_NAME_DEF_STMT (expr);
>> +  basic_block var_bb = gimple_bb (def_stmt);
>> +
>> +  if (var == NULL || var_bb == NULL)
>> +    return;
>> +
>> +  vec<basic_block, va_gc> *next_path;
>> +  vec_alloc (next_path, n_basic_blocks_for_fn (cfun));
>> +
>> +  basic_block last_bb_in_path = path->last ();
>> +
>> +  /* Put the path from var_bb to last_bb_in_path into next_path.  */
>> +  if (var_bb != last_bb_in_path)
>> +    {
>> +      edge e;
>> +      int e_count = 0;
>> +      edge_iterator ei;
>> +
>> +      FOR_EACH_EDGE (e, ei, last_bb_in_path->preds)
>> +     {
>> +       hash_set<basic_block> *visited_bbs = new hash_set<basic_block>;
>> +
>> +       if (find_thread_path (var_bb, e->src, next_path, visited_bbs))
>> +         e_count = e_count + 1;
>> +
>> +       delete visited_bbs;
>> +
>> +       /* If there is more than one path, stop.  */
>> +       if (e_count > 1)
>> +         {
>> +           vec_free (next_path);
>> +           return;
>> +         }
>> +     }
>> +    }
>> +
>> +  /* Visit PHI nodes once.  */
>> +  if (gimple_code (def_stmt) != GIMPLE_PHI
>> +      || visited_phis->add(def_stmt)) {
>> +    vec_free (next_path);
>> +    return;
>> +  }
>> +
>> +  /* Append all the nodes from next_path to path.  */
>> +  vec_safe_splice (path, next_path);
>> +  gcc_assert (path->last () == var_bb);
>> +
>> +  /* Iterate over the arguments of PHI.  */
>> +  unsigned int i;
>> +  for (i = 0; i < gimple_phi_num_args (def_stmt); i++)
>> +    {
>> +      tree arg = gimple_phi_arg_def (def_stmt, i);
>> +      basic_block bbi = gimple_phi_arg_edge (def_stmt, i)->src;
>> +
>> +      /* Skip edges pointing outside the current loop.  */
>> +      if (!arg || var_bb->loop_father != bbi->loop_father)
>> +     continue;
>> +
>> +      /* Add BBI to the path.  */
>> +      vec_safe_push (path, bbi);
>> +
>> +      if (TREE_CODE (arg) == INTEGER_CST)
>> +     {
>> +       int j, n = path->length ();
>> +       vec<jump_thread_edge *> *jump_thread_path
>> +         = new vec<jump_thread_edge *> ();
>> +       int joiners = 0;
>> +
>> +       for (j = 0; j < n - 1; j++)
>> +         {
>> +           edge e = find_edge ((*path)[n - j - 1],
>> +                               (*path)[n - j - 2]);
>> +           gcc_assert (e);
>> +           enum jump_thread_edge_type kind;
>> +
>> +           if (j == 0)
>> +             kind = EDGE_START_JUMP_THREAD;
>> +           else if (single_pred_p (e->src))
>> +             kind = EDGE_NO_COPY_SRC_BLOCK;
>> +           else {
>> +             kind = EDGE_COPY_SRC_JOINER_BLOCK;
>> +             ++joiners;
>> +           }
>> +
>> +           jump_thread_edge *x = new jump_thread_edge (e, kind);
>> +           jump_thread_path->safe_push (x);
>> +         }
>> +
>> +       /* Add the edge taken when the control variable has value ARG.  */
>> +       edge taken_edge = find_taken_edge ((*path)[0], arg);
>> +       jump_thread_edge *x
>> +         = new jump_thread_edge (taken_edge, EDGE_NO_COPY_SRC_BLOCK);
>> +       jump_thread_path->safe_push (x);
>> +
>> +       /* Thread-update does not handle more than two joiners.  A path with
>> +          less than 3 nodes should not be jump-threaded.  */
>> +       if (joiners < 2 && n > 2)
>> +         register_jump_thread (jump_thread_path);
>> +     }
>> +      else if (TREE_CODE (arg) == SSA_NAME)
>> +     find_control_statement_thread_paths (arg, visited_phis, path);
>> +
>> +      /* Remove BBI from the path.  */
>> +      path->pop ();
>> +    }
>> +
>> +  /* Remove all the nodes that we added from next_path.  */
>> +  vec_safe_truncate (path, (path->length () - next_path->length ()));
>> +  vec_free (next_path);
>> +}
>> +
>>  /* We are exiting E->src, see if E->dest ends with a conditional
>>     jump which has a known value when reached via E.
>>
>> @@ -1000,7 +1156,10 @@ thread_through_normal_block (edge e,
>>        cond = simplify_control_stmt_condition (e, stmt, dummy_cond, simplify,
>>                                             handle_dominating_asserts);
>>
>> -      if (cond && is_gimple_min_invariant (cond))
>> +      if (!cond)
>> +     return 0;
>> +
>> +      if (is_gimple_min_invariant (cond))
>>       {
>>         edge taken_edge = find_taken_edge (e->dest, cond);
>>         basic_block dest = (taken_edge ? taken_edge->dest : NULL);
>> @@ -1046,7 +1205,25 @@ thread_through_normal_block (edge e,
>>                                     backedge_seen_p);
>>         return 1;
>>       }
>> +
>> +      if (TREE_CODE (cond) != SSA_NAME
>> +       || e->dest->loop_father != e->src->loop_father)
>> +     return 0;
>> +
>> +      /* When COND cannot be simplified, try to find paths from a control
>> +      statement back through the PHI nodes which would affect that control
>> +      statement.  */
>> +      vec<basic_block, va_gc> *bb_path;
>> +      vec_alloc (bb_path, n_basic_blocks_for_fn (cfun));
>> +      vec_safe_push (bb_path, e->dest);
>> +      hash_set<gimple> *visited_phis = new hash_set<gimple>;
>> +
>> +      find_control_statement_thread_paths (cond, visited_phis, bb_path);
>> +
>> +      delete visited_phis;
>> +      vec_free (bb_path);
>> +
>> +      return -1;
>>      }
>>    return 0;
>>  }
>> --
>> 2.1.0.243.g30d45f7
>>
>

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-11-17  9:29               ` James Greenhalgh
@ 2014-11-18 19:36                 ` Steve Ellcey
  2014-11-18 20:04                   ` Jeff Law
  0 siblings, 1 reply; 54+ messages in thread
From: Steve Ellcey @ 2014-11-18 19:36 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: Sebastian Pop, Jeff Law, Richard Biener, GCC Patches

On Mon, 2014-11-17 at 09:24 +0000, James Greenhalgh wrote:

> For what it is worth, I've bootstrapped and tested this patch on
> aarch64-none-linux-gnu and arm-none-linux-gnueabi with no issues, and
> both targets get the expected speedup in the interesting benchmark.
> I've also thrown some of the larger popular benchmark suites at it, and
> they've compiled and run without any compilation issues or miscompares.
> 
> I'm happy to help out with the testing and bug-triaging effort once this
> patch goes in.
> 
> Some very shallow comments: you should document the new parameters
> in doc/invoke.texi and you ought to run contrib/check_GNU_style.sh
> on the patch and clean up the coding style issues it highlights.
> 
> Thanks,
> James Greenhalgh

I tested the patch on MIPS and things looked good there too.  I got the
desired speedup and did not see any regressions.

Steve Ellcey

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-11-18 19:36                 ` Steve Ellcey
@ 2014-11-18 20:04                   ` Jeff Law
  0 siblings, 0 replies; 54+ messages in thread
From: Jeff Law @ 2014-11-18 20:04 UTC (permalink / raw)
  To: Steve Ellcey, James Greenhalgh; +Cc: Sebastian Pop, Richard Biener, GCC Patches

On 11/18/14 12:25, Steve Ellcey wrote:
> On Mon, 2014-11-17 at 09:24 +0000, James Greenhalgh wrote:
>
>> For what it is worth, I've bootstrapped and tested this patch on
>> aarch64-none-linux-gnu and arm-none-linux-gnueabi with no issues, and
>> both targets get the expected speedup in the interesting benchmark.
>> I've also thrown some of the larger popular benchmark suites at it, and
>> they've compiled and run without any compilation issues or miscompares.
>>
>> I'm happy to help out with the testing and bug-triaging effort once this
>> patch goes in.
>>
>> Some very shallow comments: you should document the new parameters
>> in doc/invoke.texi and you ought to run contrib/check_GNU_style.sh
>> on the patch and clean up the coding style issues it highlights.
>>
>> Thanks,
>> James Greenhalgh
>
> I tested the patch on MIPS and things looked good there too.  I got the
> desired speedup and did not see any regressions.
It's on my list of things to look at.  The patch was clearly posted prior 
to the stage1 close and is thus eligible for inclusion in GCC 5.  I'm 
slogging through the queue as quickly as I can.


jeff

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-11-17 12:47               ` Richard Biener
@ 2014-11-18 22:29                 ` Sebastian Pop
  2014-11-22 23:41                   ` Jeff Law
  0 siblings, 1 reply; 54+ messages in thread
From: Sebastian Pop @ 2014-11-18 22:29 UTC (permalink / raw)
  To: Richard Biener; +Cc: Jeff Law, James Greenhalgh, Steve Ellcey, GCC Patches

[-- Attachment #1: Type: text/plain, Size: 2356 bytes --]

Richard Biener wrote:
> On Tue, Nov 11, 2014 at 2:14 AM, Sebastian Pop <sebpop@gmail.com> wrote:
> > Hi Jeff,
> >
> > I have adapted the code generation part from James' patch to current trunk, and
> > the resulting patch gets the 30% speedup on coremark and passes bootstrap of GCC.
> >
> > Ok for trunk?
> 
> In addition to missing documentation for the parameters
> 
> +      /* If we are copying an abnormally shaped region, keep track of
> +        which block will become our loop header.  */
> +      if ((region[i] != entry->dest && region[i] == loop->header)
> +         || (region[i] != entry->src && region[i] == loop->latch))
> +       {
> +         save_loop_details = true;
> +         memo_loop_latch_no = i;
> +         memo_loop_header_no = i;
> +       }
> 
> this looks bogus as you overwrite latch/header.  

Right, this should be:

    if (region[i] != entry->dest && region[i] == loop->header)
      {
        save_loop_details = true;
        memo_loop_header_no = i;
      }

    if (region[i] != entry->src && region[i] == loop->latch)
      {
        save_loop_details = true;
        memo_loop_latch_no = i;
      }


> I wonder what you
> tried to fix with this as "abnormally shaped" isn't sth we support
> given the check before (all blocks must belong to the same loop
> and thus entry is always the loop header or there is no loop)?
> 

The regions that we duplicate start inside a loop and stay inside that same
loop, and the jump-threading path is not allowed to enter more deeply nested
loops.

The reason we need to modify the sese duplication function is that the sese
region we need to duplicate starts at an arbitrary place inside the loop,
whereas its current user (tree-ssa-loop-ch.c:245) starts at the edge entering
the loop and exits at the latch edge.
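
To make the intent concrete: on the state machine in the new test case, the
duplication the pass aims for is equivalent, at source level, to hand-threading
the states with direct jumps — one specialized copy of the loop body per known
value of `state`, so control flows from state to state without re-entering the
switch.  This is an illustrative rewrite (the name `foo_threaded` and the
`st0`/`st1`/`st2` labels are hypothetical), not compiler output:

```c
#include <assert.h>

static int sum0, sum1;

/* Hand-threaded equivalent of foo() from the test case: the loop body
   is duplicated once per known value of STATE, so each copy jumps
   directly to the next state's copy instead of re-dispatching through
   "switch (state)".  */
static int
foo_threaded (char *s, char **ret)
{
  char c;
 st0:                           /* state == 0 */
  if (!*s) { *ret = s; return 0; }
  c = *s;
  if (c == '*') { s++; *ret = s; return 0; }
  s++;
  if (c == '+') goto st1;
  if (c != '-') sum0 += c;
  goto st0;
 st1:                           /* state == 1 */
  if (!*s) { *ret = s; return 1; }
  c = *s;
  if (c == '*') { s++; *ret = s; return 1; }
  s++;
  if (c == '+') goto st2;
  if (c == '-') goto st0;
  sum1 += c;
  goto st1;
 st2:                           /* state == 2: only '*' or end of string exits */
  if (!*s) { *ret = s; return 2; }
  c = *s;
  if (c == '*') { s++; *ret = s; return 2; }
  s++;
  goto st2;
}
```

Each `stN` block here corresponds to one duplicated copy of the loop body, which
is exactly the kind of region — entered mid-loop, exiting mid-loop — that the
modified sese duplication has to handle.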

> I'll leave the rest to Jeff but it looks good to me from an overall
> structure.
> 

Thanks for your review.

Sebastian

PS: Patch passed bootstrap and regtest on x86_64-linux.

PS: I have run some perf analysis with the patch:
- on a bootstrap of GCC I see 3209 FSM jump threads
- libpng and libjpeg contain FSM jump threads, the perf increase is in the 1%
  (measured on simulators and reduced data sets)
- coremark gets jump threaded (as expected)
- I'm setting up the llvm test-suite and I will report perf differences

[-- Attachment #2: 0001-extend-jump-thread-for-finite-state-automata-PR-5474.patch --]
[-- Type: text/x-diff, Size: 16069 bytes --]

From aee8e01469c05e370b757b20c357a1c9dae57950 Mon Sep 17 00:00:00 2001
From: Sebastian Pop <s.pop@samsung.com>
Date: Fri, 26 Sep 2014 14:54:20 -0500
Subject: [PATCH] extend jump thread for finite state automata PR 54742

Adapted from a patch from James Greenhalgh.

	* params.def (max-fsm-thread-path-insns, max-fsm-thread-length,
	max-fsm-thread-paths): New.

	* doc/invoke.texi (max-fsm-thread-path-insns, max-fsm-thread-length,
	max-fsm-thread-paths): Documented.

	* testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c: New.

	* tree-cfg.c (gimple_duplicate_sese_region): Save and restore loop
	header and latch.

	* tree-ssa-threadedge.c (simplify_control_stmt_condition): Restore the
	original value of cond when simplification fails.
	(fsm_find_thread_path): New.
	(fsm_find_control_statement_thread_paths): New.
	(thread_through_normal_block): Call
	fsm_find_control_statement_thread_paths.

	* tree-ssa-threadupdate.c (dump_jump_thread_path): Pretty print
	EDGE_START_FSM_THREAD.
	(thread_through_all_blocks): Generate code for EDGE_START_FSM_THREAD edges
	calling gimple_duplicate_sese_region.

	* tree-ssa-threadupdate.h (jump_thread_edge_type): Add EDGE_START_FSM_THREAD.
---
 gcc/doc/invoke.texi                              |  12 ++
 gcc/params.def                                   |  15 ++
 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c |  38 +++++
 gcc/tree-cfg.c                                   |  26 ++-
 gcc/tree-ssa-threadedge.c                        | 201 ++++++++++++++++++++++-
 gcc/tree-ssa-threadupdate.c                      |  52 +++++-
 gcc/tree-ssa-threadupdate.h                      |   1 +
 7 files changed, 340 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 13270bc..613edbb 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -10586,6 +10586,18 @@ large and significantly increase compile time at optimization level
 @option{-O1} and higher.  This parameter is a maximum nubmer of statements
 in a single generated constructor.  Default value is 5000.
 
+@item max-fsm-thread-path-insns
+Maximum number of instructions to copy when duplicating blocks on a
+finite state automaton jump thread path.  The default is 100.
+
+@item max-fsm-thread-length
+Maximum number of basic blocks on a finite state automaton jump thread
+path.  The default is 10.
+
+@item max-fsm-thread-paths
+Maximum number of new jump thread paths to create for a finite state
+automaton.  The default is 50.
+
 @end table
 @end table
 
diff --git a/gcc/params.def b/gcc/params.def
index d2d2add..55c287e 100644
--- a/gcc/params.def
+++ b/gcc/params.def
@@ -1125,6 +1125,21 @@ DEFPARAM (PARAM_CHKP_MAX_CTOR_SIZE,
 	  "Maximum number of statements to be included into a single static "
 	  "constructor generated by Pointer Bounds Checker",
 	  5000, 100, 0)
+
+DEFPARAM (PARAM_MAX_FSM_THREAD_PATH_INSNS,
+	  "max-fsm-thread-path-insns",
+	  "Maximum number of instructions to copy when duplicating blocks on a finite state automaton jump thread path",
+	  100, 1, 999999)
+
+DEFPARAM (PARAM_MAX_FSM_THREAD_LENGTH,
+	  "max-fsm-thread-length",
+	  "Maximum number of basic blocks on a finite state automaton jump thread path",
+	  10, 1, 999999)
+
+DEFPARAM (PARAM_MAX_FSM_THREAD_PATHS,
+	  "max-fsm-thread-paths",
+	  "Maximum number of new jump thread paths to create for a finite state automaton",
+	  50, 1, 999999)
 /*
 
 Local variables:
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
new file mode 100644
index 0000000..310d3db
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
@@ -0,0 +1,38 @@
+int sum0, sum1, sum2, sum3;
+int foo (char *s, char **ret)
+{
+  int state=0;
+  char c;
+
+  for (; *s && state != 4; s++)
+    {
+      c = *s;
+      if (c == '*')
+	{
+	  s++;
+	  break;
+	}
+      switch (state)
+	{
+	case 0:
+	  if (c == '+')
+	    state = 1;
+	  else if (c != '-')
+	    sum0+=c;
+	  break;
+	case 1:
+	  if (c == '+')
+	    state = 2;
+	  else if (c == '-')
+	    state = 0;
+	  else
+	    sum1+=c;
+	  break;
+	default:
+	  break;
+	}
+
+    }
+  *ret = s;
+  return state;
+}
diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
index ee10bc6..297749f 100644
--- a/gcc/tree-cfg.c
+++ b/gcc/tree-cfg.c
@@ -5949,10 +5949,12 @@ gimple_duplicate_sese_region (edge entry, edge exit,
 {
   unsigned i;
   bool free_region_copy = false, copying_header = false;
+  bool save_loop_details = false;
   struct loop *loop = entry->dest->loop_father;
   edge exit_copy;
   vec<basic_block> doms;
   edge redirected;
+  int memo_loop_header_no = 0, memo_loop_latch_no = 0;
   int total_freq = 0, entry_freq = 0;
   gcov_type total_count = 0, entry_count = 0;
 
@@ -5970,9 +5972,20 @@ gimple_duplicate_sese_region (edge entry, edge exit,
       if (region[i]->loop_father != loop)
 	return false;
 
-      if (region[i] != entry->dest
-	  && region[i] == loop->header)
-	return false;
+      /* If we are copying a region that starts and ends in an arbitrary place in
+	 the loop: keep track of which block will become our loop header.  */
+      if (region[i] != entry->dest && region[i] == loop->header)
+	{
+	  save_loop_details = true;
+	  memo_loop_header_no = i;
+	}
+
+      /* And which block will become our loop latch.  */
+      if (region[i] != entry->src && region[i] == loop->latch)
+	{
+	  save_loop_details = true;
+	  memo_loop_latch_no = i;
+	}
     }
 
   /* In case the function is used for loop header copying (which is the primary
@@ -6055,6 +6068,13 @@ gimple_duplicate_sese_region (edge entry, edge exit,
       loop->latch = exit->src;
     }
 
+  /* Restore loop details if we were asked to save them.  */
+  if (save_loop_details)
+    {
+      loop->header = region[memo_loop_header_no];
+      loop->latch = region[memo_loop_latch_no];
+    }
+
   /* Redirect the entry and add the phi node arguments.  */
   redirected = redirect_edge_and_branch (entry, get_bb_copy (entry->dest));
   gcc_assert (redirected != NULL);
diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c
index d5b9696..edd3c49 100644
--- a/gcc/tree-ssa-threadedge.c
+++ b/gcc/tree-ssa-threadedge.c
@@ -56,6 +56,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "params.h"
 #include "tree-ssa-threadedge.h"
 #include "builtins.h"
+#include "cfg.h"
+#include "cfganal.h"
 
 /* To avoid code explosion due to jump threading, we limit the
    number of statements we are going to copy.  This variable
@@ -660,6 +662,7 @@ simplify_control_stmt_condition (edge e,
      rather than use a relational operator.  These are simpler to handle.  */
   if (TREE_CODE (cond) == SSA_NAME)
     {
+      tree original_lhs = cond;
       cached_lhs = cond;
 
       /* Get the variable's current value from the equivalence chains.
@@ -688,6 +691,12 @@ simplify_control_stmt_condition (edge e,
 	 pass specific callback to try and simplify it further.  */
       if (cached_lhs && ! is_gimple_min_invariant (cached_lhs))
         cached_lhs = (*simplify) (stmt, stmt);
+
+      /* We couldn't find an invariant.  But, callers of this
+	 function may be able to do something useful with the
+	 unmodified destination.  */
+      if (!cached_lhs)
+	cached_lhs = original_lhs;
     }
   else
     cached_lhs = NULL;
@@ -947,6 +956,173 @@ thread_around_empty_blocks (edge taken_edge,
   return false;
 }
 
+/* Return true if there is at least one path from START_BB to END_BB.
+   VISITED_BBS is used to make sure we don't fall into an infinite loop.  */
+
+static bool
+fsm_find_thread_path (basic_block start_bb, basic_block end_bb,
+		      vec<basic_block, va_gc> *&path,
+		      hash_set<basic_block> *visited_bbs, int n_insns)
+{
+  if (start_bb == end_bb)
+    {
+      vec_safe_push (path, start_bb);
+      return true;
+    }
+
+  if (!visited_bbs->add (start_bb))
+    {
+      edge e;
+      edge_iterator ei;
+      FOR_EACH_EDGE (e, ei, start_bb->succs)
+	if (fsm_find_thread_path (e->dest, end_bb, path, visited_bbs, n_insns))
+	  {
+	    vec_safe_push (path, start_bb);
+	    return true;
+	  }
+    }
+
+  return false;
+}
+
+static int max_threaded_paths;
+
+/* We trace the value of the variable EXPR back through any phi nodes looking
+   for places where it gets a constant value and save the path.  Stop after
+   having recorded MAX_PATHS jump threading paths.  */
+
+static void
+fsm_find_control_statement_thread_paths (tree expr,
+					 hash_set<gimple> *visited_phis,
+					 vec<basic_block, va_gc> *&path)
+{
+  tree var = SSA_NAME_VAR (expr);
+  gimple def_stmt = SSA_NAME_DEF_STMT (expr);
+  basic_block var_bb = gimple_bb (def_stmt);
+
+  if (var == NULL || var_bb == NULL)
+    return;
+
+  vec<basic_block, va_gc> *next_path;
+  vec_alloc (next_path, n_basic_blocks_for_fn (cfun));
+
+  basic_block last_bb_in_path = path->last ();
+
+  /* Put the path from var_bb to last_bb_in_path into next_path.  */
+  if (var_bb != last_bb_in_path)
+    {
+      edge e;
+      int e_count = 0;
+      edge_iterator ei;
+
+      FOR_EACH_EDGE (e, ei, last_bb_in_path->preds)
+	{
+	  hash_set<basic_block> *visited_bbs = new hash_set<basic_block>;
+
+	  if (fsm_find_thread_path (var_bb, e->src, next_path, visited_bbs, 0))
+	    ++e_count;
+
+	  delete visited_bbs;
+
+	  /* If there is more than one path, stop.  */
+	  if (e_count > 1)
+	    {
+	      vec_free (next_path);
+	      return;
+	    }
+	}
+    }
+
+  /* Visit PHI nodes once.  */
+  if (gimple_code (def_stmt) != GIMPLE_PHI
+      || visited_phis->add (def_stmt))
+    {
+      vec_free (next_path);
+      return;
+    }
+
+  /* Append all the nodes from next_path to path.  */
+  vec_safe_splice (path, next_path);
+  gcc_assert (path->last () == var_bb);
+
+  /* Iterate over the arguments of PHI.  */
+  unsigned int i;
+  for (i = 0; i < gimple_phi_num_args (def_stmt); i++)
+    {
+      tree arg = gimple_phi_arg_def (def_stmt, i);
+      basic_block bbi = gimple_phi_arg_edge (def_stmt, i)->src;
+
+      /* Skip edges pointing outside the current loop.  */
+      if (!arg || var_bb->loop_father != bbi->loop_father)
+	continue;
+
+      /* Add BBI to the path.  */
+      vec_safe_push (path, bbi);
+
+      if (TREE_CODE (arg) == INTEGER_CST)
+	{
+	  int j, n = path->length ();
+	  vec<jump_thread_edge *> *jump_thread_path
+	    = new vec<jump_thread_edge *> ();
+	  int joiners = 0;
+
+	  for (j = 0; j < n - 1; j++)
+	    {
+	      edge e = find_edge ((*path)[n - j - 1],
+				  (*path)[n - j - 2]);
+	      gcc_assert (e);
+	      enum jump_thread_edge_type kind;
+
+	      if (j == 0)
+		kind = EDGE_START_FSM_THREAD;
+	      else if (single_pred_p (e->src))
+		kind = EDGE_NO_COPY_SRC_BLOCK;
+	      else {
+		kind = EDGE_COPY_SRC_JOINER_BLOCK;
+		++joiners;
+	      }
+
+	      jump_thread_edge *x = new jump_thread_edge (e, kind);
+	      jump_thread_path->safe_push (x);
+	    }
+
+	  /* Add the edge taken when the control variable has value ARG.  */
+	  edge taken_edge = find_taken_edge ((*path)[0], arg);
+	  jump_thread_edge *x
+	    = new jump_thread_edge (taken_edge, EDGE_NO_COPY_SRC_BLOCK);
+	  jump_thread_path->safe_push (x);
+
+	  /* A path with less than 3 nodes should not be jump-threaded.  */
+	  if (n > 2 && n < PARAM_VALUE (PARAM_MAX_FSM_THREAD_LENGTH)
+	      && max_threaded_paths > 0)
+	    {
+	      int n_insns = 0;
+	      gimple_stmt_iterator gsi;
+
+	      for (j = 1; j < n - 1; j++)
+		for (gsi = gsi_start_bb ((*path)[j]); !gsi_end_p (gsi);
+		     gsi_next (&gsi))
+		  ++n_insns;
+
+	      if (n_insns < PARAM_VALUE (PARAM_MAX_FSM_THREAD_PATH_INSNS))
+		{
+		  register_jump_thread (jump_thread_path);
+		  --max_threaded_paths;
+		}
+	    }
+	}
+      else if (TREE_CODE (arg) == SSA_NAME)
+	fsm_find_control_statement_thread_paths (arg, visited_phis, path);
+
+      /* Remove BBI from the path.  */
+      path->pop ();
+    }
+
+  /* Remove all the nodes that we added from next_path.  */
+  vec_safe_truncate (path, (path->length () - next_path->length ()));
+  vec_free (next_path);
+}
+
 /* We are exiting E->src, see if E->dest ends with a conditional
    jump which has a known value when reached via E.
 
@@ -1032,7 +1208,10 @@ thread_through_normal_block (edge e,
       cond = simplify_control_stmt_condition (e, stmt, dummy_cond, simplify,
 					      handle_dominating_asserts);
 
-      if (cond && is_gimple_min_invariant (cond))
+      if (!cond)
+	return 0;
+
+      if (is_gimple_min_invariant (cond))
 	{
 	  edge taken_edge = find_taken_edge (e->dest, cond);
 	  basic_block dest = (taken_edge ? taken_edge->dest : NULL);
@@ -1078,6 +1257,26 @@ thread_through_normal_block (edge e,
 				      backedge_seen_p);
 	  return 1;
 	}
+
+      if (TREE_CODE (cond) != SSA_NAME
+	  || e->dest->loop_father != e->src->loop_father)
+	return 0;
+
+      /* When COND cannot be simplified, try to find paths from a control
+	 statement back through the PHI nodes which would affect that control
+	 statement.  */
+      vec<basic_block, va_gc> *bb_path;
+      vec_alloc (bb_path, n_basic_blocks_for_fn (cfun));
+      vec_safe_push (bb_path, e->dest);
+      hash_set<gimple> *visited_phis = new hash_set<gimple>;
+
+      max_threaded_paths = PARAM_VALUE (PARAM_MAX_FSM_THREAD_PATHS);
+      fsm_find_control_statement_thread_paths (cond, visited_phis, bb_path);
+
+      delete visited_phis;
+      vec_free (bb_path);
+
+      return -1;
     }
   return 0;
 }
diff --git a/gcc/tree-ssa-threadupdate.c b/gcc/tree-ssa-threadupdate.c
index 151ed83..ec8e21f 100644
--- a/gcc/tree-ssa-threadupdate.c
+++ b/gcc/tree-ssa-threadupdate.c
@@ -167,8 +167,9 @@ dump_jump_thread_path (FILE *dump_file, vec<jump_thread_edge *> path,
 		       bool registering)
 {
   fprintf (dump_file,
-	   "  %s jump thread: (%d, %d) incoming edge; ",
+	   "  %s%s jump thread: (%d, %d) incoming edge; ",
 	   (registering ? "Registering" : "Cancelling"),
+	   (path[0]->type == EDGE_START_FSM_THREAD ? " FSM": ""),
 	   path[0]->e->src->index, path[0]->e->dest->index);
 
   for (unsigned int i = 1; i < path.length (); i++)
@@ -2343,6 +2344,55 @@ thread_through_all_blocks (bool may_peel_loop_headers)
   threaded_blocks = BITMAP_ALLOC (NULL);
   memset (&thread_stats, 0, sizeof (thread_stats));
 
+  for (i = 0; i < paths.length ();)
+    {
+      vec<jump_thread_edge *> *path = paths[i];
+      edge entry = (*path)[0]->e;
+
+      if ((*path)[0]->type != EDGE_START_FSM_THREAD
+	  /* Do not jump-thread twice from the same block.  */
+	  || bitmap_bit_p (threaded_blocks, entry->src->index)) {
+	i++;
+	continue;
+      }
+
+      unsigned len = path->length ();
+      edge exit = (*path)[len - 1]->e;
+      basic_block *region = XNEWVEC (basic_block, len - 1);
+
+      for (unsigned int j = 0; j < len - 1; j++)
+	region[j] = (*path)[j]->e->dest;
+
+      bool success = gimple_duplicate_sese_region (entry, exit, region,
+						   len - 1, NULL, 0);
+      delete_jump_thread_path (path);
+      paths.unordered_remove (i);
+
+      if (success)
+	{
+	  /* We do not update dominance info.  */
+	  free_dominance_info (CDI_DOMINATORS);
+	  bitmap_set_bit (threaded_blocks, entry->src->index);
+	}
+    }
+
+  for (i = 0; i < paths.length ();)
+    {
+      vec<jump_thread_edge *> *path = paths[i];
+      edge entry = (*path)[0]->e;
+
+      /* Do not jump-thread twice from the same block.  */
+      if (bitmap_bit_p (threaded_blocks, entry->src->index))
+	{
+	  delete_jump_thread_path (path);
+	  paths.unordered_remove (i);
+	}
+      else
+	i++;
+    }
+
+  bitmap_clear (threaded_blocks);
+
   mark_threaded_blocks (threaded_blocks);
 
   initialize_original_copy_tables ();
diff --git a/gcc/tree-ssa-threadupdate.h b/gcc/tree-ssa-threadupdate.h
index 426aca5..42c3a9e 100644
--- a/gcc/tree-ssa-threadupdate.h
+++ b/gcc/tree-ssa-threadupdate.h
@@ -26,6 +26,7 @@ extern bool thread_through_all_blocks (bool);
 enum jump_thread_edge_type
 {
   EDGE_START_JUMP_THREAD,
+  EDGE_START_FSM_THREAD,
   EDGE_COPY_SRC_BLOCK,
   EDGE_COPY_SRC_JOINER_BLOCK,
   EDGE_NO_COPY_SRC_BLOCK
-- 
2.1.0.243.g30d45f7


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-10-26 21:34           ` [Patch] Improving jump-thread " Sebastian Pop
  2014-11-11  1:40             ` Sebastian Pop
@ 2014-11-19 22:35             ` Jeff Law
  1 sibling, 0 replies; 54+ messages in thread
From: Jeff Law @ 2014-11-19 22:35 UTC (permalink / raw)
  To: Sebastian Pop; +Cc: Richard Biener, James Greenhalgh, Steve Ellcey, GCC Patches

On 10/26/14 15:34, Sebastian Pop wrote:
>
> I have tried to understand why the code generation part ICEs on coremark: the
> first problem that I have seen is that tree-ssa-threadupdate.c does not handle
> more than a joiner block per path to be threaded, so we would not be able to
> jump thread accross the joiners of the if condition and the joiner of the switch
> condition: i.e., these paths
Right.  There's nothing I can think of inherently that makes that 
impossible; it's just not something the code currently tries to support. 
  I suspect there are a few places that need light generalization.  I can 
offhand think of one or two.

It's not lost on me that what we're building is a specialized region 
cloner.  I keep wanting to go back and see if there's a representation 
where we have an incoming edge, series of blocks and an outgoing edge. 
The concept of a "join" block really isn't important.  It's just a block 
that we want to copy for which we don't know its destination.

Anyway, within that representation, I think the way to wire up the edges 
is simple if we build a map at the beginning of the process.   The set 
of blocks in the path all get clones.  There's a single edge into the 
cloned path (the incoming edge) and a single edge out (the edge 
corresponding to the statically computed destination of the path). 
Edges that were interior in the original path are kept interior in the 
clone.  Edges that reached outside the original path go from the clone 
to the original destinations outside the path.  It's a SEME region.
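
The edge-wiring rule described above can be sketched as a plain remapping pass
over a toy CFG (hypothetical code and names, not GCC's internal API): every
block on the path gets a clone, successors that lie on the path are redirected
to the corresponding clone, and successors outside the path keep their original
destinations, so the copy is a SEME region.

```c
#include <assert.h>

#define NBLOCKS 8
#define MAXSUCC 2

/* Toy CFG: succ[b][k] is the k-th successor of block b, or -1.  */
static int succ[NBLOCKS][MAXSUCC];

/* Clone the blocks PATH[0..LEN-1] as FIRST_CLONE, FIRST_CLONE + 1, ...
   COPY_OF maps originals to clones; CSUCC receives the successors of
   each clone.  Edges interior to the path stay interior in the clone;
   edges reaching outside the path go from the clone to the original
   destination.  */
static void
clone_path (const int *path, int len, int first_clone,
	    int copy_of[NBLOCKS], int csucc[][MAXSUCC])
{
  for (int b = 0; b < NBLOCKS; b++)
    copy_of[b] = -1;
  for (int i = 0; i < len; i++)
    copy_of[path[i]] = first_clone + i;

  for (int i = 0; i < len; i++)
    for (int k = 0; k < MAXSUCC; k++)
      {
	int dest = succ[path[i]][k];
	/* Redirect DEST to its clone when it lies on the path.  */
	csucc[i][k] = (dest >= 0 && copy_of[dest] >= 0)
		      ? copy_of[dest] : dest;
      }
}
```

With a map like `copy_of` built up front, whether a block was a "join" block is
irrelevant to the rewiring itself — only membership in the path matters.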


>
> Another problem is that we attach the path to be threaded to the ->aux field of
> the first edge in the path, such that we would have to cancel some of the paths
> because we cannot keep track of all the paths to be threaded.
What I can easily see is cases where you have two paths starting at a 
particular node where one path is a strict subset of another path. 
Assuming the amount of copying we're doing is reasonable, you'd want to 
keep just the longer path.

But I don't think that's the case you're struggling with.  We could (for 
example) have a case where all the successors of a join block become 
threadable, possibly to different destinations.

That would argue that we really want to handle the threads off a 
different data structure than e->aux.
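
A hedged sketch of that "different data structure" idea: replace the single
`e->aux` pointer with a growable per-edge list of candidate paths, so several
threads starting at the same edge can be recorded and pruned later (e.g. when
one path is a strict subset of another).  Types and names below are
illustrative, not GCC's.

```c
#include <assert.h>
#include <stdlib.h>

/* Illustrative path record: the blocks on one jump-thread path.  */
typedef struct path { int *blocks; int len; } path_t;

/* Per-edge container replacing the single e->aux pointer: a growable
   array of candidate paths registered against the same incoming edge.  */
typedef struct edge_paths {
  path_t **paths;
  int n, cap;
} edge_paths_t;

/* Record one more candidate path on this edge, growing the array.  */
static void
register_path (edge_paths_t *ep, path_t *p)
{
  if (ep->n == ep->cap)
    {
      ep->cap = ep->cap ? 2 * ep->cap : 4;
      ep->paths = realloc (ep->paths, ep->cap * sizeof *ep->paths);
    }
  ep->paths[ep->n++] = p;
}

/* Pick the longest recorded path, the choice suggested above when one
   candidate is a strict subset of another.  */
static path_t *
best_path (edge_paths_t *ep)
{
  path_t *best = 0;
  for (int i = 0; i < ep->n; i++)
    if (!best || ep->paths[i]->len > best->len)
      best = ep->paths[i];
  return best;
}
```

The point of the container is only that registration no longer clobbers earlier
candidates; the pruning policy stays a separate decision.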


jeff

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-11-18 22:29                 ` Sebastian Pop
@ 2014-11-22 23:41                   ` Jeff Law
  2014-11-24  0:06                     ` Sebastian Pop
  0 siblings, 1 reply; 54+ messages in thread
From: Jeff Law @ 2014-11-22 23:41 UTC (permalink / raw)
  To: Sebastian Pop, Richard Biener; +Cc: James Greenhalgh, Steve Ellcey, GCC Patches

On 11/18/14 15:19, Sebastian Pop wrote:
> The regions that we duplicate start inside a loop and stay inside the same loop,
> and the jump threading path is not allowed to go in deeper nested loops.
>
> The reason why we need to modify the sese duplication function is that the sese
> region that we need to duplicate starts at an arbitrary place inside the loop,
> whereas the current user of the sese duplication function tree-ssa-loop-ch.c:245
> starts at the edge entering the loop and exits at the latch edge.
>
>> >I'll leave the rest to Jeff but it looks good to me from an overall
>> >structure.
>> >
> Thanks for your review.
>
> Sebastian
>
> PS: Patch passed bootstrap and regtest on x86_64-linux.
>
> PS: I have run some perf analysis with the patch:
> - on a bootstrap of GCC I see 3209 FSM jump threads
> - libpng and libjpeg contain FSM jump threads, the perf increase is in the 1%
>    (measured on simulators and reduced data sets)
> - coremark gets jump threaded (as expected)
> - I'm setting up the llvm test-suite and I will report perf differences
So that's *far* more jump threads than I would expect this to find in a 
bootstrap of GCC -- like 3 orders of magnitude more than I'd expect to find.

I haven't dug deep, but the first level analysis is not encouraging.

Basically I used the trunk compiler with and without your patch to build 
gcc-4.7.3's cc1 (4.7.3 simply because that's what I last used this 
testing framework).  So that gives me two cc1s that I then use to 
compile a bunch of .i files under valgrind's (cachegrind) control.

valgrind --tool=cachegrind --cache-sim=no --branch-sim=yes ......

That gives me two hunks of data for each input file I test. 
Specifically I get the dynamic number of instructions and the dynamic 
number of branches executed.

For jump threading those values correspond directly to the effect we're 
looking for -- a reduction in dynamic conditional jumps and a reduction 
in dynamic instructions executed.  Typically the change in dynamic 
instructions executed is 2-3X the change in dynamic conditional jumps -- 
which makes sense as removing the conditional jump usually means we 
remove a comparison and possibly some setup code as well.

With your patch those values consistently get worse.  Across the entire 
set of .i files I get:

For the trunk:

instructions:1339016494968
branches:     243568982489

With your patch:

instructions:1339739533291
branches:     243806615986


So that's 723038323 more instructions and 237633497 more branches after 
installing your patch.  While that's just under a 0.1% regression in 
dynamic branches, it's a terrible result for this work.

I'm not sure if the threads you're optimizing are somehow hiding other 
jump threading opportunities or somehow hiding CSE-able jump conditions, 
mucking up a loop structure or something else but something very bad is 
happening here.

If I put Steve's patch through the same testing I get:

instructions:1339006760834
branches:     243565768224

Which, you'll note, is a *very* slight improvement: 3214265 fewer 
dynamic branches and 9734134 fewer total instructions executed.
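For the record, the deltas and the "just under 0.1%" figure quoted in 
this thread can be reproduced arithmetically from the raw cachegrind 
counts (a quick check, nothing more):

```cpp
#include <cstdint>

// Dynamic counts quoted above (valgrind --branch-sim=yes runs).
const int64_t trunk_insns     = 1339016494968LL;
const int64_t trunk_branches  =  243568982489LL;
const int64_t patch_insns     = 1339739533291LL;  // Sebastian's patch
const int64_t patch_branches  =  243806615986LL;
const int64_t steve_insns     = 1339006760834LL;  // Steve's patch
const int64_t steve_branches  =  243565768224LL;

// Regression of the patch vs. trunk in dynamic branches, in percent.
double branch_regression_pct ()
{
  return 100.0 * (patch_branches - trunk_branches) / trunk_branches;
}
```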

So I think we need to dig deeper into why the branching behaviour of GCC 
gets noticeably worse with your patch when it should be as good as or 
better than without your patch.

I know when I was analyzing the last update to this code, I found cases 
where we're much better off taking the shorter jump threading path 
without a joiner rather than preferring the long path with a joiner. 
IIRC, the issue was that if we selected the joiner path, then the 
duplication would create another jump threading opportunity (the 
original, shorter path without a join) that wouldn't be seen until the 
next pass of jump threading.

Jeff


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-11-22 23:41                   ` Jeff Law
@ 2014-11-24  0:06                     ` Sebastian Pop
  2014-11-24 21:33                       ` Jeff Law
  2014-11-24 23:25                       ` Jeff Law
  0 siblings, 2 replies; 54+ messages in thread
From: Sebastian Pop @ 2014-11-24  0:06 UTC (permalink / raw)
  To: Jeff Law; +Cc: Richard Biener, James Greenhalgh, Steve Ellcey, GCC Patches

[-- Attachment #1: Type: text/plain, Size: 2876 bytes --]

Jeff Law wrote:
> >PS: I have run some perf analysis with the patch:
> >- on a bootstrap of GCC I see 3209 FSM jump threads
> >- libpng and libjpeg contain FSM jump threads, the perf increase is in the 1%
> >   (measured on simulators and reduced data sets)
> >- coremark gets jump threaded (as expected)
> >- I'm setting up the llvm test-suite and I will report perf differences
> So that's *far* more jump threads than I would expect this to find
> in a bootstrap of GCC -- like 3 orders of magnitude more than I'd
> expect to find.

The second patch attached limits the search for FSM jump threads to loops.  With
that patch, we are now down to 470 jump threads in an x86_64-linux bootstrap
(and 424 jump threads on powerpc64-linux bootstrap.)

> I haven't dug deep, but the first level analysis is not encouraging.
> 
> Basically I used the trunk compiler with and without your patch to
> build gcc-4.7.3's cc1 (4.7.3 simply because that's what I last used
> this testing framework).  So that gives me two cc1s that I then use
> to compile a bunch of .i files under valgrind's (cachegrind)
> control.
> 
> valgrind --tool=cachegrind --cache-sim=no --branch-sim=yes ......
> 
> That gives me two hunks of data for each input file I test.
> Specifically I get the dynamic number of instructions and the
> dynamic number of branches executed.
> 
> For jump threading those values correspond directly to the effect
> we're looking for -- a reduction in dynamic conditional jumps and a
> reduction in dynamic instructions executed.  Typically the change in
> dynamic instructions executed is 2-3X the change in dynamic
> conditional jumps -- which makes sense as removing the conditional
> jump usually means we remove a comparison and possibly some setup
> code as well.
> 
> With your patch those values consistently get worse.  Across the
> entire set of .i files I get:
> 
> For the trunk:
> 
> instructions:1339016494968
> branches:     243568982489
> 
> With your patch:
> 
> instructions:1339739533291
> branches:     243806615986
> 
> 
> So that's 723038323 more instructions and 237633497 more branches
> after installing your patch.  While that's just under a 0.1%
> regression in dynamic branches, it's a terrible result for this
> work.

One of the reasons I think we see more branches is that in sese region copying we
do not use the knowledge of the value of the condition for the last branch in a
jump-thread path: we rely on other propagation passes to remove the branch.  The
last attached patch adds:

  /* Remove the last branch in the jump thread path.  */
  remove_ctrl_stmt_and_useless_edges (region_copy[n_region - 1], exit->dest);

Please let me know if the attached patches are producing better results on gcc.

I rebased the original patch on trunk and all patches bootstrapped together on
x86_64-linux and powerpc64-linux.

Thanks,
Sebastian

[-- Attachment #2: 0001-extend-jump-thread-for-finite-state-automata-PR-5474.patch --]
[-- Type: text/x-diff, Size: 16088 bytes --]

From 120d5490598b1a09a06c04796b4fda46be7fd7db Mon Sep 17 00:00:00 2001
From: Sebastian Pop <s.pop@samsung.com>
Date: Fri, 26 Sep 2014 14:54:20 -0500
Subject: [PATCH 1/4] extend jump thread for finite state automata PR 54742

Adapted from a patch from James Greenhalgh.

	* params.def (max-fsm-thread-path-insns, max-fsm-thread-length,
	max-fsm-thread-paths): New.

	* doc/invoke.texi (max-fsm-thread-path-insns, max-fsm-thread-length,
	max-fsm-thread-paths): Documented.

	* testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c: New.

	* tree-cfg.c (gimple_duplicate_sese_region): Save and restore loop
	header and latch.

	* tree-ssa-threadedge.c (simplify_control_stmt_condition): Restore the
	original value of cond when simplification fails.
	(fsm_find_thread_path): New.
	(fsm_find_control_statement_thread_paths): New.
	(fsm_thread_through_normal_block): Call find_control_statement_thread_paths.

	* tree-ssa-threadupdate.c (dump_jump_thread_path): Pretty print
	EDGE_START_FSM_THREAD.
	(thread_through_all_blocks): Generate code for EDGE_START_FSM_THREAD edges
	calling gimple_duplicate_sese_region.

	* tree-ssa-threadupdate.h (jump_thread_edge_type): Add EDGE_START_FSM_THREAD.
---
 gcc/doc/invoke.texi                              |  12 ++
 gcc/params.def                                   |  15 ++
 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c |  38 +++++
 gcc/tree-cfg.c                                   |  26 ++-
 gcc/tree-ssa-threadedge.c                        | 203 ++++++++++++++++++++++-
 gcc/tree-ssa-threadupdate.c                      |  52 +++++-
 gcc/tree-ssa-threadupdate.h                      |   1 +
 7 files changed, 342 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 89edddb..074183f 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -10624,6 +10624,18 @@ large and significantly increase compile time at optimization level
 @option{-O1} and higher.  This parameter is a maximum nubmer of statements
 in a single generated constructor.  Default value is 5000.
 
+@item max-fsm-thread-path-insns
+Maximum number of instructions to copy when duplicating blocks on a
+finite state automaton jump thread path.  The default is 100.
+
+@item max-fsm-thread-length
+Maximum number of basic blocks on a finite state automaton jump thread
+path.  The default is 10.
+
+@item max-fsm-thread-paths
+Maximum number of new jump thread paths to create for a finite state
+automaton.  The default is 50.
+
 @end table
 @end table
 
diff --git a/gcc/params.def b/gcc/params.def
index 6c71326..37741d3 100644
--- a/gcc/params.def
+++ b/gcc/params.def
@@ -1140,6 +1140,21 @@ DEFPARAM (PARAM_CHKP_MAX_CTOR_SIZE,
 	  "Maximum number of statements to be included into a single static "
 	  "constructor generated by Pointer Bounds Checker",
 	  5000, 100, 0)
+
+DEFPARAM (PARAM_MAX_FSM_THREAD_PATH_INSNS,
+	  "max-fsm-thread-path-insns",
+	  "Maximum number of instructions to copy when duplicating blocks on a finite state automaton jump thread path",
+	  100, 1, 999999)
+
+DEFPARAM (PARAM_MAX_FSM_THREAD_LENGTH,
+	  "max-fsm-thread-length",
+	  "Maximum number of basic blocks on a finite state automaton jump thread path",
+	  10, 1, 999999)
+
+DEFPARAM (PARAM_MAX_FSM_THREAD_PATHS,
+	  "max-fsm-thread-paths",
+	  "Maximum number of new jump thread paths to create for a finite state automaton",
+	  50, 1, 999999)
 /*
 
 Local variables:
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
new file mode 100644
index 0000000..310d3db
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
@@ -0,0 +1,38 @@
+int sum0, sum1, sum2, sum3;
+int foo (char *s, char **ret)
+{
+  int state=0;
+  char c;
+
+  for (; *s && state != 4; s++)
+    {
+      c = *s;
+      if (c == '*')
+	{
+	  s++;
+	  break;
+	}
+      switch (state)
+	{
+	case 0:
+	  if (c == '+')
+	    state = 1;
+	  else if (c != '-')
+	    sum0+=c;
+	  break;
+	case 1:
+	  if (c == '+')
+	    state = 2;
+	  else if (c == '-')
+	    state = 0;
+	  else
+	    sum1+=c;
+	  break;
+	default:
+	  break;
+	}
+
+    }
+  *ret = s;
+  return state;
+}
diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
index e78554f..6d96c52 100644
--- a/gcc/tree-cfg.c
+++ b/gcc/tree-cfg.c
@@ -5967,10 +5967,12 @@ gimple_duplicate_sese_region (edge entry, edge exit,
 {
   unsigned i;
   bool free_region_copy = false, copying_header = false;
+  bool save_loop_details = false;
   struct loop *loop = entry->dest->loop_father;
   edge exit_copy;
   vec<basic_block> doms;
   edge redirected;
+  int memo_loop_header_no = 0, memo_loop_latch_no = 0;
   int total_freq = 0, entry_freq = 0;
   gcov_type total_count = 0, entry_count = 0;
 
@@ -5988,9 +5990,20 @@ gimple_duplicate_sese_region (edge entry, edge exit,
       if (region[i]->loop_father != loop)
 	return false;
 
-      if (region[i] != entry->dest
-	  && region[i] == loop->header)
-	return false;
+      /* If we are copying a region that starts and ends in an arbitrary place in
+	 the loop: keep track of which block will become our loop header.  */
+      if (region[i] != entry->dest && region[i] == loop->header)
+	{
+	  save_loop_details = true;
+	  memo_loop_header_no = i;
+	}
+
+      /* And which block will become our loop latch.  */
+      if (region[i] != entry->src && region[i] == loop->latch)
+	{
+	  save_loop_details = true;
+	  memo_loop_latch_no = i;
+	}
     }
 
   /* In case the function is used for loop header copying (which is the primary
@@ -6073,6 +6086,13 @@ gimple_duplicate_sese_region (edge entry, edge exit,
       loop->latch = exit->src;
     }
 
+  /* Restore loop details if we were asked to save them.  */
+  if (save_loop_details)
+    {
+      loop->header = region[memo_loop_header_no];
+      loop->latch = region[memo_loop_latch_no];
+    }
+
   /* Redirect the entry and add the phi node arguments.  */
   redirected = redirect_edge_and_branch (entry, get_bb_copy (entry->dest));
   gcc_assert (redirected != NULL);
diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c
index 8b0b7b8..3939a74 100644
--- a/gcc/tree-ssa-threadedge.c
+++ b/gcc/tree-ssa-threadedge.c
@@ -56,6 +56,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "params.h"
 #include "tree-ssa-threadedge.h"
 #include "builtins.h"
+#include "cfg.h"
+#include "cfganal.h"
 
 /* To avoid code explosion due to jump threading, we limit the
    number of statements we are going to copy.  This variable
@@ -661,6 +663,7 @@ simplify_control_stmt_condition (edge e,
      rather than use a relational operator.  These are simpler to handle.  */
   if (TREE_CODE (cond) == SSA_NAME)
     {
+      tree original_lhs = cond;
       cached_lhs = cond;
 
       /* Get the variable's current value from the equivalence chains.
@@ -689,6 +692,12 @@ simplify_control_stmt_condition (edge e,
 	 pass specific callback to try and simplify it further.  */
       if (cached_lhs && ! is_gimple_min_invariant (cached_lhs))
         cached_lhs = (*simplify) (stmt, stmt);
+
+      /* We couldn't find an invariant.  But, callers of this
+	 function may be able to do something useful with the
+	 unmodified destination.  */
+      if (!cached_lhs)
+	cached_lhs = original_lhs;
     }
   else
     cached_lhs = NULL;
@@ -948,6 +957,175 @@ thread_around_empty_blocks (edge taken_edge,
   return false;
 }
 
+/* Return true if there is at least one path from START_BB to END_BB.
+   VISITED_BBS is used to make sure we don't fall into an infinite loop.  */
+
+static bool
+fsm_find_thread_path (basic_block start_bb, basic_block end_bb,
+		      vec<basic_block, va_gc> *&path,
+		      hash_set<basic_block> *visited_bbs, int n_insns)
+{
+  if (start_bb == end_bb)
+    {
+      vec_safe_push (path, start_bb);
+      return true;
+    }
+
+  if (!visited_bbs->add (start_bb))
+    {
+      edge e;
+      edge_iterator ei;
+      FOR_EACH_EDGE (e, ei, start_bb->succs)
+	if (fsm_find_thread_path (e->dest, end_bb, path, visited_bbs, n_insns))
+	  {
+	    vec_safe_push (path, start_bb);
+	    return true;
+	  }
+    }
+
+  return false;
+}
+
+static int max_threaded_paths;
+
+/* We trace the value of the variable EXPR back through any phi nodes looking
+   for places where it gets a constant value and save the path.  Stop after
+   having recorded MAX_PATHS jump threading paths.  */
+
+static void
+fsm_find_control_statement_thread_paths (tree expr,
+					 hash_set<gimple> *visited_phis,
+					 vec<basic_block, va_gc> *&path)
+{
+  tree var = SSA_NAME_VAR (expr);
+  gimple def_stmt = SSA_NAME_DEF_STMT (expr);
+  basic_block var_bb = gimple_bb (def_stmt);
+
+  if (var == NULL || var_bb == NULL)
+    return;
+
+  vec<basic_block, va_gc> *next_path;
+  vec_alloc (next_path, n_basic_blocks_for_fn (cfun));
+
+  basic_block last_bb_in_path = path->last ();
+
+  /* Put the path from var_bb to last_bb_in_path into next_path.  */
+  if (var_bb != last_bb_in_path)
+    {
+      edge e;
+      int e_count = 0;
+      edge_iterator ei;
+
+      FOR_EACH_EDGE (e, ei, last_bb_in_path->preds)
+	{
+	  hash_set<basic_block> *visited_bbs = new hash_set<basic_block>;
+
+	  if (fsm_find_thread_path (var_bb, e->src, next_path, visited_bbs, 0))
+	    ++e_count;
+
+	  delete visited_bbs;
+
+	  /* If there is more than one path, stop.  */
+	  if (e_count > 1)
+	    {
+	      vec_free (next_path);
+	      return;
+	    }
+	}
+    }
+
+  /* Visit PHI nodes once.  */
+  if (gimple_code (def_stmt) != GIMPLE_PHI
+      || visited_phis->add (def_stmt))
+    {
+      vec_free (next_path);
+      return;
+    }
+
+  gphi *phi = as_a <gphi *> (def_stmt);
+
+  /* Append all the nodes from next_path to path.  */
+  vec_safe_splice (path, next_path);
+  gcc_assert (path->last () == var_bb);
+
+  /* Iterate over the arguments of PHI.  */
+  unsigned int i;
+  for (i = 0; i < gimple_phi_num_args (phi); i++)
+    {
+      tree arg = gimple_phi_arg_def (phi, i);
+      basic_block bbi = gimple_phi_arg_edge (phi, i)->src;
+
+      /* Skip edges pointing outside the current loop.  */
+      if (!arg || var_bb->loop_father != bbi->loop_father)
+	continue;
+
+      /* Add BBI to the path.  */
+      vec_safe_push (path, bbi);
+
+      if (TREE_CODE (arg) == INTEGER_CST)
+	{
+	  int j, n = path->length ();
+	  vec<jump_thread_edge *> *jump_thread_path
+	    = new vec<jump_thread_edge *> ();
+	  int joiners = 0;
+
+	  for (j = 0; j < n - 1; j++)
+	    {
+	      edge e = find_edge ((*path)[n - j - 1],
+				  (*path)[n - j - 2]);
+	      gcc_assert (e);
+	      enum jump_thread_edge_type kind;
+
+	      if (j == 0)
+		kind = EDGE_START_FSM_THREAD;
+	      else if (single_pred_p (e->src))
+		kind = EDGE_NO_COPY_SRC_BLOCK;
+	      else {
+		kind = EDGE_COPY_SRC_JOINER_BLOCK;
+		++joiners;
+	      }
+
+	      jump_thread_edge *x = new jump_thread_edge (e, kind);
+	      jump_thread_path->safe_push (x);
+	    }
+
+	  /* Add the edge taken when the control variable has value ARG.  */
+	  edge taken_edge = find_taken_edge ((*path)[0], arg);
+	  jump_thread_edge *x
+	    = new jump_thread_edge (taken_edge, EDGE_NO_COPY_SRC_BLOCK);
+	  jump_thread_path->safe_push (x);
+
+	  /* A path with less than 3 nodes should not be jump-threaded.  */
+	  if (n > 2 && n < PARAM_VALUE (PARAM_MAX_FSM_THREAD_LENGTH)
+	      && max_threaded_paths > 0)
+	    {
+	      int n_insns = 0;
+	      gimple_stmt_iterator gsi;
+
+	      for (j = 1; j < n - 1; j++)
+		for (gsi = gsi_start_bb ((*path)[j]); !gsi_end_p (gsi);
+		     gsi_next (&gsi))
+		  ++n_insns;
+
+	      if (n_insns < PARAM_VALUE (PARAM_MAX_FSM_THREAD_PATH_INSNS))
+		{
+		  register_jump_thread (jump_thread_path);
+		  --max_threaded_paths;
+		}
+	    }
+	}
+      else if (TREE_CODE (arg) == SSA_NAME)
+	fsm_find_control_statement_thread_paths (arg, visited_phis, path);
+
+      /* Remove BBI from the path.  */
+      path->pop ();
+    }
+
+  /* Remove all the nodes that we added from next_path.  */
+  vec_safe_truncate (path, (path->length () - next_path->length ()));
+  vec_free (next_path);
+}
+
 /* We are exiting E->src, see if E->dest ends with a conditional
    jump which has a known value when reached via E.
 
@@ -1033,7 +1211,10 @@ thread_through_normal_block (edge e,
       cond = simplify_control_stmt_condition (e, stmt, dummy_cond, simplify,
 					      handle_dominating_asserts);
 
-      if (cond && is_gimple_min_invariant (cond))
+      if (!cond)
+	return 0;
+
+      if (is_gimple_min_invariant (cond))
 	{
 	  edge taken_edge = find_taken_edge (e->dest, cond);
 	  basic_block dest = (taken_edge ? taken_edge->dest : NULL);
@@ -1079,6 +1260,26 @@ thread_through_normal_block (edge e,
 				      backedge_seen_p);
 	  return 1;
 	}
+
+      if (TREE_CODE (cond) != SSA_NAME
+	  || e->dest->loop_father != e->src->loop_father)
+	return 0;
+
+      /* When COND cannot be simplified, try to find paths from a control
+	 statement back through the PHI nodes which would affect that control
+	 statement.  */
+      vec<basic_block, va_gc> *bb_path;
+      vec_alloc (bb_path, n_basic_blocks_for_fn (cfun));
+      vec_safe_push (bb_path, e->dest);
+      hash_set<gimple> *visited_phis = new hash_set<gimple>;
+
+      max_threaded_paths = PARAM_VALUE (PARAM_MAX_FSM_THREAD_PATHS);
+      fsm_find_control_statement_thread_paths (cond, visited_phis, bb_path);
+
+      delete visited_phis;
+      vec_free (bb_path);
+
+      return -1;
     }
   return 0;
 }
diff --git a/gcc/tree-ssa-threadupdate.c b/gcc/tree-ssa-threadupdate.c
index ca0b8bf..a453b5e 100644
--- a/gcc/tree-ssa-threadupdate.c
+++ b/gcc/tree-ssa-threadupdate.c
@@ -167,8 +167,9 @@ dump_jump_thread_path (FILE *dump_file, vec<jump_thread_edge *> path,
 		       bool registering)
 {
   fprintf (dump_file,
-	   "  %s jump thread: (%d, %d) incoming edge; ",
+	   "  %s%s jump thread: (%d, %d) incoming edge; ",
 	   (registering ? "Registering" : "Cancelling"),
+	   (path[0]->type == EDGE_START_FSM_THREAD ? " FSM": ""),
 	   path[0]->e->src->index, path[0]->e->dest->index);
 
   for (unsigned int i = 1; i < path.length (); i++)
@@ -2343,6 +2344,55 @@ thread_through_all_blocks (bool may_peel_loop_headers)
   threaded_blocks = BITMAP_ALLOC (NULL);
   memset (&thread_stats, 0, sizeof (thread_stats));
 
+  for (i = 0; i < paths.length ();)
+    {
+      vec<jump_thread_edge *> *path = paths[i];
+      edge entry = (*path)[0]->e;
+
+      if ((*path)[0]->type != EDGE_START_FSM_THREAD
+	  /* Do not jump-thread twice from the same block.  */
+	  || bitmap_bit_p (threaded_blocks, entry->src->index)) {
+	i++;
+	continue;
+      }
+
+      unsigned len = path->length ();
+      edge exit = (*path)[len - 1]->e;
+      basic_block *region = XNEWVEC (basic_block, len - 1);
+
+      for (unsigned int j = 0; j < len - 1; j++)
+	region[j] = (*path)[j]->e->dest;
+
+      bool success = gimple_duplicate_sese_region (entry, exit, region,
+						   len - 1, NULL, 0);
+      delete_jump_thread_path (path);
+      paths.unordered_remove (i);
+
+      if (success)
+	{
+	  /* We do not update dominance info.  */
+	  free_dominance_info (CDI_DOMINATORS);
+	  bitmap_set_bit (threaded_blocks, entry->src->index);
+	}
+    }
+
+  for (i = 0; i < paths.length ();)
+    {
+      vec<jump_thread_edge *> *path = paths[i];
+      edge entry = (*path)[0]->e;
+
+      /* Do not jump-thread twice from the same block.  */
+      if (bitmap_bit_p (threaded_blocks, entry->src->index))
+	{
+	  delete_jump_thread_path (path);
+	  paths.unordered_remove (i);
+	}
+      else
+	i++;
+    }
+
+  bitmap_clear (threaded_blocks);
+
   mark_threaded_blocks (threaded_blocks);
 
   initialize_original_copy_tables ();
diff --git a/gcc/tree-ssa-threadupdate.h b/gcc/tree-ssa-threadupdate.h
index 426aca5..42c3a9e 100644
--- a/gcc/tree-ssa-threadupdate.h
+++ b/gcc/tree-ssa-threadupdate.h
@@ -26,6 +26,7 @@ extern bool thread_through_all_blocks (bool);
 enum jump_thread_edge_type
 {
   EDGE_START_JUMP_THREAD,
+  EDGE_START_FSM_THREAD,
   EDGE_COPY_SRC_BLOCK,
   EDGE_COPY_SRC_JOINER_BLOCK,
   EDGE_NO_COPY_SRC_BLOCK
-- 
1.9.1


[-- Attachment #3: 0002-look-for-fsm-threads-only-in-loops.patch --]
[-- Type: text/x-diff, Size: 3821 bytes --]

From b3c22ccf4ba3a26ba7b2ac3760059032235f5089 Mon Sep 17 00:00:00 2001
From: Sebastian Pop <sebpop@gmail.com>
Date: Sun, 23 Nov 2014 01:03:40 -0600
Subject: [PATCH 2/4] look for fsm threads only in loops

---
 gcc/tree-ssa-threadedge.c | 84 +++++++++++++++++++++++++++--------------------
 1 file changed, 49 insertions(+), 35 deletions(-)

diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c
index 3939a74..41b6494 100644
--- a/gcc/tree-ssa-threadedge.c
+++ b/gcc/tree-ssa-threadedge.c
@@ -1064,36 +1064,7 @@ fsm_find_control_statement_thread_paths (tree expr,
 
       if (TREE_CODE (arg) == INTEGER_CST)
 	{
-	  int j, n = path->length ();
-	  vec<jump_thread_edge *> *jump_thread_path
-	    = new vec<jump_thread_edge *> ();
-	  int joiners = 0;
-
-	  for (j = 0; j < n - 1; j++)
-	    {
-	      edge e = find_edge ((*path)[n - j - 1],
-				  (*path)[n - j - 2]);
-	      gcc_assert (e);
-	      enum jump_thread_edge_type kind;
-
-	      if (j == 0)
-		kind = EDGE_START_FSM_THREAD;
-	      else if (single_pred_p (e->src))
-		kind = EDGE_NO_COPY_SRC_BLOCK;
-	      else {
-		kind = EDGE_COPY_SRC_JOINER_BLOCK;
-		++joiners;
-	      }
-
-	      jump_thread_edge *x = new jump_thread_edge (e, kind);
-	      jump_thread_path->safe_push (x);
-	    }
-
-	  /* Add the edge taken when the control variable has value ARG.  */
-	  edge taken_edge = find_taken_edge ((*path)[0], arg);
-	  jump_thread_edge *x
-	    = new jump_thread_edge (taken_edge, EDGE_NO_COPY_SRC_BLOCK);
-	  jump_thread_path->safe_push (x);
+	  int n = path->length ();
 
 	  /* A path with less than 3 nodes should not be jump-threaded.  */
 	  if (n > 2 && n < PARAM_VALUE (PARAM_MAX_FSM_THREAD_LENGTH)
@@ -1101,14 +1072,56 @@ fsm_find_control_statement_thread_paths (tree expr,
 	    {
 	      int n_insns = 0;
 	      gimple_stmt_iterator gsi;
+	      int j;
+	      loop_p loop = (*path)[0]->loop_father;
+	      bool path_crosses_loops = false;
 
 	      for (j = 1; j < n - 1; j++)
-		for (gsi = gsi_start_bb ((*path)[j]); !gsi_end_p (gsi);
-		     gsi_next (&gsi))
-		  ++n_insns;
+		{
+		  basic_block bb = (*path)[j];
+		  if (bb->loop_father != loop)
+		    {
+		      path_crosses_loops = true;
+		      break;
+		    }
+		  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
+		       gsi_next (&gsi))
+		    ++n_insns;
+		}
 
-	      if (n_insns < PARAM_VALUE (PARAM_MAX_FSM_THREAD_PATH_INSNS))
+	      if (!path_crosses_loops
+		  && n_insns < PARAM_VALUE (PARAM_MAX_FSM_THREAD_PATH_INSNS))
 		{
+		  vec<jump_thread_edge *> *jump_thread_path
+		    = new vec<jump_thread_edge *> ();
+		  int joiners = 0;
+
+		  for (j = 0; j < n - 1; j++)
+		    {
+		      edge e = find_edge ((*path)[n - j - 1],
+					  (*path)[n - j - 2]);
+		      gcc_assert (e);
+		      enum jump_thread_edge_type kind;
+
+		      if (j == 0)
+			kind = EDGE_START_FSM_THREAD;
+		      else if (single_pred_p (e->src))
+			kind = EDGE_NO_COPY_SRC_BLOCK;
+		      else {
+			kind = EDGE_COPY_SRC_JOINER_BLOCK;
+			++joiners;
+		      }
+
+		      jump_thread_edge *x = new jump_thread_edge (e, kind);
+		      jump_thread_path->safe_push (x);
+		    }
+
+		  /* Add the edge taken when the control variable has value ARG.  */
+		  edge taken_edge = find_taken_edge ((*path)[0], arg);
+		  jump_thread_edge *x
+		    = new jump_thread_edge (taken_edge, EDGE_NO_COPY_SRC_BLOCK);
+		  jump_thread_path->safe_push (x);
+
 		  register_jump_thread (jump_thread_path);
 		  --max_threaded_paths;
 		}
@@ -1262,7 +1275,8 @@ thread_through_normal_block (edge e,
 	}
 
       if (TREE_CODE (cond) != SSA_NAME
-	  || e->dest->loop_father != e->src->loop_father)
+	  || e->dest->loop_father != e->src->loop_father
+	  || loop_depth (e->dest->loop_father) == 0)
 	return 0;
 
       /* When COND cannot be simplified, try to find paths from a control
-- 
1.9.1


[-- Attachment #4: 0003-add-FSM-debug.patch --]
[-- Type: text/x-diff, Size: 1064 bytes --]

From 40531e183620fbcbfa678ddceaacbecd69a2b087 Mon Sep 17 00:00:00 2001
From: Sebastian Pop <sebpop@gmail.com>
Date: Sun, 23 Nov 2014 01:04:05 -0600
Subject: [PATCH 3/4] add FSM debug

---
 gcc/tree-ssa-threadupdate.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/gcc/tree-ssa-threadupdate.c b/gcc/tree-ssa-threadupdate.c
index a453b5e..dd2b518 100644
--- a/gcc/tree-ssa-threadupdate.c
+++ b/gcc/tree-ssa-threadupdate.c
@@ -2365,15 +2365,17 @@ thread_through_all_blocks (bool may_peel_loop_headers)
 
       bool success = gimple_duplicate_sese_region (entry, exit, region,
 						   len - 1, NULL, 0);
-      delete_jump_thread_path (path);
-      paths.unordered_remove (i);
-
       if (success)
 	{
+	  dump_jump_thread_path (stderr, *path, false);
+
 	  /* We do not update dominance info.  */
 	  free_dominance_info (CDI_DOMINATORS);
 	  bitmap_set_bit (threaded_blocks, entry->src->index);
 	}
+
+      delete_jump_thread_path (path);
+      paths.unordered_remove (i);
     }
 
   for (i = 0; i < paths.length ();)
-- 
1.9.1


[-- Attachment #5: 0004-make-copied-region-single-entry-and-remove-last-cond.patch --]
[-- Type: text/x-diff, Size: 6778 bytes --]

From b9b6155099d81b5ee6322e8bba2e3ba5d4f00b6e Mon Sep 17 00:00:00 2001
From: Sebastian Pop <sebpop@gmail.com>
Date: Sun, 23 Nov 2014 10:52:11 -0600
Subject: [PATCH 4/4] make copied region single entry and remove last condition
 stmt

---
 gcc/tree-cfg.c              |   2 +-
 gcc/tree-cfg.h              |   1 +
 gcc/tree-ssa-threadupdate.c | 151 +++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 151 insertions(+), 3 deletions(-)

diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
index 6d96c52..ffa5162 100644
--- a/gcc/tree-cfg.c
+++ b/gcc/tree-cfg.c
@@ -2666,7 +2666,7 @@ reinstall_phi_args (edge new_edge, edge old_edge)
    near its "logical" location.  This is of most help to humans looking
    at debugging dumps.  */
 
-static basic_block
+basic_block
 split_edge_bb_loc (edge edge_in)
 {
   basic_block dest = edge_in->dest;
diff --git a/gcc/tree-cfg.h b/gcc/tree-cfg.h
index 626e973..51f0899 100644
--- a/gcc/tree-cfg.h
+++ b/gcc/tree-cfg.h
@@ -67,6 +67,7 @@ extern void verify_gimple_in_cfg (struct function *, bool);
 extern tree gimple_block_label (basic_block);
 extern void add_phi_args_after_copy_bb (basic_block);
 extern void add_phi_args_after_copy (basic_block *, unsigned, edge);
+extern basic_block split_edge_bb_loc (edge);
 extern bool gimple_duplicate_sese_region (edge, edge, basic_block *, unsigned,
 					basic_block *, bool);
 extern bool gimple_duplicate_sese_tail (edge, edge, basic_block *, unsigned,
diff --git a/gcc/tree-ssa-threadupdate.c b/gcc/tree-ssa-threadupdate.c
index dd2b518..3ee2117 100644
--- a/gcc/tree-ssa-threadupdate.c
+++ b/gcc/tree-ssa-threadupdate.c
@@ -2318,6 +2318,153 @@ bb_ends_with_multiway_branch (basic_block bb ATTRIBUTE_UNUSED)
   return false;
 }
 
+/* Duplicates a Single Entry Multiple Exit REGION (set of N_REGION basic
+   blocks).  The ENTRY edge is redirected to the duplicate of the region.  If
+   REGION is not a Single Entry region, ignore any incoming edges other than
+   ENTRY: this makes the copied region a Single Entry region.
+
+   Remove the last conditional statement in the last basic block in the REGION,
+   and create a single fallthru edge pointing to the same destination as the
+   EXIT edge.
+
+   The new basic blocks are stored to REGION_COPY in the same order as they had
+   in REGION, provided that REGION_COPY is not NULL.
+
+   Returns false if it is unable to copy the region, true otherwise.  */
+
+static bool
+duplicate_seme_region (edge entry, edge exit,
+		       basic_block *region, unsigned n_region,
+		       basic_block *region_copy)
+{
+  unsigned i;
+  bool free_region_copy = false, copying_header = false;
+  struct loop *loop = entry->dest->loop_father;
+  edge exit_copy;
+  edge redirected;
+  int total_freq = 0, entry_freq = 0;
+  gcov_type total_count = 0, entry_count = 0;
+
+  if (!can_copy_bbs_p (region, n_region))
+    return false;
+
+  /* Some sanity checking.  Note that we do not check for all possible
+     misuses of the functions.  I.e. if you ask to copy something weird,
+     it will work, but the state of structures probably will not be
+     correct.  */
+  for (i = 0; i < n_region; i++)
+    {
+      /* We do not handle subloops, i.e. all the blocks must belong to the
+	 same loop.  */
+      if (region[i]->loop_father != loop)
+	return false;
+    }
+
+  initialize_original_copy_tables ();
+
+  if (copying_header)
+    set_loop_copy (loop, loop_outer (loop));
+  else
+    set_loop_copy (loop, loop);
+
+  if (!region_copy)
+    {
+      region_copy = XNEWVEC (basic_block, n_region);
+      free_region_copy = true;
+    }
+
+  if (entry->dest->count)
+    {
+      total_count = entry->dest->count;
+      entry_count = entry->count;
+      /* Fix up corner cases, to avoid division by zero or creation of negative
+	 frequencies.  */
+      if (entry_count > total_count)
+	entry_count = total_count;
+    }
+  else
+    {
+      total_freq = entry->dest->frequency;
+      entry_freq = EDGE_FREQUENCY (entry);
+      /* Fix up corner cases, to avoid division by zero or creation of negative
+	 frequencies.  */
+      if (total_freq == 0)
+	total_freq = 1;
+      else if (entry_freq > total_freq)
+	entry_freq = total_freq;
+    }
+
+  copy_bbs (region, n_region, region_copy, &exit, 1, &exit_copy, loop,
+	    split_edge_bb_loc (entry), 0);
+  if (total_count)
+    {
+      scale_bbs_frequencies_gcov_type (region, n_region,
+				       total_count - entry_count,
+				       total_count);
+      scale_bbs_frequencies_gcov_type (region_copy, n_region, entry_count,
+				       total_count);
+    }
+  else
+    {
+      scale_bbs_frequencies_int (region, n_region, total_freq - entry_freq,
+				 total_freq);
+      scale_bbs_frequencies_int (region_copy, n_region, entry_freq, total_freq);
+    }
+
+#ifdef ENABLE_CHECKING
+  /* Make sure no edge other than ENTRY is entering the copied region.  */
+  for (i = 0; i < n_region; i++)
+    {
+      edge e;
+      edge_iterator ei;
+      basic_block bb = region_copy[i];
+
+      if (single_pred_p (bb))
+	continue;
+
+      for (ei = ei_start (bb->preds); (e = ei_safe_edge (ei)); ei_next (&ei))
+	{
+	  basic_block x = e->src;
+	  bool found = false;
+
+	  for (unsigned j = 0; j < n_region; j++)
+	    if (x == region_copy[j])
+	      {
+		found = true;
+		break;
+	      }
+
+	  gcc_assert (found);
+	}
+    }
+#endif
+
+  /* Remove the last branch in the jump thread path.  */
+  remove_ctrl_stmt_and_useless_edges (region_copy[n_region - 1], exit->dest);
+  edge e = make_edge (region_copy[n_region - 1], exit->dest, EDGE_FALLTHRU);
+
+  if (e)
+    {
+      rescan_loop_exit (e, true, false);
+      e->probability = REG_BR_PROB_BASE;
+      e->count = region_copy[n_region - 1]->count;
+    }
+
+  /* Redirect the entry and add the phi node arguments.  */
+  redirected = redirect_edge_and_branch (entry, get_bb_copy (entry->dest));
+  gcc_assert (redirected != NULL);
+  flush_pending_stmts (entry);
+
+  /* Add the other PHI node arguments.  */
+  add_phi_args_after_copy (region_copy, n_region, NULL);
+
+  if (free_region_copy)
+    free (region_copy);
+
+  free_original_copy_tables ();
+  return true;
+}
+
 /* Walk through all blocks and thread incoming edges to the appropriate
    outgoing edge for each edge pair recorded in THREADED_EDGES.
 
@@ -2363,8 +2510,8 @@ thread_through_all_blocks (bool may_peel_loop_headers)
       for (unsigned int j = 0; j < len - 1; j++)
 	region[j] = (*path)[j]->e->dest;
 
-      bool success = gimple_duplicate_sese_region (entry, exit, region,
-						   len - 1, NULL, 0);
+      bool success = duplicate_seme_region (entry, exit, region,
+					    len - 1, NULL);
       if (success)
 	{
 	  dump_jump_thread_path (stderr, *path, false);
-- 
1.9.1


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-11-24  0:06                     ` Sebastian Pop
@ 2014-11-24 21:33                       ` Jeff Law
  2014-11-24 22:28                         ` Sebastian Pop
  2014-11-24 23:25                       ` Jeff Law
  1 sibling, 1 reply; 54+ messages in thread
From: Jeff Law @ 2014-11-24 21:33 UTC (permalink / raw)
  To: Sebastian Pop; +Cc: Richard Biener, James Greenhalgh, Steve Ellcey, GCC Patches

On 11/23/14 15:22, Sebastian Pop wrote:
> The second patch attached limits the search for FSM jump threads to loops.  With
> that patch, we are now down to 470 jump threads in an x86_64-linux bootstrap
> (and 424 jump threads on powerpc64-linux bootstrap.)
>
Yea, that was one of the things I was going to poke at, as a quick 
scan of your patch gave me the impression it wasn't limited to loops.

Again, I haven't looked much at the patch, but I got the impression 
you're doing a backwards walk through the predecessors to discover the 
result of the COND_EXPR.  Correct?

That's something I'd been wanting to do -- basically start with a 
COND_EXPR, then walk the dataflow backwards substituting values into the 
COND_EXPR (possibly creating non-gimple).  Ultimately the goal is to 
substitute and fold, getting to a constant :-)

The forward exhaustive stuff we do now is crazy.  The backwards 
approach could be decoupled from DOM & VRP into an independent pass, 
which I think would be wise.

Using a SEME region copier is also something I really wanted to do long 
term.  In fact, I believe a lot of tree-ssa-threadupdate.c ought to be 
ripped out and replaced with a SEME based copier.

It appears you've built at least parts of two pieces needed to do all 
this as a Bodik-style optimizer.  Which is exactly the long-term direction I 
think this code ought to take.


>
> One of the reasons I think we see more branches is that in sese region copying we
> do not use the knowledge of the value of the condition for the last branch in a
> jump-thread path: we rely on other propagation passes to remove the branch.  The
> last attached patch adds:
>
>    /* Remove the last branch in the jump thread path.  */
>    remove_ctrl_stmt_and_useless_edges (region_copy[n_region - 1], exit->dest);
That's certainly a possibility.  But I would expect that even with this 
limitation something would be picking up the fact that the branch is 
statically computable (even if it's an RTL optimizer).  But it's 
definitely something to look for.

>
> Please let me know if the attached patches are producing better results on gcc.

For the trunk:
   instructions:1339016494968
   branches     :243568982489

First version of your patch:

   instructions:1339739533291
   branches:     243806615986

Latest version of your patch:

   instructions:1339749122609
   branches:     243809838262

Which is in the noise for this test.  Which makes me wonder if I botched 
something on the latest run.  It doesn't appear so, but I'm re-running 
just to be sure.  I'm also turning on -g so that I can use cg_annotate 
to poke a bit deeper and perhaps identify one or more concrete examples 
where your patch is making this worse.

Jeff


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-11-24 21:33                       ` Jeff Law
@ 2014-11-24 22:28                         ` Sebastian Pop
  2014-11-24 23:02                           ` Sebastian Pop
                                             ` (3 more replies)
  0 siblings, 4 replies; 54+ messages in thread
From: Sebastian Pop @ 2014-11-24 22:28 UTC (permalink / raw)
  To: Jeff Law; +Cc: Richard Biener, James Greenhalgh, Steve Ellcey, GCC Patches

Jeff Law wrote:
> On 11/23/14 15:22, Sebastian Pop wrote:
> >The second patch attached limits the search for FSM jump threads to loops.  With
> >that patch, we are now down to 470 jump threads in an x86_64-linux bootstrap
> >(and 424 jump threads on powerpc64-linux bootstrap.)
> >
> Yea, that was one of the things I was going to poke at, as a quick
> scan of your patch gave me the impression it wasn't limited to
> loops.
> 
> Again, I haven't looked much at the patch, but I got the impression
> you're doing a backwards walk through the predecessors to discover
> the result of the COND_EXPR.  Correct?

Yes.

> 
> That's something I'd been wanting to do -- basically start with a
> COND_EXPR, then walk the dataflow backwards substituting values into
> the COND_EXPR (possibly creating non-gimple).  Ultimately the goal
> is to substitute and fold, getting to a constant :-)
> 
> The forward exhaustive stuff we do now is crazy.  The backwards
> approach could be decoupled from DOM & VRP into an independent pass,
> which I think would be wise.
> 
> Using a SEME region copier is also something I really wanted to do
> long term.  In fact, I believe a lot of tree-ssa-threadupdate.c
> ought to be ripped out and replaced with a SEME based copier.

I did an experiment along these lines over the weekend, and now that you
mention it, I feel less shy speaking about it; the patch does not yet pass
bootstrap, and there are still about 20 failing test-cases.  I feel better
reading the code generation part of jump-threading after this patch ;-)
Basically I think all of tree-ssa-threadupdate.c can be replaced by
duplicate_seme_region, which generalizes the code generation.

> 
> It appears you've built at least parts of two pieces needed to do all
> this as a Bodik-style optimizer.  Which is exactly the long-term
> direction I think this code ought to take.
> 
> 
> >
> >One of the reasons I think we see more branches is that in sese region copying we
> >do not use the knowledge of the value of the condition for the last branch in a
> >jump-thread path: we rely on other propagation passes to remove the branch.  The
> >last attached patch adds:
> >
> >   /* Remove the last branch in the jump thread path.  */
> >   remove_ctrl_stmt_and_useless_edges (region_copy[n_region - 1], exit->dest);
> That's certainly a possibility.  But I would expect that even with
> this limitation something would be picking up the fact that the
> branch is statically computable (even if it's an RTL optimizer).
> But it's definitely something to look for.
> 
> >
> >Please let me know if the attached patches are producing better results on gcc.
> 
> For the trunk:
>   instructions:1339016494968
>   branches     :243568982489
> 
> First version of your patch:
> 
>   instructions:1339739533291
>   branches:     243806615986
> 
> Latest version of your patch:
> 
>   instructions:1339749122609
>   branches:     243809838262

I think I got about the same results.

I got my scripts installed on the gcc-farm.  I first used gcc75, an x86_64
machine, but valgrind crashed there, failing to decode an instruction.  I then
moved to gcc112, a powerpc64-linux machine, where I got this data from a stage2
cc1plus compiling the same file, alias.ii, at -O2.  (I did 3 runs of each,
mostly because there is a bit of noise in all these numbers.)

$ valgrind --tool=cachegrind --cache-sim=no --branch-sim=yes ./cc1plus -O2 ~/alias.ii

all 4 patches:

==153617== I   refs:      13,914,038,211
==153617== 
==153617== Branches:       1,926,407,760  (1,879,827,481 cond + 46,580,279 ind)
==153617== Mispredicts:      144,890,904  (  132,094,105 cond + 12,796,799 ind)
==153617== Mispred rate:             7.5% (          7.0%     +       27.4%   )

==34993== I   refs:      13,915,335,629
==34993== 
==34993== Branches:       1,926,597,919  (1,880,017,558 cond + 46,580,361 ind)
==34993== Mispredicts:      144,974,266  (  132,177,440 cond + 12,796,826 ind)
==34993== Mispred rate:             7.5% (          7.0%     +       27.4%   )

==140841== I   refs:      13,915,334,459
==140841== 
==140841== Branches:       1,926,597,819  (1,880,017,458 cond + 46,580,361 ind)
==140841== Mispredicts:      144,974,296  (  132,177,470 cond + 12,796,826 ind)
==140841== Mispred rate:             7.5% (          7.0%     +       27.4%   )

patch 1:

==99902== I   refs:      13,915,069,710
==99902== 
==99902== Branches:       1,926,963,813  (1,880,376,148 cond + 46,587,665 ind)
==99902== Mispredicts:      145,501,564  (  132,656,576 cond + 12,844,988 ind)
==99902== Mispred rate:             7.5% (          7.0%     +       27.5%   )

==3907== I   refs:      13,915,082,469
==3907== 
==3907== Branches:       1,926,965,218  (1,880,377,471 cond + 46,587,747 ind)
==3907== Mispredicts:      145,501,569  (  132,656,554 cond + 12,845,015 ind)
==3907== Mispred rate:             7.5% (          7.0%     +       27.5%   )

==44271== I   refs:      13,915,111,997
==44271== 
==44271== Branches:       1,926,968,863  (1,880,380,952 cond + 46,587,911 ind)
==44271== Mispredicts:      145,501,858  (  132,656,789 cond + 12,845,069 ind)
==44271== Mispred rate:             7.5% (          7.0%     +       27.5%   )

master no-patch:

==129233== I   refs:      13,910,221,913
==129233== 
==129233== Branches:       1,925,715,095  (1,879,277,776 cond + 46,437,319 ind)
==129233== Mispredicts:      144,133,332  (  131,510,534 cond + 12,622,798 ind)
==129233== Mispred rate:             7.4% (          6.9%     +       27.1%   )

==147659== I   refs:      13,910,216,249
==147659== 
==147659== Branches:       1,925,714,029  (1,879,276,708 cond + 46,437,321 ind)
==147659== Mispredicts:      144,127,970  (  131,505,172 cond + 12,622,798 ind)
==147659== Mispred rate:             7.4% (          6.9%     +       27.1%   )

==155206== I   refs:      13,910,201,237
==155206== 
==155206== Branches:       1,925,712,267  (1,879,275,030 cond + 46,437,237 ind)
==155206== Mispredicts:      144,128,313  (  131,505,542 cond + 12,622,771 ind)
==155206== Mispred rate:             7.4% (          6.9%     +       27.1%   )


I think that there are about 5 million more instructions executed with the first
patch, and the other patches on top do not really help.

> 
> Which is in the noise for this test.  Which makes me wonder if I
> botched something on the latest run.  It doesn't appear so, but I'm
> re-running just to be sure.  I'm also turning on -g so that I can
> use cg_annotate to poke a bit deeper and perhaps identify one or
> more concrete examples where your patch is making this worse.

Thanks,
Sebastian

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-11-24 22:28                         ` Sebastian Pop
@ 2014-11-24 23:02                           ` Sebastian Pop
  2014-11-24 23:18                           ` Jeff Law
                                             ` (2 subsequent siblings)
  3 siblings, 0 replies; 54+ messages in thread
From: Sebastian Pop @ 2014-11-24 23:02 UTC (permalink / raw)
  To: Jeff Law; +Cc: Richard Biener, James Greenhalgh, Steve Ellcey, GCC Patches

[-- Attachment #1: Type: text/plain, Size: 735 bytes --]

Sebastian Pop wrote:
> > Using a SEME region copier is also something I really wanted to do
> > long term.  In fact, I believe a lot of tree-ssa-threadupdate.c
> > ought to be ripped out and replaced with a SEME based copier.
> 
> I did an experiment along these lines over the weekend, and now that you
> mention it, I feel less shy speaking about it; the patch does not yet pass
> bootstrap, and there are still about 20 failing test-cases.  I feel better
> reading the code generation part of jump-threading after this patch ;-)
> Basically I think all of tree-ssa-threadupdate.c can be replaced by
> duplicate_seme_region, which generalizes the code generation.

For reference, here is the patch I am speaking about.

Sebastian

[-- Attachment #2: 0002-use-duplicate_seme-to-generate-code-for-jump-threadi.patch --]
[-- Type: text/x-diff, Size: 104295 bytes --]

From c7213811e2ec2443df9ffc3ca72b3b15a6c9aaf9 Mon Sep 17 00:00:00 2001
From: Sebastian Pop <sebpop@gmail.com>
Date: Fri, 21 Nov 2014 13:17:12 -0600
Subject: [PATCH 2/3] use duplicate_seme to generate code for jump threading

---
 gcc/tree-cfg.c              |  142 +++
 gcc/tree-cfg.h              |    2 +
 gcc/tree-ssa-threadedge.c   |   61 +-
 gcc/tree-ssa-threadupdate.c | 2487 +------------------------------------------
 gcc/tree-ssa-threadupdate.h |   23 +-
 5 files changed, 222 insertions(+), 2493 deletions(-)

diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
index 6d96c52..d6dc442 100644
--- a/gcc/tree-cfg.c
+++ b/gcc/tree-cfg.c
@@ -6120,6 +6120,148 @@ gimple_duplicate_sese_region (edge entry, edge exit,
   return true;
 }
 
+/* Duplicates a REGION (set of N_REGION basic blocks).  The edge ENTRY is
+   redirected to the duplicate of the region.  Dominance and loop information is
+   updated if UPDATE_DOMINANCE is true, but not the SSA web.  If
+   UPDATE_DOMINANCE is false then we assume that the caller will update the
+   dominance information after calling this function.  The new basic blocks are
+   stored to REGION_COPY in the same order as they had in REGION, provided that
+   REGION_COPY is not NULL.
+
+   Returns false if it is unable to copy the region, true otherwise.  */
+
+bool
+gimple_duplicate_seme_region (edge entry,
+			      basic_block *region, unsigned n_region,
+			      basic_block *region_copy,
+			      bool update_dominance)
+{
+  unsigned i;
+  bool free_region_copy = false, copying_header = false;
+  struct loop *loop = entry->dest->loop_father;
+  vec<basic_block> doms;
+  edge redirected;
+  int memo_loop_header_no = 0, memo_loop_latch_no = 0;
+  int total_freq = 0, entry_freq = 0;
+  gcov_type total_count = 0, entry_count = 0;
+
+  if (!can_copy_bbs_p (region, n_region))
+    return false;
+
+  /* Some sanity checking.  Note that we do not check for all possible
+     misuses of the functions.  I.e. if you ask to copy something weird,
+     it will work, but the state of structures probably will not be
+     correct.  */
+  for (i = 0; i < n_region; i++)
+    {
+      /* We do not handle subloops, i.e. all the blocks must belong to the
+	 same loop.  */
+      if (region[i]->loop_father != loop)
+	return false;
+
+      /* If we are copying a region that starts and ends in an arbitrary place in
+	 the loop: keep track of which block will become our loop header.  */
+      if (region[i] != entry->dest && region[i] == loop->header)
+	memo_loop_header_no = i;
+
+      /* And which block will become our loop latch.  */
+      if (region[i] != entry->src && region[i] == loop->latch)
+	memo_loop_latch_no = i;
+    }
+
+  initialize_original_copy_tables ();
+
+  if (copying_header)
+    set_loop_copy (loop, loop_outer (loop));
+  else
+    set_loop_copy (loop, loop);
+
+  if (!region_copy)
+    {
+      region_copy = XNEWVEC (basic_block, n_region);
+      free_region_copy = true;
+    }
+
+  /* Record blocks outside the region that are dominated by something
+     inside.  */
+  if (update_dominance)
+    {
+      doms.create (0);
+      doms = get_dominated_by_region (CDI_DOMINATORS, region, n_region);
+    }
+
+  if (entry->dest->count)
+    {
+      total_count = entry->dest->count;
+      entry_count = entry->count;
+      /* Fix up corner cases, to avoid division by zero or creation of negative
+	 frequencies.  */
+      if (entry_count > total_count)
+	entry_count = total_count;
+    }
+  else
+    {
+      total_freq = entry->dest->frequency;
+      entry_freq = EDGE_FREQUENCY (entry);
+      /* Fix up corner cases, to avoid division by zero or creation of negative
+	 frequencies.  */
+      if (total_freq == 0)
+	total_freq = 1;
+      else if (entry_freq > total_freq)
+	entry_freq = total_freq;
+    }
+
+  copy_bbs (region, n_region, region_copy, NULL, 0, NULL, loop,
+	    split_edge_bb_loc (entry), update_dominance);
+  if (total_count)
+    {
+      scale_bbs_frequencies_gcov_type (region, n_region,
+				       total_count - entry_count,
+				       total_count);
+      scale_bbs_frequencies_gcov_type (region_copy, n_region, entry_count,
+				       total_count);
+    }
+  else
+    {
+      scale_bbs_frequencies_int (region, n_region, total_freq - entry_freq,
+				 total_freq);
+      scale_bbs_frequencies_int (region_copy, n_region, entry_freq, total_freq);
+    }
+
+  /* Restore loop details if we were asked to save them.  */
+  if (memo_loop_header_no)
+    loop->header = region[memo_loop_header_no];
+
+  if (memo_loop_latch_no)
+    loop->latch = region[memo_loop_latch_no];
+
+  /* Redirect the entry and add the phi node arguments.  */
+  redirected = redirect_edge_and_branch (entry, get_bb_copy (entry->dest));
+  gcc_assert (redirected != NULL);
+  flush_pending_stmts (entry);
+
+  /* Concerning updating of dominators:  We must recount dominators
+     for entry block and its copy.  Anything that is outside of the
+     region, but was dominated by something inside needs recounting as
+     well.  */
+  if (update_dominance)
+    {
+      set_immediate_dominator (CDI_DOMINATORS, entry->dest, entry->src);
+      doms.safe_push (get_bb_original (entry->dest));
+      iterate_fix_dominators (CDI_DOMINATORS, doms, false);
+      doms.release ();
+    }
+
+  /* Add the other PHI node arguments.  */
+  add_phi_args_after_copy (region_copy, n_region, NULL);
+
+  if (free_region_copy)
+    free (region_copy);
+
+  free_original_copy_tables ();
+  return true;
+}
+
 /* Checks if BB is part of the region defined by N_REGION BBS.  */
 static bool 
 bb_part_of_region_p (basic_block bb, basic_block* bbs, unsigned n_region)
diff --git a/gcc/tree-cfg.h b/gcc/tree-cfg.h
index 626e973..7f53bb7 100644
--- a/gcc/tree-cfg.h
+++ b/gcc/tree-cfg.h
@@ -69,6 +69,8 @@ extern void add_phi_args_after_copy_bb (basic_block);
 extern void add_phi_args_after_copy (basic_block *, unsigned, edge);
 extern bool gimple_duplicate_sese_region (edge, edge, basic_block *, unsigned,
 					basic_block *, bool);
+extern bool gimple_duplicate_seme_region (edge, basic_block *, unsigned,
+					  basic_block *, bool);
 extern bool gimple_duplicate_sese_tail (edge, edge, basic_block *, unsigned,
 				      basic_block *);
 extern void gather_blocks_in_sese_region (basic_block entry, basic_block exit,
diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c
index 3939a74..0d5fbfd 100644
--- a/gcc/tree-ssa-threadedge.c
+++ b/gcc/tree-ssa-threadedge.c
@@ -848,7 +848,7 @@ thread_around_empty_blocks (edge taken_edge,
 			    bool handle_dominating_asserts,
 			    tree (*simplify) (gimple, gimple),
 			    bitmap visited,
-			    vec<jump_thread_edge *> *path,
+			    vec<edge> *path,
 			    bool *backedge_seen_p)
 {
   basic_block bb = taken_edge->dest;
@@ -886,9 +886,7 @@ thread_around_empty_blocks (edge taken_edge,
 	  taken_edge = single_succ_edge (bb);
 	  if (!bitmap_bit_p (visited, taken_edge->dest->index))
 	    {
-	      jump_thread_edge *x
-		= new jump_thread_edge (taken_edge, EDGE_NO_COPY_SRC_BLOCK);
-	      path->safe_push (x);
+	      path->safe_push (taken_edge);
 	      bitmap_set_bit (visited, taken_edge->dest->index);
 	      *backedge_seen_p |= ((taken_edge->flags & EDGE_DFS_BACK) != 0);
 	      if (*backedge_seen_p)
@@ -937,9 +935,7 @@ thread_around_empty_blocks (edge taken_edge,
 	return false;
       bitmap_set_bit (visited, taken_edge->dest->index);
 
-      jump_thread_edge *x
-	= new jump_thread_edge (taken_edge, EDGE_NO_COPY_SRC_BLOCK);
-      path->safe_push (x);
+      path->safe_push (taken_edge);
       *backedge_seen_p |= ((taken_edge->flags & EDGE_DFS_BACK) != 0);
       if (*backedge_seen_p)
 	simplify = dummy_simplify;
@@ -1065,35 +1061,19 @@ fsm_find_control_statement_thread_paths (tree expr,
       if (TREE_CODE (arg) == INTEGER_CST)
 	{
 	  int j, n = path->length ();
-	  vec<jump_thread_edge *> *jump_thread_path
-	    = new vec<jump_thread_edge *> ();
-	  int joiners = 0;
+	  vec<edge> *jump_thread_path = new vec<edge> ();
 
 	  for (j = 0; j < n - 1; j++)
 	    {
 	      edge e = find_edge ((*path)[n - j - 1],
 				  (*path)[n - j - 2]);
 	      gcc_assert (e);
-	      enum jump_thread_edge_type kind;
-
-	      if (j == 0)
-		kind = EDGE_START_FSM_THREAD;
-	      else if (single_pred_p (e->src))
-		kind = EDGE_NO_COPY_SRC_BLOCK;
-	      else {
-		kind = EDGE_COPY_SRC_JOINER_BLOCK;
-		++joiners;
-	      }
-
-	      jump_thread_edge *x = new jump_thread_edge (e, kind);
-	      jump_thread_path->safe_push (x);
+	      jump_thread_path->safe_push (e);
 	    }
 
 	  /* Add the edge taken when the control variable has value ARG.  */
 	  edge taken_edge = find_taken_edge ((*path)[0], arg);
-	  jump_thread_edge *x
-	    = new jump_thread_edge (taken_edge, EDGE_NO_COPY_SRC_BLOCK);
-	  jump_thread_path->safe_push (x);
+	  jump_thread_path->safe_push (taken_edge);
 
 	  /* A path with less than 3 nodes should not be jump-threaded.  */
 	  if (n > 2 && n < PARAM_VALUE (PARAM_MAX_FSM_THREAD_LENGTH)
@@ -1165,7 +1145,7 @@ thread_through_normal_block (edge e,
 			     bool handle_dominating_asserts,
 			     vec<tree> *stack,
 			     tree (*simplify) (gimple, gimple),
-			     vec<jump_thread_edge *> *path,
+			     vec<edge> *path,
 			     bitmap visited,
 			     bool *backedge_seen_p)
 {
@@ -1226,19 +1206,14 @@ thread_through_normal_block (edge e,
 	      || bitmap_bit_p (visited, dest->index))
 	    return 0;
 
-	  /* Only push the EDGE_START_JUMP_THREAD marker if this is
-	     first edge on the path.  */
+	  /* Push the first edge on the path.  */
 	  if (path->length () == 0)
 	    {
-              jump_thread_edge *x
-	        = new jump_thread_edge (e, EDGE_START_JUMP_THREAD);
-	      path->safe_push (x);
+	      path->safe_push (e);
 	      *backedge_seen_p |= ((e->flags & EDGE_DFS_BACK) != 0);
 	    }
 
-	  jump_thread_edge *x
-	    = new jump_thread_edge (taken_edge, EDGE_COPY_SRC_BLOCK);
-	  path->safe_push (x);
+	  path->safe_push (taken_edge);
 	  *backedge_seen_p |= ((taken_edge->flags & EDGE_DFS_BACK) != 0);
 	  if (*backedge_seen_p)
 	    simplify = dummy_simplify;
@@ -1323,7 +1298,7 @@ thread_across_edge (gcond *dummy_cond,
 
   stmt_count = 0;
 
-  vec<jump_thread_edge *> *path = new vec<jump_thread_edge *> ();
+  vec<edge> *path = new vec<edge> ();
   bitmap_clear (visited);
   bitmap_set_bit (visited, e->src->index);
   bitmap_set_bit (visited, e->dest->index);
@@ -1337,7 +1312,7 @@ thread_across_edge (gcond *dummy_cond,
 					      visited, &backedge_seen);
   if (threaded > 0)
     {
-      propagate_threaded_block_debug_into (path->last ()->e->dest,
+      propagate_threaded_block_debug_into (path->last ()->dest,
 					   e->dest);
       remove_temporary_equivalences (stack);
       BITMAP_FREE (visited);
@@ -1406,15 +1381,13 @@ thread_across_edge (gcond *dummy_cond,
 	bitmap_set_bit (visited, e->src->index);
 	bitmap_set_bit (visited, e->dest->index);
 	bitmap_set_bit (visited, taken_edge->dest->index);
-        vec<jump_thread_edge *> *path = new vec<jump_thread_edge *> ();
+        vec<edge> *path = new vec<edge> ();
 
 	/* Record whether or not we were able to thread through a successor
 	   of E->dest.  */
-        jump_thread_edge *x = new jump_thread_edge (e, EDGE_START_JUMP_THREAD);
-	path->safe_push (x);
+	path->safe_push (e);
 
-        x = new jump_thread_edge (taken_edge, EDGE_COPY_SRC_JOINER_BLOCK);
-	path->safe_push (x);
+	path->safe_push (taken_edge);
 	found = false;
 	backedge_seen = ((e->flags & EDGE_DFS_BACK) != 0);
 	backedge_seen |= ((taken_edge->flags & EDGE_DFS_BACK) != 0);
@@ -1432,7 +1405,7 @@ thread_across_edge (gcond *dummy_cond,
 	  simplify = dummy_simplify;
 
 	if (!found)
-	  found = thread_through_normal_block (path->last ()->e, dummy_cond,
+	  found = thread_through_normal_block (path->last (), dummy_cond,
 					       handle_dominating_asserts,
 					       stack, simplify, path, visited,
 					       &backedge_seen) > 0;
@@ -1441,7 +1414,7 @@ thread_across_edge (gcond *dummy_cond,
 	   record the jump threading opportunity.  */
 	if (found)
 	  {
-	    propagate_threaded_block_debug_into (path->last ()->e->dest,
+	    propagate_threaded_block_debug_into (path->last ()->dest,
 						 taken_edge->dest);
 	    register_jump_thread (path);
 	  }
diff --git a/gcc/tree-ssa-threadupdate.c b/gcc/tree-ssa-threadupdate.c
index a453b5e..fcb7d85 100644
--- a/gcc/tree-ssa-threadupdate.c
+++ b/gcc/tree-ssa-threadupdate.c
@@ -53,2279 +53,40 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-cfg.h"
 #include "tree-pass.h"
 
-/* Given a block B, update the CFG and SSA graph to reflect redirecting
-   one or more in-edges to B to instead reach the destination of an
-   out-edge from B while preserving any side effects in B.
+/* Passes which use the jump threading code, register jump threading
+   opportunities as they are discovered.  */
+static vec<vec<edge> *> paths;
 
-   i.e., given A->B and B->C, change A->B to be A->C yet still preserve the
-   side effects of executing B.
-
-     1. Make a copy of B (including its outgoing edges and statements).  Call
-	the copy B'.  Note B' has no incoming edges or PHIs at this time.
-
-     2. Remove the control statement at the end of B' and all outgoing edges
-	except B'->C.
-
-     3. Add a new argument to each PHI in C with the same value as the existing
-	argument associated with edge B->C.  Associate the new PHI arguments
-	with the edge B'->C.
-
-     4. For each PHI in B, find or create a PHI in B' with an identical
-	PHI_RESULT.  Add an argument to the PHI in B' which has the same
-	value as the PHI in B associated with the edge A->B.  Associate
-	the new argument in the PHI in B' with the edge A->B.
-
-     5. Change the edge A->B to A->B'.
-
-	5a. This automatically deletes any PHI arguments associated with the
-	    edge A->B in B.
-
-	5b. This automatically associates each new argument added in step 4
-	    with the edge A->B'.
-
-     6. Repeat for other incoming edges into B.
-
-     7. Put the duplicated resources in B and all the B' blocks into SSA form.
-
-   Note that block duplication can be minimized by first collecting the
-   set of unique destination blocks that the incoming edges should
-   be threaded to.
-
-   We reduce the number of edges and statements we create by not copying all
-   the outgoing edges and the control statement in step #1.  We instead create
-   a template block without the outgoing edges and duplicate the template.
-
-   Another case this code handles is threading through a "joiner" block.  In
-   this case, we do not know the destination of the joiner block, but one
-   of the outgoing edges from the joiner block leads to a threadable path.  This
-   case largely works as outlined above, except the duplicate of the joiner
-   block still contains a full set of outgoing edges and its control statement.
-   We just redirect one of its outgoing edges to our jump threading path.  */
-
-
-/* Steps #5 and #6 of the above algorithm are best implemented by walking
-   all the incoming edges which thread to the same destination edge at
-   the same time.  That avoids lots of table lookups to get information
-   for the destination edge.
-
-   To realize that implementation we create a list of incoming edges
-   which thread to the same outgoing edge.  Thus to implement steps
-   #5 and #6 we traverse our hash table of outgoing edge information.
-   For each entry we walk the list of incoming edges which thread to
-   the current outgoing edge.  */
-
-struct el
-{
-  edge e;
-  struct el *next;
-};
-
-/* Main data structure recording information regarding B's duplicate
-   blocks.  */
-
-/* We need to efficiently record the unique thread destinations of this
-   block and specific information associated with those destinations.  We
-   may have many incoming edges threaded to the same outgoing edge.  This
-   can be naturally implemented with a hash table.  */
-
-struct redirection_data : typed_free_remove<redirection_data>
-{
-  /* We support wiring up two block duplicates in a jump threading path.
-
-     One is a normal block copy where we remove the control statement
-     and wire up its single remaining outgoing edge to the thread path.
-
-     The other is a joiner block where we leave the control statement
-     in place, but wire one of the outgoing edges to a thread path.
-
-     In theory we could have multiple block duplicates in a jump
-     threading path, but I haven't tried that.
-
-     The duplicate blocks appear in this array in the same order in
-     which they appear in the jump thread path.  */
-  basic_block dup_blocks[2];
-
-  /* The jump threading path.  */
-  vec<jump_thread_edge *> *path;
-
-  /* A list of incoming edges which we want to thread to the
-     same path.  */
-  struct el *incoming_edges;
-
-  /* hash_table support.  */
-  typedef redirection_data value_type;
-  typedef redirection_data compare_type;
-  static inline hashval_t hash (const value_type *);
-  static inline int equal (const value_type *, const compare_type *);
-};
-
-/* Dump a jump threading path, including annotations about each
-   edge in the path.  */
-
-static void
-dump_jump_thread_path (FILE *dump_file, vec<jump_thread_edge *> path,
-		       bool registering)
-{
-  fprintf (dump_file,
-	   "  %s%s jump thread: (%d, %d) incoming edge; ",
-	   (registering ? "Registering" : "Cancelling"),
-	   (path[0]->type == EDGE_START_FSM_THREAD ? " FSM": ""),
-	   path[0]->e->src->index, path[0]->e->dest->index);
-
-  for (unsigned int i = 1; i < path.length (); i++)
-    {
-      /* We can get paths with a NULL edge when the final destination
-	 of a jump thread turns out to be a constant address.  We dump
-	 those paths when debugging, so we have to be prepared for that
-	 possibility here.  */
-      if (path[i]->e == NULL)
-	continue;
-
-      if (path[i]->type == EDGE_COPY_SRC_JOINER_BLOCK)
-	fprintf (dump_file, " (%d, %d) joiner; ",
-		 path[i]->e->src->index, path[i]->e->dest->index);
-      if (path[i]->type == EDGE_COPY_SRC_BLOCK)
-	fprintf (dump_file, " (%d, %d) normal;",
-		 path[i]->e->src->index, path[i]->e->dest->index);
-      if (path[i]->type == EDGE_NO_COPY_SRC_BLOCK)
-	fprintf (dump_file, " (%d, %d) nocopy;",
-		 path[i]->e->src->index, path[i]->e->dest->index);
-    }
-  fputc ('\n', dump_file);
-}
-
-/* Simple hashing function.  For any given incoming edge E, we're going
-   to be most concerned with the final destination of its jump thread
-   path.  So hash on the block index of the final edge in the path.  */
-
-inline hashval_t
-redirection_data::hash (const value_type *p)
-{
-  vec<jump_thread_edge *> *path = p->path;
-  return path->last ()->e->dest->index;
-}
-
-/* Given two hash table entries, return true if they have the same
-   jump threading path.  */
-
-inline int
-redirection_data::equal (const value_type *p1, const compare_type *p2)
-{
-  vec<jump_thread_edge *> *path1 = p1->path;
-  vec<jump_thread_edge *> *path2 = p2->path;
-
-  if (path1->length () != path2->length ())
-    return false;
-
-  for (unsigned int i = 1; i < path1->length (); i++)
-    {
-      if ((*path1)[i]->type != (*path2)[i]->type
-	  || (*path1)[i]->e != (*path2)[i]->e)
-	return false;
-    }
-
-  return true;
-}
-
-/* Data structure of information to pass to hash table traversal routines.  */
-struct ssa_local_info_t
-{
-  /* The current block we are working on.  */
-  basic_block bb;
-
-  /* We only create a template block for the first duplicated block in a
-     jump threading path as we may need many duplicates of that block.
-
-     The second duplicate block in a path is specific to that path.  Creating
-     and sharing a template for that block is considerably more difficult.  */
-  basic_block template_block;
-
-  /* TRUE if we thread one or more jumps, FALSE otherwise.  */
-  bool jumps_threaded;
-
-  /* Blocks duplicated for the thread.  */
-  bitmap duplicate_blocks;
-};
-
-/* Passes which use the jump threading code register jump threading
-   opportunities as they are discovered.  We keep the registered
-   jump threading opportunities in this vector as edge pairs
-   (original_edge, target_edge).  */
-static vec<vec<jump_thread_edge *> *> paths;
-
-/* When we start updating the CFG for threading, data necessary for jump
-   threading is attached to the AUX field for the incoming edge.  Use these
-   macros to access the underlying structure attached to the AUX field.  */
-#define THREAD_PATH(E) ((vec<jump_thread_edge *> *)(E)->aux)
-
-/* Jump threading statistics.  */
-
-struct thread_stats_d
-{
-  unsigned long num_threaded_edges;
-};
-
-struct thread_stats_d thread_stats;
-
-
-/* Remove the last statement in block BB if it is a control statement.
-   Also remove all outgoing edges except the edge which reaches DEST_BB.
-   If DEST_BB is NULL, then remove all outgoing edges.  */
-
-static void
-remove_ctrl_stmt_and_useless_edges (basic_block bb, basic_block dest_bb)
-{
-  gimple_stmt_iterator gsi;
-  edge e;
-  edge_iterator ei;
-
-  gsi = gsi_last_bb (bb);
-
-  /* If the duplicate ends with a control statement, then remove it.
-
-     Note that if we are duplicating the template block rather than the
-     original basic block, then the duplicate might not have any real
-     statements in it.  */
-  if (!gsi_end_p (gsi)
-      && gsi_stmt (gsi)
-      && (gimple_code (gsi_stmt (gsi)) == GIMPLE_COND
-	  || gimple_code (gsi_stmt (gsi)) == GIMPLE_GOTO
-	  || gimple_code (gsi_stmt (gsi)) == GIMPLE_SWITCH))
-    gsi_remove (&gsi, true);
-
-  for (ei = ei_start (bb->succs); (e = ei_safe_edge (ei)); )
-    {
-      if (e->dest != dest_bb)
-	remove_edge (e);
-      else
-	ei_next (&ei);
-    }
-}
-
-/* Create a duplicate of BB.  Record the duplicate block in an array
-   indexed by COUNT stored in RD.  */
-
-static void
-create_block_for_threading (basic_block bb,
-			    struct redirection_data *rd,
-			    unsigned int count,
-			    bitmap *duplicate_blocks)
-{
-  edge_iterator ei;
-  edge e;
-
-  /* We can use the generic block duplication code and simply remove
-     the stuff we do not need.  */
-  rd->dup_blocks[count] = duplicate_block (bb, NULL, NULL);
-
-  FOR_EACH_EDGE (e, ei, rd->dup_blocks[count]->succs)
-    e->aux = NULL;
-
-  /* Zero out the profile, since the block is unreachable for now.  */
-  rd->dup_blocks[count]->frequency = 0;
-  rd->dup_blocks[count]->count = 0;
-  if (duplicate_blocks)
-    bitmap_set_bit (*duplicate_blocks, rd->dup_blocks[count]->index);
-}
-
-/* Main data structure to hold information for duplicates of BB.  */
-
-static hash_table<redirection_data> *redirection_data;
-
-/* Given an outgoing edge E, look up and return its entry in our hash table.
-
-   If INSERT is true, then we insert the entry into the hash table if
-   it is not already present.  INCOMING_EDGE is added to the list of incoming
-   edges associated with E in the hash table.  */
-
-static struct redirection_data *
-lookup_redirection_data (edge e, enum insert_option insert)
-{
-  struct redirection_data **slot;
-  struct redirection_data *elt;
-  vec<jump_thread_edge *> *path = THREAD_PATH (e);
-
-  /* Build a hash table element so we can see if E is already
-     in the table.  */
-  elt = XNEW (struct redirection_data);
-  elt->path = path;
-  elt->dup_blocks[0] = NULL;
-  elt->dup_blocks[1] = NULL;
-  elt->incoming_edges = NULL;
-
-  slot = redirection_data->find_slot (elt, insert);
-
-  /* This will only happen if INSERT is false and the entry is not
-     in the hash table.  */
-  if (slot == NULL)
-    {
-      free (elt);
-      return NULL;
-    }
-
-  /* This will only happen if E was not in the hash table and
-     INSERT is true.  */
-  if (*slot == NULL)
-    {
-      *slot = elt;
-      elt->incoming_edges = XNEW (struct el);
-      elt->incoming_edges->e = e;
-      elt->incoming_edges->next = NULL;
-      return elt;
-    }
-  /* E was in the hash table.  */
-  else
-    {
-      /* Free ELT as we do not need it anymore, we will extract the
-	 relevant entry from the hash table itself.  */
-      free (elt);
-
-      /* Get the entry stored in the hash table.  */
-      elt = *slot;
-
-      /* If insertion was requested, then we need to add INCOMING_EDGE
-	 to the list of incoming edges associated with E.  */
-      if (insert)
-	{
-	  struct el *el = XNEW (struct el);
-	  el->next = elt->incoming_edges;
-	  el->e = e;
-	  elt->incoming_edges = el;
-	}
-
-      return elt;
-    }
-}
-
-/* Similar to copy_phi_args, except that the PHI arg already exists;
-   it just does not yet have a value associated with it.  */
-
-static void
-copy_phi_arg_into_existing_phi (edge src_e, edge tgt_e)
-{
-  int src_idx = src_e->dest_idx;
-  int tgt_idx = tgt_e->dest_idx;
-
-  /* Iterate over each PHI in e->dest.  */
-  for (gphi_iterator gsi = gsi_start_phis (src_e->dest),
-			   gsi2 = gsi_start_phis (tgt_e->dest);
-       !gsi_end_p (gsi);
-       gsi_next (&gsi), gsi_next (&gsi2))
-    {
-      gphi *src_phi = gsi.phi ();
-      gphi *dest_phi = gsi2.phi ();
-      tree val = gimple_phi_arg_def (src_phi, src_idx);
-      source_location locus = gimple_phi_arg_location (src_phi, src_idx);
-
-      SET_PHI_ARG_DEF (dest_phi, tgt_idx, val);
-      gimple_phi_arg_set_location (dest_phi, tgt_idx, locus);
-    }
-}
-
-/* Given ssa_name DEF, backtrack along jump threading PATH from node IDX
-   to see if it has a constant value in a flow-sensitive manner.  Set
-   LOCUS to the location of the constant PHI arg and return the value.
-   Return DEF directly if PATH is NULL or IDX is zero.  */
-
-static tree
-get_value_locus_in_path (tree def, vec<jump_thread_edge *> *path,
-			 basic_block bb, int idx, source_location *locus)
-{
-  tree arg;
-  gphi *def_phi;
-  basic_block def_bb;
-
-  if (path == NULL || idx == 0)
-    return def;
-
-  def_phi = dyn_cast <gphi *> (SSA_NAME_DEF_STMT (def));
-  if (!def_phi)
-    return def;
-
-  def_bb = gimple_bb (def_phi);
-  /* Don't propagate loop invariants into deeper loops.  */
-  if (!def_bb || bb_loop_depth (def_bb) < bb_loop_depth (bb))
-    return def;
-
-  /* Backtrack jump threading path from IDX to see if def has constant
-     value.  */
-  for (int j = idx - 1; j >= 0; j--)
-    {
-      edge e = (*path)[j]->e;
-      if (e->dest == def_bb)
-	{
-	  arg = gimple_phi_arg_def (def_phi, e->dest_idx);
-	  if (is_gimple_min_invariant (arg))
-	    {
-	      *locus = gimple_phi_arg_location (def_phi, e->dest_idx);
-	      return arg;
-	    }
-	  break;
-	}
-    }
-
-  return def;
-}
-
-/* For each PHI in BB, copy the argument associated with SRC_E to TGT_E.
-   Try to backtrack along jump threading PATH from node IDX to see if
-   the arg has a constant value; if so, copy the constant value instead
-   of the argument itself.  */
-
-static void
-copy_phi_args (basic_block bb, edge src_e, edge tgt_e,
-	       vec<jump_thread_edge *> *path, int idx)
-{
-  gphi_iterator gsi;
-  int src_indx = src_e->dest_idx;
-
-  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
-    {
-      gphi *phi = gsi.phi ();
-      tree def = gimple_phi_arg_def (phi, src_indx);
-      source_location locus = gimple_phi_arg_location (phi, src_indx);
-
-      if (TREE_CODE (def) == SSA_NAME
-	  && !virtual_operand_p (gimple_phi_result (phi)))
-	def = get_value_locus_in_path (def, path, bb, idx, &locus);
-
-      add_phi_arg (phi, def, tgt_e, locus);
-    }
-}
-
-/* We have recently made a copy of ORIG_BB, including its outgoing
-   edges.  The copy is NEW_BB.  Every PHI node in every direct successor of
-   ORIG_BB has a new argument associated with the edge from NEW_BB to the
-   successor.  Initialize the PHI argument so that it is equal to the PHI
-   argument associated with the edge from ORIG_BB to the successor.
-   PATH and IDX are used to check if the new PHI argument has constant
-   value in a flow sensitive manner.  */
-
-static void
-update_destination_phis (basic_block orig_bb, basic_block new_bb,
-			 vec<jump_thread_edge *> *path, int idx)
-{
-  edge_iterator ei;
-  edge e;
-
-  FOR_EACH_EDGE (e, ei, orig_bb->succs)
-    {
-      edge e2 = find_edge (new_bb, e->dest);
-      copy_phi_args (e->dest, e, e2, path, idx);
-    }
-}
-
-/* Given a duplicate block and its single destination (both stored
-   in RD), create an edge between the duplicate and its single
-   destination.
-
-   Add an additional argument to any PHI nodes at the single
-   destination.  IDX is the node in the jump threading path at
-   which we start checking whether the new PHI argument has a
-   constant value along the jump threading path.  */
-
-static void
-create_edge_and_update_destination_phis (struct redirection_data *rd,
-					 basic_block bb, int idx)
-{
-  edge e = make_edge (bb, rd->path->last ()->e->dest, EDGE_FALLTHRU);
-
-  rescan_loop_exit (e, true, false);
-  e->probability = REG_BR_PROB_BASE;
-  e->count = bb->count;
-
-  /* We used to copy the thread path here.  That was added in 2007
-     and dutifully updated through the representation changes in 2013.
-
-     In 2013 we added code to thread from an interior node through
-     the backedge to another interior node.  That runs after the code
-     to thread through loop headers from outside the loop.
-
-     The latter may delete edges in the CFG, including those
-     which appeared in the jump threading path we copied here.  Thus
-     we'd end up using a dangling pointer.
-
-     After reviewing the 2007/2011 code, I can't see how anything
-     depended on copying the AUX field and clearly copying the jump
-     threading path is problematical due to embedded edge pointers.
-     It has been removed.  */
-  e->aux = NULL;
-
-  /* If there are any PHI nodes at the destination of the outgoing edge
-     from the duplicate block, then we will need to add a new argument
-     to them.  The argument should have the same value as the argument
-     associated with the outgoing edge stored in RD.  */
-  copy_phi_args (e->dest, rd->path->last ()->e, e, rd->path, idx);
-}
-
-/* Look through PATH beginning at START and return TRUE if there are
-   any additional blocks that need to be duplicated.  Otherwise,
-   return FALSE.  */
-
-static bool
-any_remaining_duplicated_blocks (vec<jump_thread_edge *> *path,
-				 unsigned int start)
-{
-  for (unsigned int i = start + 1; i < path->length (); i++)
-    {
-      if ((*path)[i]->type == EDGE_COPY_SRC_JOINER_BLOCK
-	  || (*path)[i]->type == EDGE_COPY_SRC_BLOCK)
-	return true;
-    }
-  return false;
-}
-
-
-/* Compute the amount of profile count/frequency coming into the jump threading
-   path stored in RD that we are duplicating, returned in PATH_IN_COUNT_PTR and
-   PATH_IN_FREQ_PTR, as well as the amount of counts flowing out of the
-   duplicated path, returned in PATH_OUT_COUNT_PTR.  LOCAL_INFO is used to
-   identify blocks duplicated for jump threading, which have duplicated
-   edges that need to be ignored in the analysis.  Return true if path contains
-   a joiner, false otherwise.
-
-   In the non-joiner case, this is straightforward - all the counts/frequency
-   flowing into the jump threading path should flow through the duplicated
-   block and out of the duplicated path.
-
-   In the joiner case, it is very tricky.  Some of the counts flowing into
-   the original path go offpath at the joiner.  The problem is that while
-   we know how much total count goes off-path in the original control flow,
-   we don't know how many of the counts corresponding to just the jump
-   threading path go offpath at the joiner.
-
-   For example, assume we have the following control flow and identified
-   jump threading paths:
-
-                A     B     C
-                 \    |    /
-               Ea \   |Eb / Ec
-                   \  |  /
-                    v v v
-                      J       <-- Joiner
-                     / \
-                Eoff/   \Eon
-                   /     \
-                  v       v
-                Soff     Son  <--- Normal
-                         /\
-                      Ed/  \ Ee
-                       /    \
-                      v     v
-                      D      E
-
-            Jump threading paths: A -> J -> Son -> D (path 1)
-                                  C -> J -> Son -> E (path 2)
-
-   Note that the control flow could be more complicated:
-   - Each jump threading path may have more than one incoming edge.  I.e. A and
-   Ea could represent multiple incoming blocks/edges that are included in
-   path 1.
-   - There could be EDGE_NO_COPY_SRC_BLOCK edges after the joiner (either
-   before or after the "normal" copy block).  These are not duplicated onto
-   the jump threading path, as they are single-successor.
-   - Any of the blocks along the path may have other incoming edges that
-   are not part of any jump threading path, but add profile counts along
-   the path.
-
-   In the above example, after all jump threading is complete, we will
-   end up with the following control flow:
-
-                A          B            C
-                |          |            |
-              Ea|          |Eb          |Ec
-                |          |            |
-                v          v            v
-               Ja          J           Jc
-               / \        / \Eon'     / \
-          Eona/   \   ---/---\--------   \Eonc
-             /     \ /  /     \           \
-            v       v  v       v          v
-           Sona     Soff      Son        Sonc
-             \                 /\         /
-              \___________    /  \  _____/
-                          \  /    \/
-                           vv      v
-                            D      E
-
-   The main issue to notice here is that when we are processing path 1
-   (A->J->Son->D) we need to figure out the outgoing edge weights to
-   the duplicated edges Ja->Sona and Ja->Soff, while ensuring that the
-   sum of the incoming weights to D remain Ed.  The problem with simply
-   assuming that Ja (and Jc when processing path 2) has the same outgoing
-   probabilities to its successors as the original block J, is that after
-   all paths are processed and other edges/counts removed (e.g. none
-   of Ec will reach D after processing path 2), we may end up with not
-   enough count flowing along duplicated edge Sona->D.
-
-   Therefore, in the case of a joiner, we keep track of all counts
-   coming in along the current path, as well as from predecessors not
-   on any jump threading path (Eb in the above example).  While we
-   first assume that the duplicated Eona for Ja->Sona has the same
-   probability as the original, we later compensate for other jump
-   threading paths that may eliminate edges.  We do that by keeping track
-   of all counts coming into the original path that are not in a jump
-   thread (Eb in the above example, but as noted earlier, there could
-   be other predecessors incoming to the path at various points, such
-   as at Son).  Call this cumulative non-path count coming into the path
-   before D Enonpath.  We then ensure that the count from Sona->D is at
-   least as big as (Ed - Enonpath), but no bigger than the minimum
-   weight along the jump threading path.  The probabilities of both the
-   original and duplicated joiner block J and Ja will be adjusted
-   accordingly after the updates.  */
-
-static bool
-compute_path_counts (struct redirection_data *rd,
-                     ssa_local_info_t *local_info,
-                     gcov_type *path_in_count_ptr,
-                     gcov_type *path_out_count_ptr,
-                     int *path_in_freq_ptr)
-{
-  edge e = rd->incoming_edges->e;
-  vec<jump_thread_edge *> *path = THREAD_PATH (e);
-  edge elast = path->last ()->e;
-  gcov_type nonpath_count = 0;
-  bool has_joiner = false;
-  gcov_type path_in_count = 0;
-  int path_in_freq = 0;
-
-  /* Start by accumulating incoming edge counts to the path's first bb
-     into a couple buckets:
-        path_in_count: total count of incoming edges that flow into the
-                  current path.
-        nonpath_count: total count of incoming edges that are not
-                  flowing along *any* path.  These are the counts
-                  that will still flow along the original path after
-                  all path duplication is done by potentially multiple
-                  calls to this routine.
-     (any other incoming edge counts are for a different jump threading
-     path that will be handled by a later call to this routine.)
-     To make this easier, start by recording all incoming edges that flow into
-     the current path in a bitmap.  We could add up the path's incoming edge
-     counts here, but we still need to walk all the first bb's incoming edges
-     below to add up the counts of the other edges not included in this jump
-     threading path.  */
-  struct el *next, *el;
-  bitmap in_edge_srcs = BITMAP_ALLOC (NULL);
-  for (el = rd->incoming_edges; el; el = next)
-    {
-      next = el->next;
-      bitmap_set_bit (in_edge_srcs, el->e->src->index);
-    }
-  edge ein;
-  edge_iterator ei;
-  FOR_EACH_EDGE (ein, ei, e->dest->preds)
-    {
-      vec<jump_thread_edge *> *ein_path = THREAD_PATH (ein);
-      /* Simply check the incoming edge src against the set captured above.  */
-      if (ein_path
-          && bitmap_bit_p (in_edge_srcs, (*ein_path)[0]->e->src->index))
-        {
-          /* It is necessary but not sufficient that the last path edges
-             are identical.  There may be different paths that share the
-             same last path edge in the case where the last edge has a nocopy
-             source block.  */
-          gcc_assert (ein_path->last ()->e == elast);
-          path_in_count += ein->count;
-          path_in_freq += EDGE_FREQUENCY (ein);
-        }
-      else if (!ein_path)
-        {
-          /* Keep track of the incoming edges that are not on any jump-threading
-             path.  These counts will still flow out of original path after all
-             jump threading is complete.  */
-          nonpath_count += ein->count;
-        }
-    }
-
-  /* This is needed due to insane incoming frequencies.  */
-  if (path_in_freq > BB_FREQ_MAX)
-    path_in_freq = BB_FREQ_MAX;
-
-  BITMAP_FREE (in_edge_srcs);
-
-  /* Now compute the fraction of the total count coming into the first
-     path bb that is from the current threading path.  */
-  gcov_type total_count = e->dest->count;
-  /* Handle incoming profile insanities.  */
-  if (total_count < path_in_count)
-    path_in_count = total_count;
-  int onpath_scale = GCOV_COMPUTE_SCALE (path_in_count, total_count);
-
-  /* Walk the entire path to do some more computation in order to estimate
-     how much of the path_in_count will flow out of the duplicated threading
-     path.  In the non-joiner case this is straightforward (it should be
-     the same as path_in_count, although we will handle incoming profile
-     insanities by setting it equal to the minimum count along the path).
-
-     In the joiner case, we need to estimate how much of the path_in_count
-     will stay on the threading path after the joiner's conditional branch.
-     We don't really know for sure how much of the counts
-     associated with this path go to each successor of the joiner, but we'll
-     estimate based on the fraction of the total count coming into the path
-     bb that was from the threading paths (computed above in onpath_scale).
-     Afterwards, we will need to do some fixup to account for other threading
-     paths and possible profile insanities.
-
-     In order to estimate the joiner case's counts we also need to update
-     nonpath_count with any additional counts coming into the path.  Other
-     blocks along the path may have additional predecessors from outside
-     the path.  */
-  gcov_type path_out_count = path_in_count;
-  gcov_type min_path_count = path_in_count;
-  for (unsigned int i = 1; i < path->length (); i++)
-    {
-      edge epath = (*path)[i]->e;
-      gcov_type cur_count = epath->count;
-      if ((*path)[i]->type == EDGE_COPY_SRC_JOINER_BLOCK)
-        {
-          has_joiner = true;
-          cur_count = apply_probability (cur_count, onpath_scale);
-        }
-      /* In the joiner case we need to update nonpath_count for any edges
-         coming into the path that will contribute to the count flowing
-         into the path successor.  */
-      if (has_joiner && epath != elast)
-      {
-        /* Look for other incoming edges after joiner.  */
-        FOR_EACH_EDGE (ein, ei, epath->dest->preds)
-          {
-            if (ein != epath
-                /* Ignore in edges from blocks we have duplicated for a
-                   threading path, which have duplicated edge counts until
-                   they are redirected by an invocation of this routine.  */
-                && !bitmap_bit_p (local_info->duplicate_blocks,
-                                  ein->src->index))
-              nonpath_count += ein->count;
-          }
-      }
-      if (cur_count < path_out_count)
-        path_out_count = cur_count;
-      if (epath->count < min_path_count)
-        min_path_count = epath->count;
-    }
-
-  /* We computed path_out_count above assuming that this path targeted
-     the joiner's on-path successor with the same likelihood as it
-     reached the joiner.  However, other thread paths through the joiner
-     may take a different path through the normal copy source block
-     (i.e. they have a different elast), meaning that they do not
-     contribute any counts to this path's elast.  As a result, it may
-     turn out that this path must have more count flowing to the on-path
-     successor of the joiner.  Essentially, all of this path's elast
-     count must be contributed by this path and any nonpath counts
-     (since any path through the joiner with a different elast will not
-     include a copy of this elast in its duplicated path).
-     So ensure that this path's path_out_count is at least the
-     difference between elast->count and nonpath_count.  Otherwise the edge
-     counts after threading will not be sane.  */
-  if (has_joiner && path_out_count < elast->count - nonpath_count)
-  {
-    path_out_count = elast->count - nonpath_count;
-    /* But neither can we go above the minimum count along the path
-       we are duplicating.  This can be an issue due to profile
-       insanities coming in to this pass.  */
-    if (path_out_count > min_path_count)
-      path_out_count = min_path_count;
-  }
-
-  *path_in_count_ptr = path_in_count;
-  *path_out_count_ptr = path_out_count;
-  *path_in_freq_ptr = path_in_freq;
-  return has_joiner;
-}
-
-
-/* Update the counts and frequencies for both an original path
-   edge EPATH and its duplicate EDUP.  The duplicate source block
-   will get a count/frequency of PATH_IN_COUNT and PATH_IN_FREQ,
-   and the duplicate edge EDUP will have a count of PATH_OUT_COUNT.  */
-static void
-update_profile (edge epath, edge edup, gcov_type path_in_count,
-                gcov_type path_out_count, int path_in_freq)
-{
-  /* First update the duplicated block's count / frequency.  */
-  if (edup)
-    {
-      basic_block dup_block = edup->src;
-      gcc_assert (dup_block->count == 0);
-      gcc_assert (dup_block->frequency == 0);
-      dup_block->count = path_in_count;
-      dup_block->frequency = path_in_freq;
-    }
-
-  /* Now update the original block's count and frequency in the
-     opposite manner - remove the counts/freq that will flow
-     into the duplicated block.  Handle underflow due to precision/
-     rounding issues.  */
-  epath->src->count -= path_in_count;
-  if (epath->src->count < 0)
-    epath->src->count = 0;
-  epath->src->frequency -= path_in_freq;
-  if (epath->src->frequency < 0)
-    epath->src->frequency = 0;
-
-  /* Next update this path edge's original and duplicated counts.  We know
-     that the duplicated path will have path_out_count flowing
-     out of it (in the joiner case this is the count along the duplicated path
-     out of the duplicated joiner).  This count can then be removed from the
-     original path edge.  */
-  if (edup)
-    edup->count = path_out_count;
-  epath->count -= path_out_count;
-  gcc_assert (epath->count >= 0);
-}
-
-
-/* The duplicate and original joiner blocks may end up with different
-   probabilities (different from both the original and from each other).
-   Recompute the probabilities here once we have updated the edge
-   counts and frequencies.  */
-
-static void
-recompute_probabilities (basic_block bb)
-{
-  edge esucc;
-  edge_iterator ei;
-  FOR_EACH_EDGE (esucc, ei, bb->succs)
-    {
-      if (!bb->count)
-        continue;
-
-      /* Prevent overflow computation due to insane profiles.  */
-      if (esucc->count < bb->count)
-        esucc->probability = GCOV_COMPUTE_SCALE (esucc->count,
-                                                 bb->count);
-      else
-        /* Can happen with missing/guessed probabilities, since we
-           may determine that more is flowing along duplicated
-           path than joiner succ probabilities allowed.
-           Counts and freqs will be insane after jump threading;
-           at least make sure the probability is sane or we will
-           get a flow verification error.
-           Not much we can do to make counts/freqs sane without
-           redoing the profile estimation.  */
-        esucc->probability = REG_BR_PROB_BASE;
-    }
-}
-
-
-/* Update the counts of the original and duplicated edges from a joiner
-   that go off path, given that we have already determined that the
-   duplicate joiner DUP_BB has incoming count PATH_IN_COUNT and
-   outgoing count along the path PATH_OUT_COUNT.  The original (on-)path
-   edge from joiner is EPATH.  */
-
-static void
-update_joiner_offpath_counts (edge epath, basic_block dup_bb,
-                              gcov_type path_in_count,
-                              gcov_type path_out_count)
-{
-  /* Compute the count that currently flows off path from the joiner.
-     In other words, the total count of joiner's out edges other than
-     epath.  Compute this by walking the successors instead of
-     subtracting epath's count from the joiner bb count, since there
-     are sometimes slight insanities where the total out edge count is
-     larger than the bb count (possibly due to rounding/truncation
-     errors).  */
-  gcov_type total_orig_off_path_count = 0;
-  edge enonpath;
-  edge_iterator ei;
-  FOR_EACH_EDGE (enonpath, ei, epath->src->succs)
-    {
-      if (enonpath == epath)
-        continue;
-      total_orig_off_path_count += enonpath->count;
-    }
-
-  /* For the path that we are duplicating, the amount that will flow
-     off path from the duplicated joiner is the delta between the
-     path's cumulative in count and the portion of that count we
-     estimated above as flowing from the joiner along the duplicated
-     path.  */
-  gcov_type total_dup_off_path_count = path_in_count - path_out_count;
-
-  /* Now do the actual updates of the off-path edges.  */
-  FOR_EACH_EDGE (enonpath, ei, epath->src->succs)
-    {
-      /* Look for edges going off of the threading path.  */
-      if (enonpath == epath)
-        continue;
-
-      /* Find the corresponding edge out of the duplicated joiner.  */
-      edge enonpathdup = find_edge (dup_bb, enonpath->dest);
-      gcc_assert (enonpathdup);
-
-      /* We can't use the original probability of the joiner's out
-         edges, since the probabilities of the original branch
-         and the duplicated branches may vary after all threading is
-         complete.  But apportion the duplicated joiner's off-path
-         total edge count computed earlier (total_dup_off_path_count)
-         among the duplicated off-path edges based on their original
-         ratio to the full off-path count (total_orig_off_path_count).  */
-      int scale = GCOV_COMPUTE_SCALE (enonpath->count,
-                                      total_orig_off_path_count);
-      /* Give the duplicated offpath edge a portion of the duplicated
-         total.  */
-      enonpathdup->count = apply_scale (scale,
-                                        total_dup_off_path_count);
-      /* Now update the original offpath edge count, handling underflow
-         due to rounding errors.  */
-      enonpath->count -= enonpathdup->count;
-      if (enonpath->count < 0)
-        enonpath->count = 0;
-    }
-}
-
-
-/* Check if the paths through RD all have estimated frequencies but zero
-   profile counts.  This is more accurate than checking the entry block
-   for a zero profile count, since profile insanities sometimes creep in.  */
-
-static bool
-estimated_freqs_path (struct redirection_data *rd)
-{
-  edge e = rd->incoming_edges->e;
-  vec<jump_thread_edge *> *path = THREAD_PATH (e);
-  edge ein;
-  edge_iterator ei;
-  bool non_zero_freq = false;
-  FOR_EACH_EDGE (ein, ei, e->dest->preds)
-    {
-      if (ein->count)
-        return false;
-      non_zero_freq |= ein->src->frequency != 0;
-    }
-
-  for (unsigned int i = 1; i < path->length (); i++)
-    {
-      edge epath = (*path)[i]->e;
-      if (epath->src->count)
-        return false;
-      non_zero_freq |= epath->src->frequency != 0;
-      edge esucc;
-      FOR_EACH_EDGE (esucc, ei, epath->src->succs)
-        {
-          if (esucc->count)
-            return false;
-          non_zero_freq |= esucc->src->frequency != 0;
-        }
-    }
-  return non_zero_freq;
-}
-
-
-/* Invoked for routines that have guessed frequencies and no profile
-   counts to record the block and edge frequencies for paths through RD
-   in the profile count fields of those blocks and edges.  This is because
-   ssa_fix_duplicate_block_edges incrementally updates the block and
-   edge counts as edges are redirected, and it is difficult to do that
-   for edge frequencies which are computed on the fly from the source
-   block frequency and probability.  When a block frequency is updated
-   its outgoing edge frequencies are affected and become difficult to
-   adjust.  */
-
-static void
-freqs_to_counts_path (struct redirection_data *rd)
-{
-  edge e = rd->incoming_edges->e;
-  vec<jump_thread_edge *> *path = THREAD_PATH (e);
-  edge ein;
-  edge_iterator ei;
-  FOR_EACH_EDGE (ein, ei, e->dest->preds)
-    {
-      /* Scale up the frequency by REG_BR_PROB_BASE, to avoid rounding
-         errors applying the probability when the frequencies are very
-         small.  */
-      ein->count = apply_probability (ein->src->frequency * REG_BR_PROB_BASE,
-                                      ein->probability);
-    }
-
-  for (unsigned int i = 1; i < path->length (); i++)
-    {
-      edge epath = (*path)[i]->e;
-      edge esucc;
-      /* Scale up the frequency by REG_BR_PROB_BASE, to avoid rounding
-         errors applying the edge probability when the frequencies are very
-         small.  */
-      epath->src->count = epath->src->frequency * REG_BR_PROB_BASE;
-      FOR_EACH_EDGE (esucc, ei, epath->src->succs)
-        esucc->count = apply_probability (esucc->src->count,
-                                          esucc->probability);
-    }
-}
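[Editorial note: the reason freqs_to_counts_path pre-scales frequencies by REG_BR_PROB_BASE is that applying an edge probability to a small raw frequency loses most of the value to integer truncation. A minimal sketch, with a stand-in for GCC's apply_probability:]

```c
#include <assert.h>

#define REG_BR_PROB_BASE 10000
typedef long long gcov_type;

/* Stand-in for GCC's apply_probability: PROB is a fraction with
   REG_BR_PROB_BASE as the unit.  */
static gcov_type
apply_probability (gcov_type count, int prob)
{
  return count * prob / REG_BR_PROB_BASE;
}

/* Record a block frequency in the count field, pre-scaled by
   REG_BR_PROB_BASE so a later probability application keeps the
   fractional part at count scale instead of truncating it.  */
static gcov_type
freq_to_count (int frequency)
{
  return (gcov_type) frequency * REG_BR_PROB_BASE;
}
```

With frequency 5 and a 25% edge probability (2500/10000), the naive product 5 * 2500 / 10000 truncates to 1, while the pre-scaled form yields 12500, preserving the 1.25 ratio.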
-
-
-/* For routines that have guessed frequencies and no profile counts, where we
-   used freqs_to_counts_path to record block and edge frequencies for paths
-   through RD, we clear the counts after completing all updates for RD.
-   The updates in ssa_fix_duplicate_block_edges are based off the count fields,
-   but the block frequencies and edge probabilities were updated as well,
-   so we can simply clear the count fields.  */
-
-static void
-clear_counts_path (struct redirection_data *rd)
-{
-  edge e = rd->incoming_edges->e;
-  vec<jump_thread_edge *> *path = THREAD_PATH (e);
-  edge ein, esucc;
-  edge_iterator ei;
-  FOR_EACH_EDGE (ein, ei, e->dest->preds)
-    ein->count = 0;
-
-  /* First clear counts along original path.  */
-  for (unsigned int i = 1; i < path->length (); i++)
-    {
-      edge epath = (*path)[i]->e;
-      FOR_EACH_EDGE (esucc, ei, epath->src->succs)
-        esucc->count = 0;
-      epath->src->count = 0;
-    }
-  /* Also need to clear the counts along duplicated path.  */
-  for (unsigned int i = 0; i < 2; i++)
-    {
-      basic_block dup = rd->dup_blocks[i];
-      if (!dup)
-        continue;
-      FOR_EACH_EDGE (esucc, ei, dup->succs)
-        esucc->count = 0;
-      dup->count = 0;
-    }
-}
-
-/* Wire up the outgoing edges from the duplicate blocks and
-   update any PHIs as needed.  Also update the profile counts
-   on the original and duplicate blocks and edges.  */
-void
-ssa_fix_duplicate_block_edges (struct redirection_data *rd,
-			       ssa_local_info_t *local_info)
-{
-  bool multi_incomings = (rd->incoming_edges->next != NULL);
-  edge e = rd->incoming_edges->e;
-  vec<jump_thread_edge *> *path = THREAD_PATH (e);
-  edge elast = path->last ()->e;
-  gcov_type path_in_count = 0;
-  gcov_type path_out_count = 0;
-  int path_in_freq = 0;
-
-  /* This routine updates profile counts, frequencies, and probabilities
-     incrementally. Since it is difficult to do the incremental updates
-     using frequencies/probabilities alone, for routines without profile
-     data we first take a snapshot of the existing block and edge frequencies
-     by copying them into the empty profile count fields.  These counts are
-     then used to do the incremental updates, and cleared at the end of this
-     routine.  If the function is marked as having a profile, we still check
-     to see if the paths through RD are using estimated frequencies because
-     the routine had zero profile counts.  */
-  bool do_freqs_to_counts = (profile_status_for_fn (cfun) != PROFILE_READ
-                             || estimated_freqs_path (rd));
-  if (do_freqs_to_counts)
-    freqs_to_counts_path (rd);
-
-  /* First determine how much profile count to move from original
-     path to the duplicate path.  This is tricky in the presence of
-     a joiner (see comments for compute_path_counts), where some portion
-     of the path's counts will flow off-path from the joiner.  In the
-     non-joiner case the path_in_count and path_out_count should be the
-     same.  */
-  bool has_joiner = compute_path_counts (rd, local_info,
-                                         &path_in_count, &path_out_count,
-                                         &path_in_freq);
-
-  int cur_path_freq = path_in_freq;
-  for (unsigned int count = 0, i = 1; i < path->length (); i++)
-    {
-      edge epath = (*path)[i]->e;
-
-      /* If we were threading through a joiner block, then we want
-	 to keep its control statement and redirect an outgoing edge.
-	 Else we want to remove the control statement & edges, then create
-	 a new outgoing edge.  In both cases we may need to update PHIs.  */
-      if ((*path)[i]->type == EDGE_COPY_SRC_JOINER_BLOCK)
-	{
-	  edge victim;
-	  edge e2;
-
-	  gcc_assert (has_joiner);
-
-	  /* This updates the PHIs at the destination of the duplicate
-	     block.  Pass 0 instead of i if we are threading a path which
-	     has multiple incoming edges.  */
-	  update_destination_phis (local_info->bb, rd->dup_blocks[count],
-				   path, multi_incomings ? 0 : i);
-
-	  /* Find the edge from the duplicate block to the block we're
-	     threading through.  That's the edge we want to redirect.  */
-	  victim = find_edge (rd->dup_blocks[count], (*path)[i]->e->dest);
-
-	  /* If there are no remaining blocks on the path to duplicate,
-	     then redirect VICTIM to the final destination of the jump
-	     threading path.  */
-	  if (!any_remaining_duplicated_blocks (path, i))
-	    {
-	      e2 = redirect_edge_and_branch (victim, elast->dest);
-	      /* If we redirected the edge, then we need to copy PHI arguments
-		 at the target.  If the edge already existed (e2 != victim
-		 case), then the PHIs in the target already have the correct
-		 arguments.  */
-	      if (e2 == victim)
-		copy_phi_args (e2->dest, elast, e2,
-			       path, multi_incomings ? 0 : i);
-	    }
-	  else
-	    {
-	      /* Redirect VICTIM to the next duplicated block in the path.  */
-	      e2 = redirect_edge_and_branch (victim, rd->dup_blocks[count + 1]);
-
-	      /* We need to update the PHIs in the next duplicated block.  We
-		 want the new PHI args to have the same value as they had
-		 in the source of the next duplicate block.
-
-		 Thus, we need to know which edge we traversed into the
-		 source of the duplicate.  Furthermore, we may have
-		 traversed many edges to reach the source of the duplicate.
-
-		 Walk through the path starting at element I until we
-		 hit an edge marked with EDGE_COPY_SRC_BLOCK.  We want
-		 the edge from the prior element.  */
-	      for (unsigned int j = i + 1; j < path->length (); j++)
-		{
-		  if ((*path)[j]->type == EDGE_COPY_SRC_BLOCK)
-		    {
-		      copy_phi_arg_into_existing_phi ((*path)[j - 1]->e, e2);
-		      break;
-		    }
-		}
-	    }
-
-	  /* Update the counts and frequency of both the original block
-	     and path edge, and the duplicates.  The path duplicate's
-	     incoming count and frequency are the totals for all edges
-	     incoming to this jump threading path computed earlier.
-	     And we know that the duplicated path will have path_out_count
-	     flowing out of it (i.e. along the duplicated path out of the
-	     duplicated joiner).  */
-	  update_profile (epath, e2, path_in_count, path_out_count,
-			  path_in_freq);
-
-	  /* Next we need to update the counts of the original and duplicated
-	     edges from the joiner that go off path.  */
-	  update_joiner_offpath_counts (epath, e2->src, path_in_count,
-                                        path_out_count);
-
-	  /* Finally, we need to set the probabilities on the duplicated
-	     edges out of the duplicated joiner (e2->src).  The probabilities
-	     along the original path will all be updated below after we finish
-	     processing the whole path.  */
-	  recompute_probabilities (e2->src);
-
-	  /* Record the frequency flowing to the downstream duplicated
-	     path blocks.  */
-	  cur_path_freq = EDGE_FREQUENCY (e2);
-	}
-      else if ((*path)[i]->type == EDGE_COPY_SRC_BLOCK)
-	{
-	  remove_ctrl_stmt_and_useless_edges (rd->dup_blocks[count], NULL);
-	  create_edge_and_update_destination_phis (rd, rd->dup_blocks[count],
-						   multi_incomings ? 0 : i);
-	  if (count == 1)
-	    single_succ_edge (rd->dup_blocks[1])->aux = NULL;
-
-	  /* Update the counts and frequency of both the original block
-	     and path edge, and the duplicates.  Since we are now after
-	     any joiner that may have existed on the path, the count
-	     flowing along the duplicated threaded path is path_out_count.
-	     If we didn't have a joiner, then cur_path_freq was the sum
-	     of the total frequencies along all incoming edges to the
-	     thread path (path_in_freq).  If we had a joiner, it would have
-	     been updated at the end of that handling to the edge frequency
-	     along the duplicated joiner path edge.  */
-	  update_profile (epath, EDGE_SUCC (rd->dup_blocks[count], 0),
-			  path_out_count, path_out_count,
-			  cur_path_freq);
-	}
-      else
-        {
-	  /* No copy case.  In this case we don't have an equivalent block
-	     on the duplicated thread path to update, but we do need
-	     to remove the portion of the counts/freqs that were moved
-	     to the duplicated path from the counts/freqs flowing through
-	     this block on the original path.  Since all the no-copy edges
-	     are after any joiner, the removed count is the same as
-	     path_out_count.
-
-	     If we didn't have a joiner, then cur_path_freq was the sum
-	     of the total frequencies along all incoming edges to the
-	     thread path (path_in_freq).  If we had a joiner, it would have
-	     been updated at the end of that handling to the edge frequency
-	     along the duplicated joiner path edge.  */
-	  update_profile (epath, NULL, path_out_count, path_out_count,
-			  cur_path_freq);
-	}
-
-      /* Increment the index into the duplicated path when we processed
-         a duplicated block.  */
-      if ((*path)[i]->type == EDGE_COPY_SRC_JOINER_BLOCK
-          || (*path)[i]->type == EDGE_COPY_SRC_BLOCK)
-	{
-	  count++;
-	}
-    }
-
-  /* Now walk orig blocks and update their probabilities, since the
-     counts and freqs should be updated properly by above loop.  */
-  for (unsigned int i = 1; i < path->length (); i++)
-    {
-      edge epath = (*path)[i]->e;
-      recompute_probabilities (epath->src);
-    }
-
-  /* Done with all profile and frequency updates, clear counts if they
-     were copied.  */
-  if (do_freqs_to_counts)
-    clear_counts_path (rd);
-}
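[Editorial note: recompute_probabilities itself is not part of this hunk; the idea it implements, deriving each outgoing edge's probability from its share of the block's updated count, can be sketched as follows. The function name and the insanity guard here are illustrative assumptions, not GCC's exact code.]

```c
#include <assert.h>

#define REG_BR_PROB_BASE 10000
typedef long long gcov_type;

/* After counts have been moved between original and duplicated paths,
   rederive an outgoing edge's probability from its share of the block's
   total count.  Clamp edge counts that exceed the block count, since
   incremental updates can leave small profile insanities.  */
static int
edge_probability (gcov_type edge_count, gcov_type block_count)
{
  if (block_count <= 0)
    return 0;
  if (edge_count > block_count)
    edge_count = block_count;
  return (int) (edge_count * REG_BR_PROB_BASE / block_count);
}
```

This is why the loop above only fixes probabilities after all counts along the path have been updated: the probabilities are a pure function of the final counts.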
-
-/* Hash table traversal callback routine to create duplicate blocks.  */
-
-int
-ssa_create_duplicates (struct redirection_data **slot,
-		       ssa_local_info_t *local_info)
-{
-  struct redirection_data *rd = *slot;
-
-  /* The second duplicated block in a jump threading path is specific
-     to the path.  So it gets stored in RD rather than in LOCAL_DATA.
-
-     Each time we're called, we have to look through the path and see
-     if a second block needs to be duplicated.
-
-     Note the search starts with the third edge on the path.  The first
-     edge is the incoming edge, the second edge always has its source
-     duplicated.  Thus we start our search with the third edge.  */
-  vec<jump_thread_edge *> *path = rd->path;
-  for (unsigned int i = 2; i < path->length (); i++)
-    {
-      if ((*path)[i]->type == EDGE_COPY_SRC_BLOCK
-	  || (*path)[i]->type == EDGE_COPY_SRC_JOINER_BLOCK)
-	{
-	  create_block_for_threading ((*path)[i]->e->src, rd, 1,
-                                      &local_info->duplicate_blocks);
-	  break;
-	}
-    }
-
-  /* Create a template block if we have not done so already.  Otherwise
-     use the template to create a new block.  */
-  if (local_info->template_block == NULL)
-    {
-      create_block_for_threading ((*path)[1]->e->src, rd, 0,
-                                  &local_info->duplicate_blocks);
-      local_info->template_block = rd->dup_blocks[0];
-
-      /* We do not create any outgoing edges for the template.  We will
-	 take care of that in a later traversal.  That way we do not
-	 create edges that are going to just be deleted.  */
-    }
-  else
-    {
-      create_block_for_threading (local_info->template_block, rd, 0,
-                                  &local_info->duplicate_blocks);
-
-      /* Go ahead and wire up outgoing edges and update PHIs for the duplicate
-	 block.   */
-      ssa_fix_duplicate_block_edges (rd, local_info);
-    }
-
-  /* Keep walking the hash table.  */
-  return 1;
-}
-
-/* We did not create any outgoing edges for the template block during
-   block creation.  This hash table traversal callback creates the
-   outgoing edge for the template block.  */
-
-inline int
-ssa_fixup_template_block (struct redirection_data **slot,
-			  ssa_local_info_t *local_info)
-{
-  struct redirection_data *rd = *slot;
-
-  /* If this is the template block, halt the traversal after updating
-     it appropriately.
-
-     If we were threading through a joiner block, then we want
-     to keep its control statement and redirect an outgoing edge.
-     Else we want to remove the control statement & edges, then create
-     a new outgoing edge.  In both cases we may need to update PHIs.  */
-  if (rd->dup_blocks[0] && rd->dup_blocks[0] == local_info->template_block)
-    {
-      ssa_fix_duplicate_block_edges (rd, local_info);
-      return 0;
-    }
-
-  return 1;
-}
-
-/* Hash table traversal callback to redirect each incoming edge
-   associated with this hash table element to its new destination.  */
-
-int
-ssa_redirect_edges (struct redirection_data **slot,
-		    ssa_local_info_t *local_info)
-{
-  struct redirection_data *rd = *slot;
-  struct el *next, *el;
-
-  /* Walk over all the incoming edges associated with this
-     hash table entry.  */
-  for (el = rd->incoming_edges; el; el = next)
-    {
-      edge e = el->e;
-      vec<jump_thread_edge *> *path = THREAD_PATH (e);
-
-      /* Go ahead and free this element from the list.  Doing this now
-	 avoids the need for another list walk when we destroy the hash
-	 table.  */
-      next = el->next;
-      free (el);
-
-      thread_stats.num_threaded_edges++;
-
-      if (rd->dup_blocks[0])
-	{
-	  edge e2;
-
-	  if (dump_file && (dump_flags & TDF_DETAILS))
-	    fprintf (dump_file, "  Threaded jump %d --> %d to %d\n",
-		     e->src->index, e->dest->index, rd->dup_blocks[0]->index);
-
-	  /* If we redirect a loop latch edge cancel its loop.  */
-	  if (e->src == e->src->loop_father->latch)
-	    mark_loop_for_removal (e->src->loop_father);
-
-	  /* Redirect the incoming edge (possibly to the joiner block) to the
-	     appropriate duplicate block.  */
-	  e2 = redirect_edge_and_branch (e, rd->dup_blocks[0]);
-	  gcc_assert (e == e2);
-	  flush_pending_stmts (e2);
-	}
-
-      /* Go ahead and clear E->aux.  It's not needed anymore and failure
-	 to clear it will cause all kinds of unpleasant problems later.  */
-      delete_jump_thread_path (path);
-      e->aux = NULL;
-    }
-
-  /* Indicate that we actually threaded one or more jumps.  */
-  if (rd->incoming_edges)
-    local_info->jumps_threaded = true;
-
-  return 1;
-}
-
-/* Return true if this block has no executable statements other than
-   a simple ctrl flow instruction.  When the number of outgoing edges
-   is one, this is equivalent to a "forwarder" block.  */
-
-static bool
-redirection_block_p (basic_block bb)
-{
-  gimple_stmt_iterator gsi;
-
-  /* Advance to the first executable statement.  */
-  gsi = gsi_start_bb (bb);
-  while (!gsi_end_p (gsi)
-	 && (gimple_code (gsi_stmt (gsi)) == GIMPLE_LABEL
-	     || is_gimple_debug (gsi_stmt (gsi))
-	     || gimple_nop_p (gsi_stmt (gsi))))
-    gsi_next (&gsi);
-
-  /* Check if this is an empty block.  */
-  if (gsi_end_p (gsi))
-    return true;
-
-  /* Test that we've reached the terminating control statement.  */
-  return gsi_stmt (gsi)
-	 && (gimple_code (gsi_stmt (gsi)) == GIMPLE_COND
-	     || gimple_code (gsi_stmt (gsi)) == GIMPLE_GOTO
-	     || gimple_code (gsi_stmt (gsi)) == GIMPLE_SWITCH);
-}
-
-/* BB is a block which ends with a COND_EXPR or SWITCH_EXPR and when BB
-   is reached via one or more specific incoming edges, we know which
-   outgoing edge from BB will be traversed.
-
-   We want to redirect those incoming edges to the target of the
-   appropriate outgoing edge.  Doing so avoids a conditional branch
-   and may expose new optimization opportunities.  Note that we have
-   to update dominator tree and SSA graph after such changes.
-
-   The key to keeping the SSA graph update manageable is to duplicate
-   the side effects occurring in BB so that those side effects still
-   occur on the paths which bypass BB after redirecting edges.
-
-   We accomplish this by creating duplicates of BB and arranging for
-   the duplicates to unconditionally pass control to one specific
-   successor of BB.  We then revector the incoming edges into BB to
-   the appropriate duplicate of BB.
-
-   If NOLOOP_ONLY is true, we only perform the threading as long as it
-   does not affect the structure of the loops in a nontrivial way.
-
-   If JOINERS is true, then thread through joiner blocks as well.  */
-
-static bool
-thread_block_1 (basic_block bb, bool noloop_only, bool joiners)
-{
-  /* E is an incoming edge into BB that we may or may not want to
-     redirect to a duplicate of BB.  */
-  edge e, e2;
-  edge_iterator ei;
-  ssa_local_info_t local_info;
-
-  local_info.duplicate_blocks = BITMAP_ALLOC (NULL);
-
-  /* To avoid scanning a linear array for the element we need, we instead
-     use a hash table.  For normal code there should be no noticeable
-     difference.  However, if we have a block with a large number of
-     incoming and outgoing edges such linear searches can get expensive.  */
-  redirection_data
-    = new hash_table<struct redirection_data> (EDGE_COUNT (bb->succs));
-
-  /* Record each unique threaded destination into a hash table for
-     efficient lookups.  */
-  FOR_EACH_EDGE (e, ei, bb->preds)
-    {
-      if (e->aux == NULL)
-	continue;
-
-      vec<jump_thread_edge *> *path = THREAD_PATH (e);
-
-      if (((*path)[1]->type == EDGE_COPY_SRC_JOINER_BLOCK && !joiners)
-	  || ((*path)[1]->type == EDGE_COPY_SRC_BLOCK && joiners))
-	continue;
-
-      e2 = path->last ()->e;
-      if (!e2 || noloop_only)
-	{
-	  /* If NOLOOP_ONLY is true, we only allow threading through the
-	     header of a loop to exit edges.  */
-
-	  /* One case occurs when there was a loop header buried in a jump
-	     threading path that crosses loop boundaries.  We do not try
-	     and thread this elsewhere, so just cancel the jump threading
-	     request by clearing the AUX field now.  */
-	  if ((bb->loop_father != e2->src->loop_father
-	       && !loop_exit_edge_p (e2->src->loop_father, e2))
-	      || (e2->src->loop_father != e2->dest->loop_father
-		  && !loop_exit_edge_p (e2->src->loop_father, e2)))
-	    {
-	      /* Since this case is not handled by our special code
-		 to thread through a loop header, we must explicitly
-		 cancel the threading request here.  */
-	      delete_jump_thread_path (path);
-	      e->aux = NULL;
-	      continue;
-	    }
-
-	  /* Another case occurs when trying to thread through our
-	     own loop header, possibly from inside the loop.  We will
-	     thread these later.  */
-	  unsigned int i;
-	  for (i = 1; i < path->length (); i++)
-	    {
-	      if ((*path)[i]->e->src == bb->loop_father->header
-		  && (!loop_exit_edge_p (bb->loop_father, e2)
-		      || (*path)[1]->type == EDGE_COPY_SRC_JOINER_BLOCK))
-		break;
-	    }
-
-	  if (i != path->length ())
-	    continue;
-	}
-
-      /* Insert the outgoing edge into the hash table if it is not
-	 already in the hash table.  */
-      lookup_redirection_data (e, INSERT);
-    }
-
-  /* We do not update dominance info.  */
-  free_dominance_info (CDI_DOMINATORS);
-
-  /* We know we only thread through the loop header to loop exits.
-     Let the basic block duplication hook know we are not creating
-     a multiple entry loop.  */
-  if (noloop_only
-      && bb == bb->loop_father->header)
-    set_loop_copy (bb->loop_father, loop_outer (bb->loop_father));
-
-  /* Now create duplicates of BB.
-
-     Note that for a block with a high outgoing degree we can waste
-     a lot of time and memory creating and destroying useless edges.
-
-     So we first duplicate BB and remove the control structure at the
-     tail of the duplicate as well as all outgoing edges from the
-     duplicate.  We then use that duplicate block as a template for
-     the rest of the duplicates.  */
-  local_info.template_block = NULL;
-  local_info.bb = bb;
-  local_info.jumps_threaded = false;
-  redirection_data->traverse <ssa_local_info_t *, ssa_create_duplicates>
-			    (&local_info);
-
-  /* The template does not have an outgoing edge.  Create that outgoing
-     edge and update PHI nodes at the edge's target as necessary.
-
-     We do this after creating all the duplicates to avoid creating
-     unnecessary edges.  */
-  redirection_data->traverse <ssa_local_info_t *, ssa_fixup_template_block>
-			    (&local_info);
-
-  /* The hash table traversals above created the duplicate blocks (and the
-     statements within the duplicate blocks).  This loop creates PHI nodes for
-     the duplicated blocks and redirects the incoming edges into BB to reach
-     the duplicates of BB.  */
-  redirection_data->traverse <ssa_local_info_t *, ssa_redirect_edges>
-			    (&local_info);
-
-  /* Done with this block.  Clear REDIRECTION_DATA.  */
-  delete redirection_data;
-  redirection_data = NULL;
-
-  if (noloop_only
-      && bb == bb->loop_father->header)
-    set_loop_copy (bb->loop_father, NULL);
-
-  BITMAP_FREE (local_info.duplicate_blocks);
-  local_info.duplicate_blocks = NULL;
-
-  /* Indicate to our caller whether or not any jumps were threaded.  */
-  return local_info.jumps_threaded;
-}
-
-/* Wrapper for thread_block_1 so that we can first handle jump
-   thread paths which do not involve copying joiner blocks, then
-   handle jump thread paths which have joiner blocks.
-
-   By doing things this way we can be as aggressive as possible and
-   not worry that copying a joiner block will create a jump threading
-   opportunity.  */
-
-static bool
-thread_block (basic_block bb, bool noloop_only)
-{
-  bool retval;
-  retval = thread_block_1 (bb, noloop_only, false);
-  retval |= thread_block_1 (bb, noloop_only, true);
-  return retval;
-}
-
-
-/* Threads edge E through E->dest to the edge THREAD_TARGET (E).  Returns the
-   copy of E->dest created during threading, or E->dest if it was not necessary
-   to copy it (E is its single predecessor).  */
-
-static basic_block
-thread_single_edge (edge e)
-{
-  basic_block bb = e->dest;
-  struct redirection_data rd;
-  vec<jump_thread_edge *> *path = THREAD_PATH (e);
-  edge eto = (*path)[1]->e;
-
-  for (unsigned int i = 0; i < path->length (); i++)
-    delete (*path)[i];
-  delete path;
-  e->aux = NULL;
-
-  thread_stats.num_threaded_edges++;
-
-  if (single_pred_p (bb))
-    {
-      /* If BB has just a single predecessor, we should only remove the
-	 control statement at its end, and all successors except ETO.  */
-      remove_ctrl_stmt_and_useless_edges (bb, eto->dest);
-
-      /* And fixup the flags on the single remaining edge.  */
-      eto->flags &= ~(EDGE_TRUE_VALUE | EDGE_FALSE_VALUE | EDGE_ABNORMAL);
-      eto->flags |= EDGE_FALLTHRU;
-
-      return bb;
-    }
-
-  /* Otherwise, we need to create a copy.  */
-  if (e->dest == eto->src)
-    update_bb_profile_for_threading (bb, EDGE_FREQUENCY (e), e->count, eto);
-
-  vec<jump_thread_edge *> *npath = new vec<jump_thread_edge *> ();
-  jump_thread_edge *x = new jump_thread_edge (e, EDGE_START_JUMP_THREAD);
-  npath->safe_push (x);
-
-  x = new jump_thread_edge (eto, EDGE_COPY_SRC_BLOCK);
-  npath->safe_push (x);
-  rd.path = npath;
-
-  create_block_for_threading (bb, &rd, 0, NULL);
-  remove_ctrl_stmt_and_useless_edges (rd.dup_blocks[0], NULL);
-  create_edge_and_update_destination_phis (&rd, rd.dup_blocks[0], 0);
-
-  if (dump_file && (dump_flags & TDF_DETAILS))
-    fprintf (dump_file, "  Threaded jump %d --> %d to %d\n",
-	     e->src->index, e->dest->index, rd.dup_blocks[0]->index);
-
-  rd.dup_blocks[0]->count = e->count;
-  rd.dup_blocks[0]->frequency = EDGE_FREQUENCY (e);
-  single_succ_edge (rd.dup_blocks[0])->count = e->count;
-  redirect_edge_and_branch (e, rd.dup_blocks[0]);
-  flush_pending_stmts (e);
-
-  return rd.dup_blocks[0];
-}
-
-/* Callback for dfs_enumerate_from.  Returns true if BB is different
-   from STOP and DBDS_CE_STOP.  */
-
-static basic_block dbds_ce_stop;
-static bool
-dbds_continue_enumeration_p (const_basic_block bb, const void *stop)
-{
-  return (bb != (const_basic_block) stop
-	  && bb != dbds_ce_stop);
-}
-
-/* Evaluates the dominance relationship between the latch of LOOP and BB,
-   and returns the state.  */
-
-enum bb_dom_status
-{
-  /* BB does not dominate latch of the LOOP.  */
-  DOMST_NONDOMINATING,
-  /* The LOOP is broken (there is no path from the header to its latch).  */
-  DOMST_LOOP_BROKEN,
-  /* BB dominates the latch of the LOOP.  */
-  DOMST_DOMINATING
-};
-
-static enum bb_dom_status
-determine_bb_domination_status (struct loop *loop, basic_block bb)
-{
-  basic_block *bblocks;
-  unsigned nblocks, i;
-  bool bb_reachable = false;
-  edge_iterator ei;
-  edge e;
-
-  /* This function assumes BB is a successor of LOOP->header.
-     If that is not the case return DOMST_NONDOMINATING which
-     is always safe.  */
-    {
-      bool ok = false;
-
-      FOR_EACH_EDGE (e, ei, bb->preds)
-	{
-	  if (e->src == loop->header)
-	    {
-	      ok = true;
-	      break;
-	    }
-	}
-
-      if (!ok)
-	return DOMST_NONDOMINATING;
-    }
-
-  if (bb == loop->latch)
-    return DOMST_DOMINATING;
-
-  /* Check that BB dominates LOOP->latch, and that it is back-reachable
-     from it.  */
-
-  bblocks = XCNEWVEC (basic_block, loop->num_nodes);
-  dbds_ce_stop = loop->header;
-  nblocks = dfs_enumerate_from (loop->latch, 1, dbds_continue_enumeration_p,
-				bblocks, loop->num_nodes, bb);
-  for (i = 0; i < nblocks; i++)
-    FOR_EACH_EDGE (e, ei, bblocks[i]->preds)
-      {
-	if (e->src == loop->header)
-	  {
-	    free (bblocks);
-	    return DOMST_NONDOMINATING;
-	  }
-	if (e->src == bb)
-	  bb_reachable = true;
-      }
-
-  free (bblocks);
-  return (bb_reachable ? DOMST_DOMINATING : DOMST_LOOP_BROKEN);
-}
-
-/* Return true if BB is part of the new pre-header that is created
-   when threading the latch to DATA.  */
-
-static bool
-def_split_header_continue_p (const_basic_block bb, const void *data)
-{
-  const_basic_block new_header = (const_basic_block) data;
-  const struct loop *l;
-
-  if (bb == new_header
-      || loop_depth (bb->loop_father) < loop_depth (new_header->loop_father))
-    return false;
-  for (l = bb->loop_father; l; l = loop_outer (l))
-    if (l == new_header->loop_father)
-      return true;
-  return false;
-}
-
-/* Thread jumps through the header of LOOP.  Returns true if cfg changes.
-   If MAY_PEEL_LOOP_HEADERS is false, we avoid threading from entry edges
-   to the inside of the loop.  */
-
-static bool
-thread_through_loop_header (struct loop *loop, bool may_peel_loop_headers)
-{
-  basic_block header = loop->header;
-  edge e, tgt_edge, latch = loop_latch_edge (loop);
-  edge_iterator ei;
-  basic_block tgt_bb, atgt_bb;
-  enum bb_dom_status domst;
-
-  /* We have already threaded through headers to exits, so all the threading
-     requests now are to the inside of the loop.  We need to avoid creating
-     irreducible regions (i.e., loops with more than one entry block), and
-     also loops with several latch edges, or new subloops of the loop (although
-     there are cases where it might be appropriate, it is difficult to decide,
-     and doing it wrongly may confuse other optimizers).
-
-     We could handle more general cases here.  However, the intention is to
-     preserve some information about the loop, which is impossible if its
-     structure changes significantly, in a way that is not well understood.
-     Thus we handle only a few important special cases, in which updating
-     the loop-carried information should also be feasible:
-
-     1) Propagation of latch edge to a block that dominates the latch block
-	of a loop.  This aims to handle the following idiom:
-
-	first = 1;
-	while (1)
-	  {
-	    if (first)
-	      initialize;
-	    first = 0;
-	    body;
-	  }
-
-	After threading the latch edge, this becomes
-
-	first = 1;
-	if (first)
-	  initialize;
-	while (1)
-	  {
-	    first = 0;
-	    body;
-	  }
-
-	The original header of the loop is moved out of it, and we may thread
-	the remaining edges through it without further constraints.
-
-     2) All entry edges are propagated to a single basic block that dominates
-	the latch block of the loop.  This aims to handle the following idiom
-	(normally created for "for" loops):
-
-	i = 0;
-	while (1)
-	  {
-	    if (i >= 100)
-	      break;
-	    body;
-	    i++;
-	  }
-
-	This becomes
-
-	i = 0;
-	while (1)
-	  {
-	    body;
-	    i++;
-	    if (i >= 100)
-	      break;
-	  }
-     */
-
-  /* Threading through the header won't improve the code if the header has just
-     one successor.  */
-  if (single_succ_p (header))
-    goto fail;
-
-  /* If we threaded the latch using a joiner block, we cancel the
-     threading opportunity out of an abundance of caution.  However,
-     still allow threading from outside to inside the loop.  */
-  if (latch->aux)
-    {
-      vec<jump_thread_edge *> *path = THREAD_PATH (latch);
-      if ((*path)[1]->type == EDGE_COPY_SRC_JOINER_BLOCK)
-	{
-	  delete_jump_thread_path (path);
-	  latch->aux = NULL;
-	}
-    }
-
-  if (latch->aux)
-    {
-      vec<jump_thread_edge *> *path = THREAD_PATH (latch);
-      tgt_edge = (*path)[1]->e;
-      tgt_bb = tgt_edge->dest;
-    }
-  else if (!may_peel_loop_headers
-	   && !redirection_block_p (loop->header))
-    goto fail;
-  else
-    {
-      tgt_bb = NULL;
-      tgt_edge = NULL;
-      FOR_EACH_EDGE (e, ei, header->preds)
-	{
-	  if (!e->aux)
-	    {
-	      if (e == latch)
-		continue;
-
-	      /* If the latch is not threaded, and there is a header
-		 edge that is not threaded, we would create a loop
-		 with multiple entries.  */
-	      goto fail;
-	    }
-
-	  vec<jump_thread_edge *> *path = THREAD_PATH (e);
-
-	  if ((*path)[1]->type == EDGE_COPY_SRC_JOINER_BLOCK)
-	    goto fail;
-	  tgt_edge = (*path)[1]->e;
-	  atgt_bb = tgt_edge->dest;
-	  if (!tgt_bb)
-	    tgt_bb = atgt_bb;
-	  /* Two targets of threading would make us create loop
-	     with multiple entries.  */
-	  else if (tgt_bb != atgt_bb)
-	    goto fail;
-	}
-
-      if (!tgt_bb)
-	{
-	  /* There are no threading requests.  */
-	  return false;
-	}
-
-      /* Redirecting to empty loop latch is useless.  */
-      if (tgt_bb == loop->latch
-	  && empty_block_p (loop->latch))
-	goto fail;
-    }
-
-  /* The target block must dominate the loop latch, otherwise we would be
-     creating a subloop.  */
-  domst = determine_bb_domination_status (loop, tgt_bb);
-  if (domst == DOMST_NONDOMINATING)
-    goto fail;
-  if (domst == DOMST_LOOP_BROKEN)
-    {
-      /* If the loop ceased to exist, mark it as such, and thread through its
-	 original header.  */
-      mark_loop_for_removal (loop);
-      return thread_block (header, false);
-    }
-
-  if (tgt_bb->loop_father->header == tgt_bb)
-    {
-      /* If the target of the threading is a header of a subloop, we need
-	 to create a preheader for it, so that the headers of the two loops
-	 do not merge.  */
-      if (EDGE_COUNT (tgt_bb->preds) > 2)
-	{
-	  tgt_bb = create_preheader (tgt_bb->loop_father, 0);
-	  gcc_assert (tgt_bb != NULL);
-	}
-      else
-	tgt_bb = split_edge (tgt_edge);
-    }
-
-  if (latch->aux)
-    {
-      basic_block *bblocks;
-      unsigned nblocks, i;
-
-      /* First handle the case latch edge is redirected.  We are copying
-	 the loop header but not creating a multiple entry loop.  Make the
-	 cfg manipulation code aware of that fact.  */
-      set_loop_copy (loop, loop);
-      loop->latch = thread_single_edge (latch);
-      set_loop_copy (loop, NULL);
-      gcc_assert (single_succ (loop->latch) == tgt_bb);
-      loop->header = tgt_bb;
-
-      /* Remove the new pre-header blocks from our loop.  */
-      bblocks = XCNEWVEC (basic_block, loop->num_nodes);
-      nblocks = dfs_enumerate_from (header, 0, def_split_header_continue_p,
-				    bblocks, loop->num_nodes, tgt_bb);
-      for (i = 0; i < nblocks; i++)
-	if (bblocks[i]->loop_father == loop)
-	  {
-	    remove_bb_from_loops (bblocks[i]);
-	    add_bb_to_loop (bblocks[i], loop_outer (loop));
-	  }
-      free (bblocks);
-
-      /* If the new header has multiple latches mark it so.  */
-      FOR_EACH_EDGE (e, ei, loop->header->preds)
-	if (e->src->loop_father == loop
-	    && e->src != loop->latch)
-	  {
-	    loop->latch = NULL;
-	    loops_state_set (LOOPS_MAY_HAVE_MULTIPLE_LATCHES);
-	  }
-
-      /* Cancel remaining threading requests that would make the
-	 loop a multiple entry loop.  */
-      FOR_EACH_EDGE (e, ei, header->preds)
-	{
-	  edge e2;
-
-	  if (e->aux == NULL)
-	    continue;
-
-	  vec<jump_thread_edge *> *path = THREAD_PATH (e);
-	  e2 = path->last ()->e;
-
-	  if (e->src->loop_father != e2->dest->loop_father
-	      && e2->dest != loop->header)
-	    {
-	      delete_jump_thread_path (path);
-	      e->aux = NULL;
-	    }
-	}
-
-      /* Thread the remaining edges through the former header.  */
-      thread_block (header, false);
-    }
-  else
-    {
-      basic_block new_preheader;
-
-      /* Now consider the case entry edges are redirected to the new entry
-	 block.  Remember one entry edge, so that we can find the new
-	 preheader (its destination after threading).  */
-      FOR_EACH_EDGE (e, ei, header->preds)
-	{
-	  if (e->aux)
-	    break;
-	}
-
-      /* The duplicate of the header is the new preheader of the loop.  Ensure
-	 that it is placed correctly in the loop hierarchy.  */
-      set_loop_copy (loop, loop_outer (loop));
-
-      thread_block (header, false);
-      set_loop_copy (loop, NULL);
-      new_preheader = e->dest;
-
-      /* Create the new latch block.  This is always necessary, as the latch
-	 must have only a single successor, but the original header had at
-	 least two successors.  */
-      loop->latch = NULL;
-      mfb_kj_edge = single_succ_edge (new_preheader);
-      loop->header = mfb_kj_edge->dest;
-      latch = make_forwarder_block (tgt_bb, mfb_keep_just, NULL);
-      loop->header = latch->dest;
-      loop->latch = latch->src;
-    }
-
-  return true;
-
-fail:
-  /* We failed to thread anything.  Cancel the requests.  */
-  FOR_EACH_EDGE (e, ei, header->preds)
-    {
-      vec<jump_thread_edge *> *path = THREAD_PATH (e);
-
-      if (path)
-	{
-	  delete_jump_thread_path (path);
-	  e->aux = NULL;
-	}
-    }
-  return false;
-}
-
-/* E1 and E2 are edges into the same basic block.  Return TRUE if the
-   PHI arguments associated with those edges are equal or there are no
-   PHI arguments, otherwise return FALSE.  */
-
-static bool
-phi_args_equal_on_edges (edge e1, edge e2)
-{
-  gphi_iterator gsi;
-  int indx1 = e1->dest_idx;
-  int indx2 = e2->dest_idx;
-
-  for (gsi = gsi_start_phis (e1->dest); !gsi_end_p (gsi); gsi_next (&gsi))
-    {
-      gphi *phi = gsi.phi ();
-
-      if (!operand_equal_p (gimple_phi_arg_def (phi, indx1),
-			    gimple_phi_arg_def (phi, indx2), 0))
-	return false;
-    }
-  return true;
-}
-
-/* Walk through the registered jump threads and convert them into a
-   form convenient for this pass.
-
-   Any block which has incoming edges threaded to outgoing edges
-   will have its entry in THREADED_BLOCK set.
-
-   Any threaded edge will have its new outgoing edge stored in the
-   original edge's AUX field.
-
-   This form avoids the need to walk all the edges in the CFG to
-   discover blocks which need processing and avoids unnecessary
-   hash table lookups to map from threaded edge to new target.  */
+/* Dump a jump threading path.  */
 
 static void
-mark_threaded_blocks (bitmap threaded_blocks)
+dump_jump_thread_path (FILE *dump_file, vec<edge> path)
 {
-  unsigned int i;
-  bitmap_iterator bi;
-  bitmap tmp = BITMAP_ALLOC (NULL);
-  basic_block bb;
-  edge e;
-  edge_iterator ei;
-
-  /* It is possible to have jump threads in which one is a subpath
-     of the other.  ie, (A, B), (B, C), (C, D) where B is a joiner
-     block and (B, C), (C, D) where no joiner block exists.
-
-     When this occurs ignore the jump thread request with the joiner
-     block.  It's totally subsumed by the simpler jump thread request.
-
-     This results in less block copying, simpler CFGs.  More importantly,
-     when we duplicate the joiner block, B, in this case we will create
-     a new threading opportunity that we wouldn't be able to optimize
-     until the next jump threading iteration.
-
-     So first convert the jump thread requests which do not require a
-     joiner block.  */
-  for (i = 0; i < paths.length (); i++)
-    {
-      vec<jump_thread_edge *> *path = paths[i];
-
-      if ((*path)[1]->type != EDGE_COPY_SRC_JOINER_BLOCK)
-	{
-	  edge e = (*path)[0]->e;
-	  e->aux = (void *)path;
-	  bitmap_set_bit (tmp, e->dest->index);
-	}
-    }
-
-  /* Now iterate again, converting cases where we want to thread
-     through a joiner block, but only if no other edge on the path
-     already has a jump thread attached to it.  We do this in two passes,
-     to avoid situations where the order in the paths vec can hide overlapping
-     threads (the path is recorded on the incoming edge, so we would miss
-     cases where the second path starts at a downstream edge on the same
-     path).  First record all joiner paths, deleting any in the unexpected
-     case where there is already a path for that incoming edge.  */
-  for (i = 0; i < paths.length (); i++)
-    {
-      vec<jump_thread_edge *> *path = paths[i];
-
-      if ((*path)[1]->type == EDGE_COPY_SRC_JOINER_BLOCK)
-        {
-	  /* Attach the path to the starting edge if none is yet recorded.  */
-          if ((*path)[0]->e->aux == NULL)
-            (*path)[0]->e->aux = path;
-	  else if (dump_file && (dump_flags & TDF_DETAILS))
-	    dump_jump_thread_path (dump_file, *path, false);
-        }
-    }
-  /* Second, look for paths that have any other jump thread attached to
-     them, and either finish converting them or cancel them.  */
-  for (i = 0; i < paths.length (); i++)
-    {
-      vec<jump_thread_edge *> *path = paths[i];
-      edge e = (*path)[0]->e;
-
-      if ((*path)[1]->type == EDGE_COPY_SRC_JOINER_BLOCK && e->aux == path)
-	{
-	  unsigned int j;
-	  for (j = 1; j < path->length (); j++)
-	    if ((*path)[j]->e->aux != NULL)
-	      break;
-
-	  /* If we iterated through the entire path without exiting the loop,
-	     then we are good to go, record it.  */
-	  if (j == path->length ())
-	    bitmap_set_bit (tmp, e->dest->index);
-	  else
-	    {
-	      e->aux = NULL;
-	      if (dump_file && (dump_flags & TDF_DETAILS))
-	        dump_jump_thread_path (dump_file, *path, false);
-	    }
-	}
-    }
-
-  /* If optimizing for size, only thread through block if we don't have
-     to duplicate it or it's an otherwise empty redirection block.  */
-  if (optimize_function_for_size_p (cfun))
-    {
-      EXECUTE_IF_SET_IN_BITMAP (tmp, 0, i, bi)
-	{
-	  bb = BASIC_BLOCK_FOR_FN (cfun, i);
-	  if (EDGE_COUNT (bb->preds) > 1
-	      && !redirection_block_p (bb))
-	    {
-	      FOR_EACH_EDGE (e, ei, bb->preds)
-		{
-		  if (e->aux)
-		    {
-		      vec<jump_thread_edge *> *path = THREAD_PATH (e);
-		      delete_jump_thread_path (path);
-		      e->aux = NULL;
-		    }
-		}
-	    }
-	  else
-	    bitmap_set_bit (threaded_blocks, i);
-	}
-    }
-  else
-    bitmap_copy (threaded_blocks, tmp);
-
-  /* Look for jump threading paths which cross multiple loop headers.
-
-     The code to thread through loop headers will change the CFG in ways
-     that break assumptions made by the loop optimization code.
-
-     We don't want to blindly cancel the requests.  We can instead do better
-     by trimming off the end of the jump thread path.  */
-  EXECUTE_IF_SET_IN_BITMAP (tmp, 0, i, bi)
-    {
-      basic_block bb = BASIC_BLOCK_FOR_FN (cfun, i);
-      FOR_EACH_EDGE (e, ei, bb->preds)
-	{
-	  if (e->aux)
-	    {
-	      vec<jump_thread_edge *> *path = THREAD_PATH (e);
-
-	      for (unsigned int i = 0, crossed_headers = 0;
-		   i < path->length ();
-		   i++)
-		{
-		  basic_block dest = (*path)[i]->e->dest;
-		  crossed_headers += (dest == dest->loop_father->header);
-		  if (crossed_headers > 1)
-		    {
-		      /* Trim from entry I onwards.  */
-		      for (unsigned int j = i; j < path->length (); j++)
-			delete (*path)[j];
-		      path->truncate (i);
-
-		      /* Now that we've truncated the path, make sure
-			 what's left is still valid.   We need at least
-			 two edges on the path and the last edge can not
-			 be a joiner.  This should never happen, but let's
-			 be safe.  */
-		      if (path->length () < 2
-			  || (path->last ()->type
-			      == EDGE_COPY_SRC_JOINER_BLOCK))
-			{
-			  delete_jump_thread_path (path);
-			  e->aux = NULL;
-			}
-		      break;
-		    }
-		}
-	    }
-	}
-    }
-
-  /* If we have a joiner block (J) which has two successors S1 and S2 and
-     we are threading though S1 and the final destination of the thread
-     is S2, then we must verify that any PHI nodes in S2 have the same
-     PHI arguments for the edge J->S2 and J->S1->...->S2.
-
-     We used to detect this prior to registering the jump thread, but
-     that prohibits propagation of edge equivalences into non-dominated
-     PHI nodes as the equivalency test might occur before propagation.
-
-     This must also occur after we truncate any jump threading paths
-     as this scenario may only show up after truncation.
-
-     This works for now, but will need improvement as part of the FSA
-     optimization.
-
-     Note since we've moved the thread request data to the edges,
-     we have to iterate on those rather than the threaded_edges vector.  */
-  EXECUTE_IF_SET_IN_BITMAP (tmp, 0, i, bi)
-    {
-      bb = BASIC_BLOCK_FOR_FN (cfun, i);
-      FOR_EACH_EDGE (e, ei, bb->preds)
-	{
-	  if (e->aux)
-	    {
-	      vec<jump_thread_edge *> *path = THREAD_PATH (e);
-	      bool have_joiner = ((*path)[1]->type == EDGE_COPY_SRC_JOINER_BLOCK);
+  for (unsigned int i = 0; i < path.length (); i++)
+    if (path[i])
+      fprintf (dump_file, " (%d, %d) ",
+	       path[i]->src->index, path[i]->dest->index);
+    else
+      fprintf (dump_file, " (NULL) ");
 
-	      if (have_joiner)
-		{
-		  basic_block joiner = e->dest;
-		  edge final_edge = path->last ()->e;
-		  basic_block final_dest = final_edge->dest;
-		  edge e2 = find_edge (joiner, final_dest);
-
-		  if (e2 && !phi_args_equal_on_edges (e2, final_edge))
-		    {
-		      delete_jump_thread_path (path);
-		      e->aux = NULL;
-		    }
-		}
-	    }
-	}
-    }
-
-  BITMAP_FREE (tmp);
+  fputc ('\n', dump_file);
 }
 
+/* Delete the jump threading path PATH.  */
 
-/* Return TRUE if BB ends with a switch statement or a computed goto.
-   Otherwise return false.  */
-static bool
-bb_ends_with_multiway_branch (basic_block bb ATTRIBUTE_UNUSED)
+void
+delete_jump_thread_path (vec<edge> *path)
 {
-  gimple stmt = last_stmt (bb);
-  if (stmt && gimple_code (stmt) == GIMPLE_SWITCH)
-    return true;
-  if (stmt && gimple_code (stmt) == GIMPLE_GOTO
-      && TREE_CODE (gimple_goto_dest (stmt)) == SSA_NAME)
-    return true;
-  return false;
+  path->release();
+  delete path;
 }
 
-/* Walk through all blocks and thread incoming edges to the appropriate
-   outgoing edge for each edge pair recorded in THREADED_EDGES.
+/* Jump thread all registered paths.
 
-   It is the caller's responsibility to fix the dominance information
-   and rewrite duplicated SSA_NAMEs back into SSA form.
+   It is the caller's responsibility to fix the dominance information.
 
-   If MAY_PEEL_LOOP_HEADERS is false, we avoid threading edges through
-   loop headers if it does not simplify the loop.
+   If MAY_PEEL_LOOP_HEADERS is false, we avoid threading from loop entry edges
+   to the inside of the loop.
 
    Returns true if one or more edges were threaded, false otherwise.  */
 
@@ -2334,183 +95,65 @@ thread_through_all_blocks (bool may_peel_loop_headers)
 {
   bool retval = false;
   unsigned int i;
-  bitmap_iterator bi;
   bitmap threaded_blocks;
-  struct loop *loop;
+  int num_threaded_edges = 0;
 
   if (!paths.exists ())
     return false;
 
   threaded_blocks = BITMAP_ALLOC (NULL);
-  memset (&thread_stats, 0, sizeof (thread_stats));
 
-  for (i = 0; i < paths.length ();)
+  for (i = 0; i < paths.length (); i++)
     {
-      vec<jump_thread_edge *> *path = paths[i];
-      edge entry = (*path)[0]->e;
+      vec<edge> *path = paths[i];
+      edge entry = (*path)[0];
 
-      if ((*path)[0]->type != EDGE_START_FSM_THREAD
-	  /* Do not jump-thread twice from the same block.  */
-	  || bitmap_bit_p (threaded_blocks, entry->src->index)) {
-	i++;
-	continue;
-      }
+      /* Do not jump-thread twice from the same block.  */
+      if (bitmap_bit_p (threaded_blocks, entry->src->index))
+	{
+	  delete_jump_thread_path (path);
+	  continue;
+	}
 
       unsigned len = path->length ();
-      edge exit = (*path)[len - 1]->e;
       basic_block *region = XNEWVEC (basic_block, len - 1);
+      loop_p loop_i = entry->src->loop_father;
+      bool jumps_into_loop_p = false;
 
       for (unsigned int j = 0; j < len - 1; j++)
-	region[j] = (*path)[j]->e->dest;
+	{
+	  region[j] = (*path)[j]->dest;
+	  loop_p loop_j = region[j]->loop_father;
+	  if (loop_i != loop_j && loop_i == loop_outer (loop_j))
+	    jumps_into_loop_p = true;
+	}
 
-      bool success = gimple_duplicate_sese_region (entry, exit, region,
-						   len - 1, NULL, 0);
-      delete_jump_thread_path (path);
-      paths.unordered_remove (i);
+      bool success = false;
+      if (may_peel_loop_headers || !jumps_into_loop_p)
+	success = gimple_duplicate_seme_region (entry, region, len - 1, NULL, 0);
 
       if (success)
 	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    {
+	      fprintf (dump_file, "  Threaded jump %d --> %d to %d\n",
+		       entry->src->index, entry->dest->index,
+		       (*path)[len-1]->dest->index);
+	      dump_jump_thread_path (dump_file, *path);
+	    }
+
+	  retval = true;
+	  num_threaded_edges++;
+
 	  /* We do not update dominance info.  */
 	  free_dominance_info (CDI_DOMINATORS);
 	  bitmap_set_bit (threaded_blocks, entry->src->index);
 	}
-    }
-
-  for (i = 0; i < paths.length ();)
-    {
-      vec<jump_thread_edge *> *path = paths[i];
-      edge entry = (*path)[0]->e;
-
-      /* Do not jump-thread twice from the same block.  */
-      if (bitmap_bit_p (threaded_blocks, entry->src->index))
-	{
-	  delete_jump_thread_path (path);
-	  paths.unordered_remove (i);
-	}
-      else
-	i++;
-    }
-
-  bitmap_clear (threaded_blocks);
-
-  mark_threaded_blocks (threaded_blocks);
-
-  initialize_original_copy_tables ();
-
-  /* First perform the threading requests that do not affect
-     loop structure.  */
-  EXECUTE_IF_SET_IN_BITMAP (threaded_blocks, 0, i, bi)
-    {
-      basic_block bb = BASIC_BLOCK_FOR_FN (cfun, i);
-
-      if (EDGE_COUNT (bb->preds) > 0)
-	retval |= thread_block (bb, true);
-    }
-
-  /* Then perform the threading through loop headers.  We start with the
-     innermost loop, so that the changes in cfg we perform won't affect
-     further threading.  */
-  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
-    {
-      if (!loop->header
-	  || !bitmap_bit_p (threaded_blocks, loop->header->index))
-	continue;
-
-      retval |= thread_through_loop_header (loop, may_peel_loop_headers);
-    }
-
-  /* Any jump threading paths that are still attached to edges at this
-     point must be one of two cases.
-
-     First, we could have a jump threading path which went from outside
-     a loop to inside a loop that was ignored because a prior jump thread
-     across a backedge was realized (which indirectly causes the loop
-     above to ignore the latter thread).  We can detect these because the
-     loop structures will be different and we do not currently try to
-     optimize this case.
-
-     Second, we could be threading across a backedge to a point within the
-     same loop.  This occurrs for the FSA/FSM optimization and we would
-     like to optimize it.  However, we have to be very careful as this
-     may completely scramble the loop structures, with the result being
-     irreducible loops causing us to throw away our loop structure.
 
-     As a compromise for the latter case, if the thread path ends in
-     a block where the last statement is a multiway branch, then go
-     ahead and thread it, else ignore it.  */
-  basic_block bb;
-  edge e;
-  FOR_EACH_BB_FN (bb, cfun)
-    {
-      /* If we do end up threading here, we can remove elements from
-	 BB->preds.  Thus we can not use the FOR_EACH_EDGE iterator.  */
-      for (edge_iterator ei = ei_start (bb->preds);
-	   (e = ei_safe_edge (ei));)
-	if (e->aux)
-	  {
-	    vec<jump_thread_edge *> *path = THREAD_PATH (e);
-
-	    /* Case 1, threading from outside to inside the loop
-	       after we'd already threaded through the header.  */
-	    if ((*path)[0]->e->dest->loop_father
-		!= path->last ()->e->src->loop_father)
-	      {
-		delete_jump_thread_path (path);
-		e->aux = NULL;
-		ei_next (&ei);
-	      }
-	   else if (bb_ends_with_multiway_branch (path->last ()->e->src))
-	      {
-		/* The code to thread through loop headers may have
-		   split a block with jump threads attached to it.
-
-		   We can identify this with a disjoint jump threading
-		   path.  If found, just remove it.  */
-		for (unsigned int i = 0; i < path->length () - 1; i++)
-		  if ((*path)[i]->e->dest != (*path)[i + 1]->e->src)
-		    {
-		      delete_jump_thread_path (path);
-		      e->aux = NULL;
-		      ei_next (&ei);
-		      break;
-		    }
-
-		/* Our path is still valid, thread it.  */
-	        if (e->aux)
-		  {
-		    struct loop *loop = (*path)[0]->e->dest->loop_father;
-
-		    if (thread_block ((*path)[0]->e->dest, false))
-		      {
-			/* This jump thread likely totally scrambled this loop.
-			   So arrange for it to be fixed up.  */
-			loop->header = NULL;
-			loop->latch = NULL;
-			e->aux = NULL;
-		      }
-		    else
-		      {
-		        delete_jump_thread_path (path);
-			e->aux = NULL;
-			ei_next (&ei);
-		      }
-		  }
-	      }
-	   else
-	      {
-		delete_jump_thread_path (path);
-		e->aux = NULL;
-		ei_next (&ei);
-	      }
- 	  }
-	else
-	  ei_next (&ei);
+      delete_jump_thread_path (path);
     }
 
-  statistics_counter_event (cfun, "Jumps threaded",
-			    thread_stats.num_threaded_edges);
-
-  free_original_copy_tables ();
+  statistics_counter_event (cfun, "Jumps threaded", num_threaded_edges);
 
   BITMAP_FREE (threaded_blocks);
   threaded_blocks = NULL;
@@ -2522,18 +165,6 @@ thread_through_all_blocks (bool may_peel_loop_headers)
   return retval;
 }
 
-/* Delete the jump threading path PATH.  We have to explcitly delete
-   each entry in the vector, then the container.  */
-
-void
-delete_jump_thread_path (vec<jump_thread_edge *> *path)
-{
-  for (unsigned int i = 0; i < path->length (); i++)
-    delete (*path)[i];
-  path->release();
-  delete path;
-}
-
 /* Register a jump threading opportunity.  We queue up all the jump
    threading opportunities discovered by a pass and update the CFG
    and SSA form all at once.
@@ -2543,7 +174,7 @@ delete_jump_thread_path (vec<jump_thread_edge *> *path)
    after fixing the SSA graph.  */
 
 void
-register_jump_thread (vec<jump_thread_edge *> *path)
+register_jump_thread (vec<edge> *path)
 {
   if (!dbg_cnt (registered_jump_thread))
     {
@@ -2554,13 +185,13 @@ register_jump_thread (vec<jump_thread_edge *> *path)
   /* First make sure there are no NULL outgoing edges on the jump threading
      path.  That can happen for jumping to a constant address.  */
   for (unsigned int i = 0; i < path->length (); i++)
-    if ((*path)[i]->e == NULL)
+    if ((*path)[i] == NULL)
       {
 	if (dump_file && (dump_flags & TDF_DETAILS))
 	  {
 	    fprintf (dump_file,
 		     "Found NULL edge in jump threading path.  Cancelling jump thread:\n");
-	    dump_jump_thread_path (dump_file, *path, false);
+	    dump_jump_thread_path (dump_file, *path);
 	  }
 
 	delete_jump_thread_path (path);
@@ -2568,7 +199,7 @@ register_jump_thread (vec<jump_thread_edge *> *path)
       }
 
   if (dump_file && (dump_flags & TDF_DETAILS))
-    dump_jump_thread_path (dump_file, *path, true);
+    dump_jump_thread_path (dump_file, *path);
 
   if (!paths.exists ())
     paths.create (5);
diff --git a/gcc/tree-ssa-threadupdate.h b/gcc/tree-ssa-threadupdate.h
index 42c3a9e..c2e9f72 100644
--- a/gcc/tree-ssa-threadupdate.h
+++ b/gcc/tree-ssa-threadupdate.h
@@ -23,25 +23,6 @@ along with GCC; see the file COPYING3.  If not see
 
 /* In tree-ssa-threadupdate.c.  */
 extern bool thread_through_all_blocks (bool);
-enum jump_thread_edge_type
-{
-  EDGE_START_JUMP_THREAD,
-  EDGE_START_FSM_THREAD,
-  EDGE_COPY_SRC_BLOCK,
-  EDGE_COPY_SRC_JOINER_BLOCK,
-  EDGE_NO_COPY_SRC_BLOCK
-};
-
-class jump_thread_edge
-{
-public:
-  jump_thread_edge (edge e, enum jump_thread_edge_type type)
-    : e (e), type (type) {}
-
-  edge e;
-  enum jump_thread_edge_type type;
-};
-
-extern void register_jump_thread (vec <class jump_thread_edge *> *);
-extern void delete_jump_thread_path (vec <class jump_thread_edge *> *);
+extern void register_jump_thread (vec <edge> *);
+extern void delete_jump_thread_path (vec <edge> *);
 #endif
-- 
1.9.1


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-11-24 22:28                         ` Sebastian Pop
  2014-11-24 23:02                           ` Sebastian Pop
@ 2014-11-24 23:18                           ` Jeff Law
  2014-11-25  9:44                           ` Richard Biener
  2014-11-25 11:03                           ` Markus Trippelsdorf
  3 siblings, 0 replies; 54+ messages in thread
From: Jeff Law @ 2014-11-24 23:18 UTC (permalink / raw)
  To: Sebastian Pop; +Cc: Richard Biener, James Greenhalgh, Steve Ellcey, GCC Patches

On 11/24/14 15:05, Sebastian Pop wrote:
>
> I did an experiment along these lines over the week-end, and now that you
> mention it, I feel less shy to speak about it; well, the patch does not yet
> pass bootstrap, and there are still about 20 failing test-cases.  I feel
> better reading the code generation part of jump-threading after this patch ;-)
> Basically I think all of tree-ssa-threadupdate.c can be replaced by
> duplicate_seme_region, which generalizes the code generation.
Clearly next stage1 stuff, but definitely the right direction IMHO.  If
you get the chance, look at Bodik's thesis.  As far as I know, he's the
only person to really look at how to structure context-sensitive
optimizations in a sane way.



>
> I got my scripts installed on the gcc-farm.  I first used gcc75, an x86_64
> machine, where valgrind was crashing, not recognizing how to decode an
> instruction.  Then I moved to gcc112, a powerpc64-linux machine, where I got
> this data from stage2 cc1plus compiling the same file alias.ii at -O2 (I did
> 3 runs of each, mostly because there is a bit of noise in all these numbers):
Yea, glibc & valgrind really need to update in lock-step as glibc gains 
support for new ISAs.  Certain instructions are supposed to be 
interpreted as nops, but valgrind instead raises an illegal-instruction 
error.

There's a bit of noise when using valgrind like this, but it has 
definitely proven useful in the past.

I'm looking at bitmap_ior_and_compl right now.  Not sure if cg_annotate 
is sending me on a wild goose chase yet or not.

jeff

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-11-24  0:06                     ` Sebastian Pop
  2014-11-24 21:33                       ` Jeff Law
@ 2014-11-24 23:25                       ` Jeff Law
  2014-11-25  0:23                         ` Sebastian Pop
  1 sibling, 1 reply; 54+ messages in thread
From: Jeff Law @ 2014-11-24 23:25 UTC (permalink / raw)
  To: Sebastian Pop; +Cc: Richard Biener, James Greenhalgh, Steve Ellcey, GCC Patches

On 11/23/14 15:22, Sebastian Pop wrote:
> Jeff Law wrote:
>>> PS: I have run some perf analysis with the patch:
>>> - on a bootstrap of GCC I see 3209 FSM jump threads
>>> - libpng and libjpeg contain FSM jump threads, the perf increase is in the 1%
>>>    (measured on simulators and reduced data sets)
>>> - coremark gets jump threaded (as expected)
>>> - I'm setting up the llvm test-suite and I will report perf differences
>> So that's *far* more jump threads than I would expect this to find
>> in a bootstrap of GCC -- like 3 orders of magnitude more than I'd
>> expect to find.
>
> The second patch attached limits the search for FSM jump threads to loops.  With
> that patch, we are now down to 470 jump threads in an x86_64-linux bootstrap
> (and 424 jump threads on powerpc64-linux bootstrap.)
[ ... ]

So why are we returning -1 (block should not be duplicated and not 
suitable for a joiner) at the end of thread_through_normal_block?


      /* When COND cannot be simplified, try to find paths from a control
         statement back through the PHI nodes which would affect that
         control statement.  */
       vec<basic_block, va_gc> *bb_path;
       vec_alloc (bb_path, n_basic_blocks_for_fn (cfun));
       vec_safe_push (bb_path, e->dest);
       hash_set<gimple> *visited_phis = new hash_set<gimple>;

       max_threaded_paths = PARAM_VALUE (PARAM_MAX_FSM_THREAD_PATHS);
      fsm_find_control_statement_thread_paths (cond, visited_phis, bb_path);

       delete visited_phis;
       vec_free (bb_path);

       return -1;

Returning -1 (instead of 0) says to stop; there's no possibility of
threading anything on that path.  I think that's suppressing some
profitable jump threads.  I haven't done more than verify the bitmap
code returns to its prior state with that change.

jeff


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-11-24 23:25                       ` Jeff Law
@ 2014-11-25  0:23                         ` Sebastian Pop
  2014-11-25  3:11                           ` Sebastian Pop
  0 siblings, 1 reply; 54+ messages in thread
From: Sebastian Pop @ 2014-11-25  0:23 UTC (permalink / raw)
  To: Jeff Law; +Cc: Richard Biener, James Greenhalgh, Steve Ellcey, GCC Patches

Jeff Law wrote:
> On 11/23/14 15:22, Sebastian Pop wrote:
> >Jeff Law wrote:
> >>>PS: I have run some perf analysis with the patch:
> >>>- on a bootstrap of GCC I see 3209 FSM jump threads
> >>>- libpng and libjpeg contain FSM jump threads, the perf increase is in the 1%
> >>>   (measured on simulators and reduced data sets)
> >>>- coremark gets jump threaded (as expected)
> >>>- I'm setting up the llvm test-suite and I will report perf differences
> >>So that's *far* more jump threads than I would expect this to find
> >>in a bootstrap of GCC -- like 3 orders of magnitude more than I'd
> >>expect to find.
> >
> >The second patch attached limits the search for FSM jump threads to loops.  With
> >that patch, we are now down to 470 jump threads in an x86_64-linux bootstrap
> >(and 424 jump threads on powerpc64-linux bootstrap.)
> [ ... ]
> 
> So why are we returning -1 (block should not be duplicated and not
> suitable for a joiner) at the end of thread_through_normal_block?
> 
> 
>       /* When COND cannot be simplified, try to find paths from a control
>          statement back through the PHI nodes which would affect that
>          control statement.  */
>       vec<basic_block, va_gc> *bb_path;
>       vec_alloc (bb_path, n_basic_blocks_for_fn (cfun));
>       vec_safe_push (bb_path, e->dest);
>       hash_set<gimple> *visited_phis = new hash_set<gimple>;
> 
>       max_threaded_paths = PARAM_VALUE (PARAM_MAX_FSM_THREAD_PATHS);
>       fsm_find_control_statement_thread_paths (cond, visited_phis, bb_path);
> 
>       delete visited_phis;
>       vec_free (bb_path);
> 
>       return -1;
> 
> Returning -1 (instead of 0) says to stop; there's no possibility of
> threading anything on that path.  I think that's suppressing some
> profitable jump threads.

Thanks for spotting this.

> I haven't done more than verify the
> bitmap code returns to its prior state with that change.

I removed the return -1 and started a bootstrap on powerpc64-linux.
I will report the valgrind output.

Thanks,
Sebastian

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-11-25  0:23                         ` Sebastian Pop
@ 2014-11-25  3:11                           ` Sebastian Pop
  2014-11-25  7:51                             ` Jeff Law
  0 siblings, 1 reply; 54+ messages in thread
From: Sebastian Pop @ 2014-11-25  3:11 UTC (permalink / raw)
  To: Jeff Law; +Cc: Richard Biener, James Greenhalgh, Steve Ellcey, GCC Patches

Sebastian Pop wrote:
> I removed the return -1 and started a bootstrap on powerpc64-linux.

Bootstrap passed on top of the 4 previous patches on powerpc64-linux.

> I will report the valgrind output.

The output from valgrind looks closer to the output of master with no other
patches: still 1M more instructions executed, and 300K more branches.

master no-patch:

==129233== I   refs:      13,910,221,913
==129233==
==129233== Branches:       1,925,715,095  (1,879,277,776 cond + 46,437,319 ind)
==129233== Mispredicts:      144,133,332  (  131,510,534 cond + 12,622,798 ind)
==129233== Mispred rate:             7.4% (          6.9%     +       27.1%   )

4 previous patches + patch to return 0:

==149012== I   refs:      13,911,870,743
==149012== 
==149012== Branches:       1,926,092,629  (1,879,657,768 cond + 46,434,861 ind)
==149012== Mispredicts:      145,551,513  (  132,827,091 cond + 12,724,422 ind)
==149012== Mispred rate:             7.5% (          7.0%     +       27.4%   )

==4492== I   refs:      13,911,899,691
==4492== 
==4492== Branches:       1,926,096,214  (1,879,661,186 cond + 46,435,028 ind)
==4492== Mispredicts:      145,551,707  (  132,827,231 cond + 12,724,476 ind)
==4492== Mispred rate:             7.5% (          7.0%     +       27.4%   )

==19521== I   refs:      13,911,855,711
==19521== 
==19521== Branches:       1,926,090,982  (1,879,656,202 cond + 46,434,780 ind)
==19521== Mispredicts:      145,551,343  (  132,826,948 cond + 12,724,395 ind)
==19521== Mispred rate:             7.5% (          7.0%     +       27.4%   )


* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-11-25  3:11                           ` Sebastian Pop
@ 2014-11-25  7:51                             ` Jeff Law
  2014-11-25 16:41                               ` Jeff Law
  0 siblings, 1 reply; 54+ messages in thread
From: Jeff Law @ 2014-11-25  7:51 UTC (permalink / raw)
  To: Sebastian Pop; +Cc: Richard Biener, James Greenhalgh, Steve Ellcey, GCC Patches

On 11/24/14 18:09, Sebastian Pop wrote:
> Sebastian Pop wrote:
>> I removed the return -1 and started a bootstrap on powerpc64-linux.
>
> Bootstrap passed on top of the 4 previous patches on powerpc64-linux.
>
>> I will report the valgrind output.
>
> The output from valgrind looks closer to the output of master with no other
> patches: still 1M more instructions executed, and 300K more branches
Just ran my suite where we get ~25k more branches, which definitely puts 
us in the noise.  (That's with all 4 patches + fixing the return value.)
I'm going to look a little closer at this stuff tomorrow, but I 
think we've resolved the performance issue.  I'll dig deeper into the 
implementation tomorrow as well.

Cheers,
Jeff

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-11-24 22:28                         ` Sebastian Pop
  2014-11-24 23:02                           ` Sebastian Pop
  2014-11-24 23:18                           ` Jeff Law
@ 2014-11-25  9:44                           ` Richard Biener
  2014-11-25 11:03                           ` Markus Trippelsdorf
  3 siblings, 0 replies; 54+ messages in thread
From: Richard Biener @ 2014-11-25  9:44 UTC (permalink / raw)
  To: Sebastian Pop; +Cc: Jeff Law, James Greenhalgh, Steve Ellcey, GCC Patches

On Mon, Nov 24, 2014 at 11:05 PM, Sebastian Pop <sebpop@gmail.com> wrote:
> Jeff Law wrote:
>> On 11/23/14 15:22, Sebastian Pop wrote:
>> >The second patch attached limits the search for FSM jump threads to loops.  With
>> >that patch, we are now down to 470 jump threads in an x86_64-linux bootstrap
>> >(and 424 jump threads on a powerpc64-linux bootstrap).
>> >
>> Yea, that was one of the things I was going to poke at as well as a
>> quick scan of your patch gave me the impression it wasn't limited to
>> loops.
>>
>> Again, I haven't looked much at the patch, but I got the impression
>> you're doing a backwards walk through the predecessors to discover
>> the result of the COND_EXPR.  Correct?
>
> Yes.
>
>>
>> That's something I'd been wanting to do -- basically start with a
>> COND_EXPR, then walk the dataflow backwards substituting values into
>> the COND_EXPR (possibly creating non-gimple).  Ultimately the goal
>> is to substitute and fold, getting to a constant :-)
>>
>> The forward exhaustive stuff we do now is crazy.  The backwards
>> approach could be decoupled from DOM & VRP into an independent pass,
>> which I think would be wise.
>>
>> Using a SEME region copier is also something I really wanted to do
>> long term.  In fact, I believe a lot of tree-ssa-threadupdate.c
>> ought to be ripped out and replaced with a SEME based copier.
>
> I did an experiment along these lines over the week-end, and now that you
> mention it, I feel less shy to speak about it; the patch does not yet pass
> bootstrap, and there are still about 20 failing test-cases.  I feel better
> reading the code generation part of jump-threading after this patch ;-)
> Basically I think all the tree-ssa-threadupdate.c can be replaced by
> duplicate_seme_region that generalizes the code generation.

Btw I once thought about doing on-the-fly lattice use/update and folding
during basic-block copying (or even re-generating expressions via
simplifying gimple_build ()).  Or have a substitute-and-fold like
facility that can run on SEME regions and do this.
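The backward walk described above (substitute the phi arguments into the
COND_EXPR and fold toward a constant) can be modelled very roughly as
follows.  This is toy code, not GCC internals; every name here is
hypothetical:

```cpp
#include <cassert>
#include <map>
#include <vector>

// Toy model: a conditional at a join block tests variable "state"; a phi
// node records, per incoming edge (predecessor block id), the constant
// value "state" carries on that edge, or -1 when it is not constant.
struct Phi {
  std::map<int, int> args;  // predecessor block id -> value (-1 = unknown)
};

// Fold the condition "state == taken_value" after substituting the phi
// argument for edge PRED.  Returns 1 (always true), 0 (always false),
// or -1 (cannot be folded on this path).
int fold_cond_through_edge(const Phi &phi, int pred, int taken_value) {
  auto it = phi.args.find(pred);
  if (it == phi.args.end() || it->second < 0)
    return -1;                       // value not constant on this edge
  return it->second == taken_value;  // condition folds to a constant
}

// Collect the predecessors whose paths could be jump-threaded: those on
// which the condition folds to a known constant.
std::vector<int> threadable_preds(const Phi &phi, int taken_value) {
  std::vector<int> result;
  for (auto &kv : phi.args)
    if (fold_cond_through_edge(phi, kv.first, taken_value) >= 0)
      result.push_back(kv.first);
  return result;
}
```

In the real pass the substitution walks through chains of phi nodes and
copies of the blocks on each threadable path are made; the toy above only
shows the fold-to-constant decision at a single join.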

Richard.

>> It appears you've built at least parts of two pieces needed to all
>> this as a Bodik style optimizer.  Which is exactly the long term
>> direction I think this code ought to take.
>>
>>
>> >
>> >One of the reasons I think we see more branches is that in sese region copying we
>> >do not use the knowledge of the value of the condition for the last branch in a
>> >jump-thread path: we rely on other propagation passes to remove the branch.  The
>> >last attached patch adds:
>> >
>> >   /* Remove the last branch in the jump thread path.  */
>> >   remove_ctrl_stmt_and_useless_edges (region_copy[n_region - 1], exit->dest);
>> That's certainly a possibility.  But I would expect that even with
>> this limitation something would be picking up the fact that the
>> branch is statically computable (even if it's an RTL optimizer).
>> But it's definitely something to look for.
>>
>> >
>> >Please let me know if the attached patches are producing better results on gcc.
>>
>> For the trunk:
>>   instructions:1339016494968
>>   branches     :243568982489
>>
>> First version of your patch:
>>
>>   instructions:1339739533291
>>   branches:     243806615986
>>
>> Latest version of your patch:
>>
>>   instructions:1339749122609
>>   branches:     243809838262
>
> I think I got about the same results.
>
> I got my scripts installed on the gcc-farm.  I first used gcc75, an x86_64
> machine, where valgrind was crashing, not recognizing how to decode an
> instruction.  Then I moved to gcc112, a powerpc64-linux machine, where I got
> this data from stage2 cc1plus compiling the same file alias.ii at -O2 (I did
> 3 runs of each, mostly because there is a bit of noise in all these numbers):
>
> $ valgrind --tool=cachegrind --cache-sim=no --branch-sim=yes ./cc1plus -O2 ~/alias.ii
>
> all 4 patches:
>
> ==153617== I   refs:      13,914,038,211
> ==153617==
> ==153617== Branches:       1,926,407,760  (1,879,827,481 cond + 46,580,279 ind)
> ==153617== Mispredicts:      144,890,904  (  132,094,105 cond + 12,796,799 ind)
> ==153617== Mispred rate:             7.5% (          7.0%     +       27.4%   )
>
> ==34993== I   refs:      13,915,335,629
> ==34993==
> ==34993== Branches:       1,926,597,919  (1,880,017,558 cond + 46,580,361 ind)
> ==34993== Mispredicts:      144,974,266  (  132,177,440 cond + 12,796,826 ind)
> ==34993== Mispred rate:             7.5% (          7.0%     +       27.4%   )
>
> ==140841== I   refs:      13,915,334,459
> ==140841==
> ==140841== Branches:       1,926,597,819  (1,880,017,458 cond + 46,580,361 ind)
> ==140841== Mispredicts:      144,974,296  (  132,177,470 cond + 12,796,826 ind)
> ==140841== Mispred rate:             7.5% (          7.0%     +       27.4%   )
>
> patch 1:
>
> ==99902== I   refs:      13,915,069,710
> ==99902==
> ==99902== Branches:       1,926,963,813  (1,880,376,148 cond + 46,587,665 ind)
> ==99902== Mispredicts:      145,501,564  (  132,656,576 cond + 12,844,988 ind)
> ==99902== Mispred rate:             7.5% (          7.0%     +       27.5%   )
>
> ==3907== I   refs:      13,915,082,469
> ==3907==
> ==3907== Branches:       1,926,965,218  (1,880,377,471 cond + 46,587,747 ind)
> ==3907== Mispredicts:      145,501,569  (  132,656,554 cond + 12,845,015 ind)
> ==3907== Mispred rate:             7.5% (          7.0%     +       27.5%   )
>
> ==44271== I   refs:      13,915,111,997
> ==44271==
> ==44271== Branches:       1,926,968,863  (1,880,380,952 cond + 46,587,911 ind)
> ==44271== Mispredicts:      145,501,858  (  132,656,789 cond + 12,845,069 ind)
> ==44271== Mispred rate:             7.5% (          7.0%     +       27.5%   )
>
> master no-patch:
>
> ==129233== I   refs:      13,910,221,913
> ==129233==
> ==129233== Branches:       1,925,715,095  (1,879,277,776 cond + 46,437,319 ind)
> ==129233== Mispredicts:      144,133,332  (  131,510,534 cond + 12,622,798 ind)
> ==129233== Mispred rate:             7.4% (          6.9%     +       27.1%   )
>
> ==147659== I   refs:      13,910,216,249
> ==147659==
> ==147659== Branches:       1,925,714,029  (1,879,276,708 cond + 46,437,321 ind)
> ==147659== Mispredicts:      144,127,970  (  131,505,172 cond + 12,622,798 ind)
> ==147659== Mispred rate:             7.4% (          6.9%     +       27.1%   )
>
> ==155206== I   refs:      13,910,201,237
> ==155206==
> ==155206== Branches:       1,925,712,267  (1,879,275,030 cond + 46,437,237 ind)
> ==155206== Mispredicts:      144,128,313  (  131,505,542 cond + 12,622,771 ind)
> ==155206== Mispred rate:             7.4% (          6.9%     +       27.1%   )
>
>
> I think that there are about 5 million more instructions executed with the first
> patch, and the other patches on top do not really help.
>
>>
>> Which is in the noise for this test.  Which makes me wonder if I
>> botched something on the latest run.  It doesn't appear so, but I'm
>> re-running just to be sure.  I'm also turning on -g so that I can
>> use cg_annotate to poke a bit deeper and perhaps identify one or
>> more concrete examples where your patch is making this worse.
>
> Thanks,
> Sebastian
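
The cg_annotate follow-up Jeff mentions in the quoted text is roughly the
following; this is only a sketch, and the cachegrind output file names are
hypothetical:

```shell
# Re-run with branch simulation, then annotate per-function counts.
valgrind --tool=cachegrind --branch-sim=yes ./cc1plus -O2 alias.ii
cg_annotate cachegrind.out.12345

# cg_diff compares two cachegrind output files, localizing regressions.
cg_diff cachegrind.out.master cachegrind.out.patched > diff.out
cg_annotate diff.out
```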

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-11-24 22:28                         ` Sebastian Pop
                                             ` (2 preceding siblings ...)
  2014-11-25  9:44                           ` Richard Biener
@ 2014-11-25 11:03                           ` Markus Trippelsdorf
  3 siblings, 0 replies; 54+ messages in thread
From: Markus Trippelsdorf @ 2014-11-25 11:03 UTC (permalink / raw)
  To: Sebastian Pop
  Cc: Jeff Law, Richard Biener, James Greenhalgh, Steve Ellcey, GCC Patches

On 2014.11.24 at 22:05 +0000, Sebastian Pop wrote:
> I got my scripts installed on the gcc-farm.  I first used gcc75, an x86_64
> machine, where valgrind was crashing, not recognizing how to decode an
> instruction.  Then I moved to gcc112, a powerpc64-linux machine, where I got
> this data from stage2 cc1plus compiling the same file alias.ii at -O2 (I did
> 3 runs of each, mostly because there is a bit of noise in all these numbers):
> 
> $ valgrind --tool=cachegrind --cache-sim=no --branch-sim=yes ./cc1plus -O2 ~/alias.ii

BTW perf is also available on gcc112:

trippels@gcc2-power8 ~ % perf list

List of pre-defined events (to be used in -e):
  cpu-cycles OR cycles                               [Hardware event]
  instructions                                       [Hardware event]
  cache-references                                   [Hardware event]
  cache-misses                                       [Hardware event]
  branch-instructions OR branches                    [Hardware event]
  branch-misses                                      [Hardware event]
  stalled-cycles-frontend OR idle-cycles-frontend    [Hardware event]
  stalled-cycles-backend OR idle-cycles-backend      [Hardware event]

  cpu-clock                                          [Software event]
  task-clock                                         [Software event]
  page-faults OR faults                              [Software event]
  context-switches OR cs                             [Software event]
  cpu-migrations OR migrations                       [Software event]
  minor-faults                                       [Software event]
  major-faults                                       [Software event]
  alignment-faults                                   [Software event]
  emulation-faults                                   [Software event]
  dummy                                              [Software event]

  L1-dcache-loads                                    [Hardware cache event]
  L1-dcache-load-misses                              [Hardware cache event]
  L1-dcache-store-misses                             [Hardware cache event]
  L1-dcache-prefetches                               [Hardware cache event]
  L1-icache-loads                                    [Hardware cache event]
  L1-icache-load-misses                              [Hardware cache event]
  L1-icache-prefetches                               [Hardware cache event]
  LLC-loads                                          [Hardware cache event]
  LLC-load-misses                                    [Hardware cache event]
  LLC-stores                                         [Hardware cache event]
  LLC-store-misses                                   [Hardware cache event]
  LLC-prefetches                                     [Hardware cache event]
  dTLB-load-misses                                   [Hardware cache event]
  iTLB-load-misses                                   [Hardware cache event]
  branch-loads                                       [Hardware cache event]
  branch-load-misses                                 [Hardware cache event]

  rNNN                                               [Raw hardware event descriptor]
  cpu/t1=v1[,t2=v2,t3 ...]/modifier                  [Raw hardware event descriptor]
   (see 'man perf-list' on how to encode it)

  mem:<addr>[:access]                                [Hardware breakpoint]
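
Given those events, a measurement comparable to the cachegrind branch
numbers elsewhere in the thread might look like this; a hypothetical
invocation, with the binary and input as in the valgrind runs:

```shell
# Count instructions, branches and mispredicts for one compile, roughly
# matching cachegrind's "I refs" / "Branches" / "Mispredicts" totals.
perf stat -e instructions,branches,branch-misses ./cc1plus -O2 ~/alias.ii
```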

-- 
Markus

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-11-25  7:51                             ` Jeff Law
@ 2014-11-25 16:41                               ` Jeff Law
  2014-11-25 18:35                                 ` Sebastian Pop
  0 siblings, 1 reply; 54+ messages in thread
From: Jeff Law @ 2014-11-25 16:41 UTC (permalink / raw)
  To: Sebastian Pop; +Cc: Richard Biener, James Greenhalgh, Steve Ellcey, GCC Patches

On 11/24/14 21:55, Jeff Law wrote:
> On 11/24/14 18:09, Sebastian Pop wrote:
>> Sebastian Pop wrote:
>>> I removed the return -1 and started a bootstrap on powerpc64-linux.
>>
>> Bootstrap passed on top of the 4 previous patches on powerpc64-linux.
>>
>>> I will report the valgrind output.
>>
>> The output from valgrind looks closer to the output of master with no
>> other
>> patches: still 1M more instructions executed, and 300K more branches
> Just ran my suite where we get ~25k more branches, which definitely puts
> us in the noise.  (That's with all 4 patches + fixing the return value.)
> I'm going to look a little closer at this stuff tomorrow, but I
> think we've resolved the performance issue.  I'll dig deeper into the
> implementation tomorrow as well.
I was running without your followup patches (must have used the wrong 
bits from my git stash), so those results are bogus, but in a good way.

After fixing that goof, I'm seeing consistent improvements with your set 
of 4 patches and the fix for the wrong return code.  Across the suite, 
~140M fewer branches, not huge, but definitely not in the noise.

So, time to dig into the implementation :-)

Jeff

P.S.  In case you're curious about the noise, it's primarily address hashing.

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-11-25 16:41                               ` Jeff Law
@ 2014-11-25 18:35                                 ` Sebastian Pop
  2014-11-25 21:54                                   ` Sebastian Pop
  0 siblings, 1 reply; 54+ messages in thread
From: Sebastian Pop @ 2014-11-25 18:35 UTC (permalink / raw)
  To: Jeff Law; +Cc: Richard Biener, James Greenhalgh, Steve Ellcey, GCC Patches

[-- Attachment #1: Type: text/plain, Size: 1589 bytes --]

Jeff Law wrote:
> On 11/24/14 21:55, Jeff Law wrote:
> >On 11/24/14 18:09, Sebastian Pop wrote:
> >>Sebastian Pop wrote:
> >>>I removed the return -1 and started a bootstrap on powerpc64-linux.
> >>
> >>Bootstrap passed on top of the 4 previous patches on powerpc64-linux.
> >>
> >>>I will report the valgrind output.
> >>
> >>The output from valgrind looks closer to the output of master with no
> >>other
> >>patches: still 1M more instructions executed, and 300K more branches
> >Just ran my suite where we get ~25k more branches, which definitely puts
> >us in the noise.  (That's with all 4 patches + fixing the return
> >value.)  I'm going to look a little closer at this stuff tomorrow, but I
> >think we've resolved the performance issue.  I'll dig deeper into the
> >implementation tomorrow as well.
> I was running without your followup patches (must have used the
> wrong bits from my git stash), so those results are bogus, but in a
> good way.
> 
> After fixing that goof, I'm seeing consistent improvements with your
> set of 4 patches and the fix for the wrong return code.  Across the
> suite, ~140M fewer branches, not huge, but definitely not in the
> noise.

Thanks for your testing.

> 
> So, time to dig into the implementation :-)
> 

To ease the review, I squashed all the patches in a single one.

I will bootstrap and regression test this patch on x86_64-linux and
powerpc64-linux.  I will also run it on our internal benchmarks, coremark, and
the llvm test-suite.

I will also include a longer testcase that makes sure we do not regress on
coremark.

Sebastian

[-- Attachment #2: 0001-extend-jump-thread-for-finite-state-automata-PR-5474.patch --]
[-- Type: text/x-diff, Size: 20311 bytes --]

From db0f6817768920b497225484fab24a20e5ddf556 Mon Sep 17 00:00:00 2001
From: Sebastian Pop <s.pop@samsung.com>
Date: Fri, 26 Sep 2014 14:54:20 -0500
Subject: [PATCH] extend jump thread for finite state automata PR 54742

Adapted from a patch from James Greenhalgh.

	* params.def (max-fsm-thread-path-insns, max-fsm-thread-length,
	max-fsm-thread-paths): New.

	* doc/invoke.texi (max-fsm-thread-path-insns, max-fsm-thread-length,
	max-fsm-thread-paths): Documented.

	* testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c: New.

	* tree-cfg.c (split_edge_bb_loc): Export.
	* tree-cfg.h (split_edge_bb_loc): Declared extern.

	* tree-ssa-threadedge.c (simplify_control_stmt_condition): Restore the
	original value of cond when simplification fails.
	(fsm_find_thread_path): New.
	(fsm_find_control_statement_thread_paths): New.
	(thread_through_normal_block): Call fsm_find_control_statement_thread_paths.

	* tree-ssa-threadupdate.c (dump_jump_thread_path): Pretty print
	EDGE_START_FSM_THREAD.
	(duplicate_seme_region): New.
	(thread_through_all_blocks): Generate code for EDGE_START_FSM_THREAD edges
	calling duplicate_seme_region.

	* tree-ssa-threadupdate.h (jump_thread_edge_type): Add EDGE_START_FSM_THREAD.
---
 gcc/doc/invoke.texi                              |  12 ++
 gcc/params.def                                   |  15 ++
 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c |  38 ++++
 gcc/tree-cfg.c                                   |   2 +-
 gcc/tree-cfg.h                                   |   1 +
 gcc/tree-ssa-threadedge.c                        | 215 ++++++++++++++++++++++-
 gcc/tree-ssa-threadupdate.c                      | 198 ++++++++++++++++++++-
 gcc/tree-ssa-threadupdate.h                      |   1 +
 8 files changed, 479 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 89edddb..074183f 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -10624,6 +10624,18 @@ large and significantly increase compile time at optimization level
 @option{-O1} and higher.  This parameter is a maximum nubmer of statements
 in a single generated constructor.  Default value is 5000.
 
+@item max-fsm-thread-path-insns
+Maximum number of instructions to copy when duplicating blocks on a
+finite state automaton jump thread path.  The default is 100.
+
+@item max-fsm-thread-length
+Maximum number of basic blocks on a finite state automaton jump thread
+path.  The default is 10.
+
+@item max-fsm-thread-paths
+Maximum number of new jump thread paths to create for a finite state
+automaton.  The default is 50.
+
 @end table
 @end table
 
diff --git a/gcc/params.def b/gcc/params.def
index 9b21c07..edf3f53 100644
--- a/gcc/params.def
+++ b/gcc/params.def
@@ -1140,6 +1140,21 @@ DEFPARAM (PARAM_CHKP_MAX_CTOR_SIZE,
 	  "Maximum number of statements to be included into a single static "
 	  "constructor generated by Pointer Bounds Checker",
 	  5000, 100, 0)
+
+DEFPARAM (PARAM_MAX_FSM_THREAD_PATH_INSNS,
+	  "max-fsm-thread-path-insns",
+	  "Maximum number of instructions to copy when duplicating blocks on a finite state automaton jump thread path",
+	  100, 1, 999999)
+
+DEFPARAM (PARAM_MAX_FSM_THREAD_LENGTH,
+	  "max-fsm-thread-length",
+	  "Maximum number of basic blocks on a finite state automaton jump thread path",
+	  10, 1, 999999)
+
+DEFPARAM (PARAM_MAX_FSM_THREAD_PATHS,
+	  "max-fsm-thread-paths",
+	  "Maximum number of new jump thread paths to create for a finite state automaton",
+	  50, 1, 999999)
 /*
 
 Local variables:
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
new file mode 100644
index 0000000..310d3db
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
@@ -0,0 +1,38 @@
+int sum0, sum1, sum2, sum3;
+int foo (char *s, char **ret)
+{
+  int state=0;
+  char c;
+
+  for (; *s && state != 4; s++)
+    {
+      c = *s;
+      if (c == '*')
+	{
+	  s++;
+	  break;
+	}
+      switch (state)
+	{
+	case 0:
+	  if (c == '+')
+	    state = 1;
+	  else if (c != '-')
+	    sum0+=c;
+	  break;
+	case 1:
+	  if (c == '+')
+	    state = 2;
+	  else if (c == '-')
+	    state = 0;
+	  else
+	    sum1+=c;
+	  break;
+	default:
+	  break;
+	}
+
+    }
+  *ret = s;
+  return state;
+}
diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
index e78554f..b3471d9 100644
--- a/gcc/tree-cfg.c
+++ b/gcc/tree-cfg.c
@@ -2666,7 +2666,7 @@ reinstall_phi_args (edge new_edge, edge old_edge)
    near its "logical" location.  This is of most help to humans looking
    at debugging dumps.  */
 
-static basic_block
+basic_block
 split_edge_bb_loc (edge edge_in)
 {
   basic_block dest = edge_in->dest;
diff --git a/gcc/tree-cfg.h b/gcc/tree-cfg.h
index 626e973..51f0899 100644
--- a/gcc/tree-cfg.h
+++ b/gcc/tree-cfg.h
@@ -67,6 +67,7 @@ extern void verify_gimple_in_cfg (struct function *, bool);
 extern tree gimple_block_label (basic_block);
 extern void add_phi_args_after_copy_bb (basic_block);
 extern void add_phi_args_after_copy (basic_block *, unsigned, edge);
+extern basic_block split_edge_bb_loc (edge);
 extern bool gimple_duplicate_sese_region (edge, edge, basic_block *, unsigned,
 					basic_block *, bool);
 extern bool gimple_duplicate_sese_tail (edge, edge, basic_block *, unsigned,
diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c
index 8b0b7b8..c9fe212 100644
--- a/gcc/tree-ssa-threadedge.c
+++ b/gcc/tree-ssa-threadedge.c
@@ -56,6 +56,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "params.h"
 #include "tree-ssa-threadedge.h"
 #include "builtins.h"
+#include "cfg.h"
+#include "cfganal.h"
 
 /* To avoid code explosion due to jump threading, we limit the
    number of statements we are going to copy.  This variable
@@ -661,6 +663,7 @@ simplify_control_stmt_condition (edge e,
      rather than use a relational operator.  These are simpler to handle.  */
   if (TREE_CODE (cond) == SSA_NAME)
     {
+      tree original_lhs = cond;
       cached_lhs = cond;
 
       /* Get the variable's current value from the equivalence chains.
@@ -689,6 +692,12 @@ simplify_control_stmt_condition (edge e,
 	 pass specific callback to try and simplify it further.  */
       if (cached_lhs && ! is_gimple_min_invariant (cached_lhs))
         cached_lhs = (*simplify) (stmt, stmt);
+
+      /* We couldn't find an invariant.  But callers of this
+	 function may be able to do something useful with the
+	 unmodified destination.  */
+      if (!cached_lhs)
+	cached_lhs = original_lhs;
     }
   else
     cached_lhs = NULL;
@@ -948,6 +957,188 @@ thread_around_empty_blocks (edge taken_edge,
   return false;
 }
 
+/* Return true if there is at least one path from START_BB to END_BB, pushing
+   the blocks of one such path onto PATH.  VISITED_BBS is used to make sure
+   we don't fall into an infinite loop.  */
+
+static bool
+fsm_find_thread_path (basic_block start_bb, basic_block end_bb,
+		      vec<basic_block, va_gc> *&path,
+		      hash_set<basic_block> *visited_bbs, int n_insns)
+{
+  if (start_bb == end_bb)
+    {
+      vec_safe_push (path, start_bb);
+      return true;
+    }
+
+  if (!visited_bbs->add (start_bb))
+    {
+      edge e;
+      edge_iterator ei;
+      FOR_EACH_EDGE (e, ei, start_bb->succs)
+	if (fsm_find_thread_path (e->dest, end_bb, path, visited_bbs, n_insns))
+	  {
+	    vec_safe_push (path, start_bb);
+	    return true;
+	  }
+    }
+
+  return false;
+}
+
+static int max_threaded_paths;
+
+/* Trace the value of the variable EXPR back through any phi nodes looking
+   for places where it gets a constant value, and save each such path.  Stop
+   after having recorded at most PARAM_MAX_FSM_THREAD_PATHS jump threading
+   paths.  */
+
+static void
+fsm_find_control_statement_thread_paths (tree expr,
+					 hash_set<gimple> *visited_phis,
+					 vec<basic_block, va_gc> *&path)
+{
+  tree var = SSA_NAME_VAR (expr);
+  gimple def_stmt = SSA_NAME_DEF_STMT (expr);
+  basic_block var_bb = gimple_bb (def_stmt);
+
+  if (var == NULL || var_bb == NULL)
+    return;
+
+  vec<basic_block, va_gc> *next_path;
+  vec_alloc (next_path, n_basic_blocks_for_fn (cfun));
+
+  basic_block last_bb_in_path = path->last ();
+
+  /* Put the path from var_bb to last_bb_in_path into next_path.  */
+  if (var_bb != last_bb_in_path)
+    {
+      edge e;
+      int e_count = 0;
+      edge_iterator ei;
+
+      FOR_EACH_EDGE (e, ei, last_bb_in_path->preds)
+	{
+	  hash_set<basic_block> *visited_bbs = new hash_set<basic_block>;
+
+	  if (fsm_find_thread_path (var_bb, e->src, next_path, visited_bbs, 0))
+	    ++e_count;
+
+	  delete visited_bbs;
+
+	  /* If there is more than one path, stop.  */
+	  if (e_count > 1)
+	    {
+	      vec_free (next_path);
+	      return;
+	    }
+	}
+    }
+
+  /* Visit PHI nodes once.  */
+  if (gimple_code (def_stmt) != GIMPLE_PHI
+      || visited_phis->add (def_stmt))
+    {
+      vec_free (next_path);
+      return;
+    }
+
+  gphi *phi = as_a <gphi *> (def_stmt);
+
+  /* Append all the nodes from next_path to path.  */
+  vec_safe_splice (path, next_path);
+  gcc_assert (path->last () == var_bb);
+
+  /* Iterate over the arguments of PHI.  */
+  unsigned int i;
+  for (i = 0; i < gimple_phi_num_args (phi); i++)
+    {
+      tree arg = gimple_phi_arg_def (phi, i);
+      basic_block bbi = gimple_phi_arg_edge (phi, i)->src;
+
+      /* Skip edges pointing outside the current loop.  */
+      if (!arg || var_bb->loop_father != bbi->loop_father)
+	continue;
+
+      /* Add BBI to the path.  */
+      vec_safe_push (path, bbi);
+
+      if (TREE_CODE (arg) == INTEGER_CST)
+	{
+	  int n = path->length ();
+
+	  /* A path with fewer than 3 nodes should not be jump-threaded.  */
+	  if (n > 2 && n < PARAM_VALUE (PARAM_MAX_FSM_THREAD_LENGTH)
+	      && max_threaded_paths > 0)
+	    {
+	      int n_insns = 0;
+	      gimple_stmt_iterator gsi;
+	      int j;
+	      loop_p loop = (*path)[0]->loop_father;
+	      bool path_crosses_loops = false;
+
+	      for (j = 1; j < n - 1; j++)
+		{
+		  basic_block bb = (*path)[j];
+		  if (bb->loop_father != loop)
+		    {
+		      path_crosses_loops = true;
+		      break;
+		    }
+		  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
+		       gsi_next (&gsi))
+		    ++n_insns;
+		}
+
+	      if (!path_crosses_loops
+		  && n_insns < PARAM_VALUE (PARAM_MAX_FSM_THREAD_PATH_INSNS))
+		{
+		  vec<jump_thread_edge *> *jump_thread_path
+		    = new vec<jump_thread_edge *> ();
+		  int joiners = 0;
+
+		  for (j = 0; j < n - 1; j++)
+		    {
+		      edge e = find_edge ((*path)[n - j - 1],
+					  (*path)[n - j - 2]);
+		      gcc_assert (e);
+		      enum jump_thread_edge_type kind;
+
+		      if (j == 0)
+			kind = EDGE_START_FSM_THREAD;
+		      else if (single_pred_p (e->src))
+			kind = EDGE_NO_COPY_SRC_BLOCK;
+		      else
+			{
+			  kind = EDGE_COPY_SRC_JOINER_BLOCK;
+			  ++joiners;
+			}
+
+		      jump_thread_edge *x = new jump_thread_edge (e, kind);
+		      jump_thread_path->safe_push (x);
+		    }
+
+		  /* Add the edge taken when the control variable has value ARG.  */
+		  edge taken_edge = find_taken_edge ((*path)[0], arg);
+		  jump_thread_edge *x
+		    = new jump_thread_edge (taken_edge, EDGE_NO_COPY_SRC_BLOCK);
+		  jump_thread_path->safe_push (x);
+
+		  register_jump_thread (jump_thread_path);
+		  --max_threaded_paths;
+		}
+	    }
+	}
+      else if (TREE_CODE (arg) == SSA_NAME)
+	fsm_find_control_statement_thread_paths (arg, visited_phis, path);
+
+      /* Remove BBI from the path.  */
+      path->pop ();
+    }
+
+  /* Remove all the nodes that we added from next_path.  */
+  vec_safe_truncate (path, (path->length () - next_path->length ()));
+  vec_free (next_path);
+}
+
 /* We are exiting E->src, see if E->dest ends with a conditional
    jump which has a known value when reached via E.
 
@@ -1033,7 +1224,10 @@ thread_through_normal_block (edge e,
       cond = simplify_control_stmt_condition (e, stmt, dummy_cond, simplify,
 					      handle_dominating_asserts);
 
-      if (cond && is_gimple_min_invariant (cond))
+      if (!cond)
+	return 0;
+
+      if (is_gimple_min_invariant (cond))
 	{
 	  edge taken_edge = find_taken_edge (e->dest, cond);
 	  basic_block dest = (taken_edge ? taken_edge->dest : NULL);
@@ -1079,6 +1273,25 @@ thread_through_normal_block (edge e,
 				      backedge_seen_p);
 	  return 1;
 	}
+
+      if (TREE_CODE (cond) != SSA_NAME
+	  || e->dest->loop_father != e->src->loop_father
+	  || loop_depth (e->dest->loop_father) == 0)
+	return 0;
+
+      /* When COND cannot be simplified, try to find paths from a control
+	 statement back through the PHI nodes which would affect that control
+	 statement.  */
+      vec<basic_block, va_gc> *bb_path;
+      vec_alloc (bb_path, n_basic_blocks_for_fn (cfun));
+      vec_safe_push (bb_path, e->dest);
+      hash_set<gimple> *visited_phis = new hash_set<gimple>;
+
+      max_threaded_paths = PARAM_VALUE (PARAM_MAX_FSM_THREAD_PATHS);
+      fsm_find_control_statement_thread_paths (cond, visited_phis, bb_path);
+
+      delete visited_phis;
+      vec_free (bb_path);
     }
   return 0;
 }
diff --git a/gcc/tree-ssa-threadupdate.c b/gcc/tree-ssa-threadupdate.c
index ca0b8bf..5243d0f 100644
--- a/gcc/tree-ssa-threadupdate.c
+++ b/gcc/tree-ssa-threadupdate.c
@@ -167,8 +167,9 @@ dump_jump_thread_path (FILE *dump_file, vec<jump_thread_edge *> path,
 		       bool registering)
 {
   fprintf (dump_file,
-	   "  %s jump thread: (%d, %d) incoming edge; ",
+	   "  %s%s jump thread: (%d, %d) incoming edge; ",
 	   (registering ? "Registering" : "Cancelling"),
+	   (path[0]->type == EDGE_START_FSM_THREAD ? " FSM" : ""),
 	   path[0]->e->src->index, path[0]->e->dest->index);
 
   for (unsigned int i = 1; i < path.length (); i++)
@@ -2317,6 +2318,152 @@ bb_ends_with_multiway_branch (basic_block bb ATTRIBUTE_UNUSED)
   return false;
 }
 
+/* Duplicates a Single Entry Multiple Exit REGION (set of N_REGION basic
+   blocks).  The ENTRY edge is redirected to the duplicate of the region.  If
+   REGION is not a Single Entry region, ignore any incoming edges other than
+   ENTRY: this makes the copied region a Single Entry region.
+
+   Remove the last conditional statement in the last basic block in the REGION,
+   and create a single fallthru edge pointing to the same destination as the
+   EXIT edge.
+
+   The new basic blocks are stored to REGION_COPY in the same order as they had
+   in REGION, provided that REGION_COPY is not NULL.
+
+   Returns false if it is unable to copy the region, true otherwise.  */
+
+static bool
+duplicate_seme_region (edge entry, edge exit,
+		       basic_block *region, unsigned n_region,
+		       basic_block *region_copy)
+{
+  unsigned i;
+  bool free_region_copy = false, copying_header = false;
+  struct loop *loop = entry->dest->loop_father;
+  edge exit_copy;
+  edge redirected;
+  int total_freq = 0, entry_freq = 0;
+  gcov_type total_count = 0, entry_count = 0;
+
+  if (!can_copy_bbs_p (region, n_region))
+    return false;
+
+  /* Some sanity checking.  Note that we do not check for all possible
+     misuses of the functions.  I.e. if you ask to copy something weird,
+     it will work, but the state of structures probably will not be
+     correct.  */
+  for (i = 0; i < n_region; i++)
+    {
+      /* We do not handle subloops, i.e. all the blocks must belong to the
+	 same loop.  */
+      if (region[i]->loop_father != loop)
+	return false;
+    }
+
+  initialize_original_copy_tables ();
+
+  if (copying_header)
+    set_loop_copy (loop, loop_outer (loop));
+  else
+    set_loop_copy (loop, loop);
+
+  if (!region_copy)
+    {
+      region_copy = XNEWVEC (basic_block, n_region);
+      free_region_copy = true;
+    }
+
+  if (entry->dest->count)
+    {
+      total_count = entry->dest->count;
+      entry_count = entry->count;
+      /* Fix up corner cases, to avoid division by zero or creation of negative
+	 frequencies.  */
+      if (entry_count > total_count)
+	entry_count = total_count;
+    }
+  else
+    {
+      total_freq = entry->dest->frequency;
+      entry_freq = EDGE_FREQUENCY (entry);
+      /* Fix up corner cases, to avoid division by zero or creation of negative
+	 frequencies.  */
+      if (total_freq == 0)
+	total_freq = 1;
+      else if (entry_freq > total_freq)
+	entry_freq = total_freq;
+    }
+
+  copy_bbs (region, n_region, region_copy, &exit, 1, &exit_copy, loop,
+	    split_edge_bb_loc (entry), 0);
+  if (total_count)
+    {
+      scale_bbs_frequencies_gcov_type (region, n_region,
+				       total_count - entry_count,
+				       total_count);
+      scale_bbs_frequencies_gcov_type (region_copy, n_region, entry_count,
+				       total_count);
+    }
+  else
+    {
+      scale_bbs_frequencies_int (region, n_region, total_freq - entry_freq,
+				 total_freq);
+      scale_bbs_frequencies_int (region_copy, n_region, entry_freq, total_freq);
+    }
+
+#ifdef ENABLE_CHECKING
+  /* Make sure no edge other than ENTRY is entering the copied region.  */
+  for (i = 0; i < n_region; i++)
+    {
+      edge e;
+      edge_iterator ei;
+      basic_block bb = region_copy[i];
+
+      if (single_pred_p (bb))
+	continue;
+
+      for (ei = ei_start (bb->preds); (e = ei_safe_edge (ei)); ei_next (&ei))
+	{
+	  basic_block x = e->src;
+	  bool found = false;
+
+	  for (unsigned j = 0; j < n_region; j++)
+	    if (x == region_copy[j])
+	      {
+		found = true;
+		break;
+	      }
+
+	  gcc_assert (found);
+	}
+    }
+#endif
+
+  /* Remove the last branch in the jump thread path.  */
+  remove_ctrl_stmt_and_useless_edges (region_copy[n_region - 1], exit->dest);
+  edge e = make_edge (region_copy[n_region - 1], exit->dest, EDGE_FALLTHRU);
+
+  if (e)
+    {
+      rescan_loop_exit (e, true, false);
+      e->probability = REG_BR_PROB_BASE;
+      e->count = region_copy[n_region - 1]->count;
+    }
+
+  /* Redirect the entry and add the phi node arguments.  */
+  redirected = redirect_edge_and_branch (entry, get_bb_copy (entry->dest));
+  gcc_assert (redirected != NULL);
+  flush_pending_stmts (entry);
+
+  /* Add the other PHI node arguments.  */
+  add_phi_args_after_copy (region_copy, n_region, NULL);
+
+  if (free_region_copy)
+    free (region_copy);
+
+  free_original_copy_tables ();
+  return true;
+}
+
 /* Walk through all blocks and thread incoming edges to the appropriate
    outgoing edge for each edge pair recorded in THREADED_EDGES.
 
@@ -2343,6 +2490,55 @@ thread_through_all_blocks (bool may_peel_loop_headers)
   threaded_blocks = BITMAP_ALLOC (NULL);
   memset (&thread_stats, 0, sizeof (thread_stats));
 
+  for (i = 0; i < paths.length ();)
+    {
+      vec<jump_thread_edge *> *path = paths[i];
+      edge entry = (*path)[0]->e;
+
+      if ((*path)[0]->type != EDGE_START_FSM_THREAD
+	  /* Do not jump-thread twice from the same block.  */
+	  || bitmap_bit_p (threaded_blocks, entry->src->index))
+	{
+	  i++;
+	  continue;
+	}
+
+      unsigned len = path->length ();
+      edge exit = (*path)[len - 1]->e;
+      basic_block *region = XNEWVEC (basic_block, len - 1);
+
+      for (unsigned int j = 0; j < len - 1; j++)
+	region[j] = (*path)[j]->e->dest;
+
+      bool success = duplicate_seme_region (entry, exit, region,
+					    len - 1, NULL);
+      if (success)
+	{
+	  /* We do not update dominance info.  */
+	  free_dominance_info (CDI_DOMINATORS);
+	  bitmap_set_bit (threaded_blocks, entry->src->index);
+	}
+
+      delete_jump_thread_path (path);
+      paths.unordered_remove (i);
+    }
+
+  for (i = 0; i < paths.length ();)
+    {
+      vec<jump_thread_edge *> *path = paths[i];
+      edge entry = (*path)[0]->e;
+
+      /* Do not jump-thread twice from the same block.  */
+      if (bitmap_bit_p (threaded_blocks, entry->src->index))
+	{
+	  delete_jump_thread_path (path);
+	  paths.unordered_remove (i);
+	}
+      else
+	i++;
+    }
+
+  bitmap_clear (threaded_blocks);
+
   mark_threaded_blocks (threaded_blocks);
 
   initialize_original_copy_tables ();
diff --git a/gcc/tree-ssa-threadupdate.h b/gcc/tree-ssa-threadupdate.h
index 426aca5..42c3a9e 100644
--- a/gcc/tree-ssa-threadupdate.h
+++ b/gcc/tree-ssa-threadupdate.h
@@ -26,6 +26,7 @@ extern bool thread_through_all_blocks (bool);
 enum jump_thread_edge_type
 {
   EDGE_START_JUMP_THREAD,
+  EDGE_START_FSM_THREAD,
   EDGE_COPY_SRC_BLOCK,
   EDGE_COPY_SRC_JOINER_BLOCK,
   EDGE_NO_COPY_SRC_BLOCK
-- 
1.9.3


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-11-25 18:35                                 ` Sebastian Pop
@ 2014-11-25 21:54                                   ` Sebastian Pop
  2014-12-01 21:06                                     ` Jeff Law
  0 siblings, 1 reply; 54+ messages in thread
From: Sebastian Pop @ 2014-11-25 21:54 UTC (permalink / raw)
  To: Jeff Law; +Cc: Richard Biener, James Greenhalgh, Steve Ellcey, GCC Patches

[-- Attachment #1: Type: text/plain, Size: 447 bytes --]

Sebastian Pop wrote:
> I will bootstrap and regression test this patch on x86_64-linux and
> powerpc64-linux.  I will also run it on our internal benchmarks, coremark, and
> the llvm test-suite.
> 
> I will also include a longer testcase that makes sure we do not regress on
> coremark.

Done all the above.  Attached is the new patch with a new testcase.  I have also
added verify_seme, inspired by the recent patch adding verify_sese.

Sebastian

[-- Attachment #2: 0001-extend-jump-thread-for-finite-state-automata-PR-5474.patch --]
[-- Type: text/x-diff, Size: 23045 bytes --]

From ca222d5222fb976c7aa258d3e3c04e593f42f7a2 Mon Sep 17 00:00:00 2001
From: Sebastian Pop <s.pop@samsung.com>
Date: Fri, 26 Sep 2014 14:54:20 -0500
Subject: [PATCH] extend jump thread for finite state automata PR 54742

Adapted from a patch from James Greenhalgh.

	* params.def (max-fsm-thread-path-insns, max-fsm-thread-length,
	max-fsm-thread-paths): New.

	* doc/invoke.texi (max-fsm-thread-path-insns, max-fsm-thread-length,
	max-fsm-thread-paths): Documented.

	* tree-cfg.c (split_edge_bb_loc): Export.
	* tree-cfg.h (split_edge_bb_loc): Declared extern.

	* tree-ssa-threadedge.c (simplify_control_stmt_condition): Restore the
	original value of cond when simplification fails.
	(fsm_find_thread_path): New.
	(fsm_find_control_statement_thread_paths): New.
	(thread_through_normal_block): Call fsm_find_control_statement_thread_paths.

	* tree-ssa-threadupdate.c (dump_jump_thread_path): Pretty print
	EDGE_START_FSM_THREAD.
	(verify_seme): New.
	(duplicate_seme_region): New.
	(thread_through_all_blocks): Generate code for EDGE_START_FSM_THREAD edges
	calling duplicate_seme_region.

	* tree-ssa-threadupdate.h (jump_thread_edge_type): Add EDGE_START_FSM_THREAD.

	* testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c: New.
	* testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c: New.
---
 gcc/doc/invoke.texi                              |   12 ++
 gcc/params.def                                   |   15 ++
 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c |   43 +++++
 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c |  127 +++++++++++++
 gcc/tree-cfg.c                                   |    2 +-
 gcc/tree-cfg.h                                   |    1 +
 gcc/tree-ssa-threadedge.c                        |  215 +++++++++++++++++++++-
 gcc/tree-ssa-threadupdate.c                      |  201 +++++++++++++++++++-
 gcc/tree-ssa-threadupdate.h                      |    1 +
 9 files changed, 614 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 89edddb..074183f 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -10624,6 +10624,18 @@ large and significantly increase compile time at optimization level
 @option{-O1} and higher.  This parameter is a maximum nubmer of statements
 in a single generated constructor.  Default value is 5000.
 
+@item max-fsm-thread-path-insns
+Maximum number of instructions to copy when duplicating blocks on a
+finite state automaton jump thread path.  The default is 100.
+
+@item max-fsm-thread-length
+Maximum number of basic blocks on a finite state automaton jump thread
+path.  The default is 10.
+
+@item max-fsm-thread-paths
+Maximum number of new jump thread paths to create for a finite state
+automaton.  The default is 50.
+
 @end table
 @end table
 
diff --git a/gcc/params.def b/gcc/params.def
index 9b21c07..edf3f53 100644
--- a/gcc/params.def
+++ b/gcc/params.def
@@ -1140,6 +1140,21 @@ DEFPARAM (PARAM_CHKP_MAX_CTOR_SIZE,
 	  "Maximum number of statements to be included into a single static "
 	  "constructor generated by Pointer Bounds Checker",
 	  5000, 100, 0)
+
+DEFPARAM (PARAM_MAX_FSM_THREAD_PATH_INSNS,
+	  "max-fsm-thread-path-insns",
+	  "Maximum number of instructions to copy when duplicating blocks on a finite state automaton jump thread path",
+	  100, 1, 999999)
+
+DEFPARAM (PARAM_MAX_FSM_THREAD_LENGTH,
+	  "max-fsm-thread-length",
+	  "Maximum number of basic blocks on a finite state automaton jump thread path",
+	  10, 1, 999999)
+
+DEFPARAM (PARAM_MAX_FSM_THREAD_PATHS,
+	  "max-fsm-thread-paths",
+	  "Maximum number of new jump thread paths to create for a finite state automaton",
+	  50, 1, 999999)
 /*
 
 Local variables:
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
new file mode 100644
index 0000000..bb34a74
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
@@ -0,0 +1,43 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-dom1-details" } */
+/* { dg-final { scan-tree-dump-times "FSM" 6 "dom1" } } */
+/* { dg-final { cleanup-tree-dump "dom1" } } */
+
+int sum0, sum1, sum2, sum3;
+int foo (char *s, char **ret)
+{
+  int state=0;
+  char c;
+
+  for (; *s && state != 4; s++)
+    {
+      c = *s;
+      if (c == '*')
+	{
+	  s++;
+	  break;
+	}
+      switch (state)
+	{
+	case 0:
+	  if (c == '+')
+	    state = 1;
+	  else if (c != '-')
+	    sum0+=c;
+	  break;
+	case 1:
+	  if (c == '+')
+	    state = 2;
+	  else if (c == '-')
+	    state = 0;
+	  else
+	    sum1+=c;
+	  break;
+	default:
+	  break;
+	}
+
+    }
+  *ret = s;
+  return state;
+}
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c
new file mode 100644
index 0000000..21474f0
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c
@@ -0,0 +1,127 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-dom1-details" } */
+/* { dg-final { scan-tree-dump-times "FSM" 19 "dom1" } } */
+/* { dg-final { cleanup-tree-dump "dom1" } } */
+
+enum STATE {
+  S0=0,
+  SI,
+  S1,
+  S2,
+  S3,
+  S4,
+  S5,
+  S6
+};
+
+int bar (enum STATE s);
+
+enum STATE foo (unsigned char **y, unsigned *c)
+{
+  unsigned char *x = *y;
+  unsigned char n;
+  enum STATE s = S0;
+
+  for( ; *x && s != SI; x++ )
+    {
+      n = *x;
+      if (n == 'x')
+	{
+	  x++;
+	  break;
+	}
+      switch(s)
+	{
+	case S0:
+	  if(bar(n))
+	    s = S3;
+	  else if( n == 'a' || n == 'b' )
+	    s = S1;
+	  else if( n == 'c' )
+	    s = S4;
+	  else
+	    {
+	      s = SI;
+	      c[SI]++;
+	    }
+	  c[S0]++;
+	  break;
+	case S1:
+	  if(bar(n))
+	    {
+	      s = S3;
+	      c[S1]++;
+	    }
+	  else if( n == 'c' )
+	    {
+	      s = S4;
+	      c[S1]++;
+	    }
+	  else
+	    {
+	      s = SI;
+	      c[S1]++;
+	    }
+	  break;
+	case S3:
+	  if( n == 'c' )
+	    {
+	      s = S4;
+	      c[S3]++;
+	    }
+	  else if(!bar(n))
+	    {
+	      s = SI;
+	      c[S3]++;
+	    }
+	  break;
+	case S4:
+	  if( n == 'E' || n == 'e' )
+	    {
+	      s = S2;
+	      c[S4]++;
+	    }
+	  else if(!bar(n))
+	    {
+	      s = SI;
+	      c[S4]++;
+	    }
+	  break;
+	case S2:
+	  if( n == 'a' || n == 'b' )
+	    {
+	      s = S5;
+	      c[S2]++;
+	    }
+	  else
+	    {
+	      s = SI;
+	      c[S2]++;
+	    }
+	  break;
+	case S5:
+	  if(bar(n))
+	    {
+	      s = S6;
+	      c[S5]++;
+	    }
+	  else
+	    {
+	      s = SI;
+	      c[S5]++;
+	    }
+	  break;
+	case S6:
+	  if(!bar(n))
+	    {
+	      s = SI;
+	      c[SI]++;
+	    }
+	  break;
+	default:
+	  break;
+	}
+    }
+  *y=x;
+  return s;
+}
diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
index 0a8d7a9..a4ac9d8 100644
--- a/gcc/tree-cfg.c
+++ b/gcc/tree-cfg.c
@@ -2666,7 +2666,7 @@ reinstall_phi_args (edge new_edge, edge old_edge)
    near its "logical" location.  This is of most help to humans looking
    at debugging dumps.  */
 
-static basic_block
+basic_block
 split_edge_bb_loc (edge edge_in)
 {
   basic_block dest = edge_in->dest;
diff --git a/gcc/tree-cfg.h b/gcc/tree-cfg.h
index d35e5ba..834fa71 100644
--- a/gcc/tree-cfg.h
+++ b/gcc/tree-cfg.h
@@ -67,6 +67,7 @@ extern void verify_gimple_in_cfg (struct function *, bool);
 extern tree gimple_block_label (basic_block);
 extern void add_phi_args_after_copy_bb (basic_block);
 extern void add_phi_args_after_copy (basic_block *, unsigned, edge);
+extern basic_block split_edge_bb_loc (edge);
 extern bool gimple_duplicate_sese_region (edge, edge, basic_block *, unsigned,
 					basic_block *, bool);
 extern bool gimple_duplicate_sese_tail (edge, edge, basic_block *, unsigned,
diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c
index 8b0b7b8..c9fe212 100644
--- a/gcc/tree-ssa-threadedge.c
+++ b/gcc/tree-ssa-threadedge.c
@@ -56,6 +56,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "params.h"
 #include "tree-ssa-threadedge.h"
 #include "builtins.h"
+#include "cfg.h"
+#include "cfganal.h"
 
 /* To avoid code explosion due to jump threading, we limit the
    number of statements we are going to copy.  This variable
@@ -661,6 +663,7 @@ simplify_control_stmt_condition (edge e,
      rather than use a relational operator.  These are simpler to handle.  */
   if (TREE_CODE (cond) == SSA_NAME)
     {
+      tree original_lhs = cond;
       cached_lhs = cond;
 
       /* Get the variable's current value from the equivalence chains.
@@ -689,6 +692,12 @@ simplify_control_stmt_condition (edge e,
 	 pass specific callback to try and simplify it further.  */
       if (cached_lhs && ! is_gimple_min_invariant (cached_lhs))
         cached_lhs = (*simplify) (stmt, stmt);
+
+      /* We couldn't find an invariant.  But, callers of this
+	 function may be able to do something useful with the
+	 unmodified destination.  */
+      if (!cached_lhs)
+	cached_lhs = original_lhs;
     }
   else
     cached_lhs = NULL;
@@ -948,6 +957,188 @@ thread_around_empty_blocks (edge taken_edge,
   return false;
 }
 
+/* Return true if there is at least one path from START_BB to END_BB.
+   When a path is found, record in PATH the blocks from END_BB to START_BB.
+   VISITED_BBS is used to make sure we don't fall into an infinite loop.  */
+
+static bool
+fsm_find_thread_path (basic_block start_bb, basic_block end_bb,
+		      vec<basic_block, va_gc> *&path,
+		      hash_set<basic_block> *visited_bbs, int n_insns)
+{
+  if (start_bb == end_bb)
+    {
+      vec_safe_push (path, start_bb);
+      return true;
+    }
+
+  if (!visited_bbs->add (start_bb))
+    {
+      edge e;
+      edge_iterator ei;
+      FOR_EACH_EDGE (e, ei, start_bb->succs)
+	if (fsm_find_thread_path (e->dest, end_bb, path, visited_bbs, n_insns))
+	  {
+	    vec_safe_push (path, start_bb);
+	    return true;
+	  }
+    }
+
+  return false;
+}
+
+static int max_threaded_paths;
+
+/* We trace the value of the variable EXPR back through any phi nodes looking
+   for places where it gets a constant value and save the path.  Stop after
+   having recorded MAX_FSM_THREAD_PATHS jump threading paths.  */
+
+static void
+fsm_find_control_statement_thread_paths (tree expr,
+					 hash_set<gimple> *visited_phis,
+					 vec<basic_block, va_gc> *&path)
+{
+  tree var = SSA_NAME_VAR (expr);
+  gimple def_stmt = SSA_NAME_DEF_STMT (expr);
+  basic_block var_bb = gimple_bb (def_stmt);
+
+  if (var == NULL || var_bb == NULL)
+    return;
+
+  vec<basic_block, va_gc> *next_path;
+  vec_alloc (next_path, n_basic_blocks_for_fn (cfun));
+
+  basic_block last_bb_in_path = path->last ();
+
+  /* Put the path from var_bb to last_bb_in_path into next_path.  */
+  if (var_bb != last_bb_in_path)
+    {
+      edge e;
+      int e_count = 0;
+      edge_iterator ei;
+
+      FOR_EACH_EDGE (e, ei, last_bb_in_path->preds)
+	{
+	  hash_set<basic_block> *visited_bbs = new hash_set<basic_block>;
+
+	  if (fsm_find_thread_path (var_bb, e->src, next_path, visited_bbs, 0))
+	    ++e_count;
+
+	  delete visited_bbs;
+
+	  /* If there is more than one path, stop.  */
+	  if (e_count > 1)
+	    {
+	      vec_free (next_path);
+	      return;
+	    }
+	}
+    }
+
+  /* Visit PHI nodes once.  */
+  if (gimple_code (def_stmt) != GIMPLE_PHI
+      || visited_phis->add (def_stmt))
+    {
+      vec_free (next_path);
+      return;
+    }
+
+  gphi *phi = as_a <gphi *> (def_stmt);
+
+  /* Append all the nodes from next_path to path.  */
+  vec_safe_splice (path, next_path);
+  gcc_assert (path->last () == var_bb);
+
+  /* Iterate over the arguments of PHI.  */
+  unsigned int i;
+  for (i = 0; i < gimple_phi_num_args (phi); i++)
+    {
+      tree arg = gimple_phi_arg_def (phi, i);
+      basic_block bbi = gimple_phi_arg_edge (phi, i)->src;
+
+      /* Skip edges pointing outside the current loop.  */
+      if (!arg || var_bb->loop_father != bbi->loop_father)
+	continue;
+
+      /* Add BBI to the path.  */
+      vec_safe_push (path, bbi);
+
+      if (TREE_CODE (arg) == INTEGER_CST)
+	{
+	  int n = path->length ();
+
+	      /* A path with fewer than 3 nodes should not be jump-threaded.  */
+	  if (n > 2 && n < PARAM_VALUE (PARAM_MAX_FSM_THREAD_LENGTH)
+	      && max_threaded_paths > 0)
+	    {
+	      int n_insns = 0;
+	      gimple_stmt_iterator gsi;
+	      int j;
+	      loop_p loop = (*path)[0]->loop_father;
+	      bool path_crosses_loops = false;
+
+	      for (j = 1; j < n - 1; j++)
+		{
+		  basic_block bb = (*path)[j];
+		  if (bb->loop_father != loop)
+		    {
+		      path_crosses_loops = true;
+		      break;
+		    }
+		  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
+		       gsi_next (&gsi))
+		    ++n_insns;
+		}
+
+	      if (!path_crosses_loops
+		  && n_insns < PARAM_VALUE (PARAM_MAX_FSM_THREAD_PATH_INSNS))
+		{
+		  vec<jump_thread_edge *> *jump_thread_path
+		    = new vec<jump_thread_edge *> ();
+		  int joiners = 0;
+
+		  for (j = 0; j < n - 1; j++)
+		    {
+		      edge e = find_edge ((*path)[n - j - 1],
+					  (*path)[n - j - 2]);
+		      gcc_assert (e);
+		      enum jump_thread_edge_type kind;
+
+		      if (j == 0)
+			kind = EDGE_START_FSM_THREAD;
+		      else if (single_pred_p (e->src))
+			kind = EDGE_NO_COPY_SRC_BLOCK;
+		      else
+			{
+			  kind = EDGE_COPY_SRC_JOINER_BLOCK;
+			  ++joiners;
+			}
+
+		      jump_thread_edge *x = new jump_thread_edge (e, kind);
+		      jump_thread_path->safe_push (x);
+		    }
+
+		  /* Add the edge taken when the control variable has value ARG.  */
+		  edge taken_edge = find_taken_edge ((*path)[0], arg);
+		  jump_thread_edge *x
+		    = new jump_thread_edge (taken_edge, EDGE_NO_COPY_SRC_BLOCK);
+		  jump_thread_path->safe_push (x);
+
+		  register_jump_thread (jump_thread_path);
+		  --max_threaded_paths;
+		}
+	    }
+	}
+      else if (TREE_CODE (arg) == SSA_NAME)
+	fsm_find_control_statement_thread_paths (arg, visited_phis, path);
+
+      /* Remove BBI from the path.  */
+      path->pop ();
+    }
+
+  /* Remove all the nodes that we added from next_path.  */
+  vec_safe_truncate (path, (path->length () - next_path->length ()));
+  vec_free (next_path);
+}
+
 /* We are exiting E->src, see if E->dest ends with a conditional
    jump which has a known value when reached via E.
 
@@ -1033,7 +1224,10 @@ thread_through_normal_block (edge e,
       cond = simplify_control_stmt_condition (e, stmt, dummy_cond, simplify,
 					      handle_dominating_asserts);
 
-      if (cond && is_gimple_min_invariant (cond))
+      if (!cond)
+	return 0;
+
+      if (is_gimple_min_invariant (cond))
 	{
 	  edge taken_edge = find_taken_edge (e->dest, cond);
 	  basic_block dest = (taken_edge ? taken_edge->dest : NULL);
@@ -1079,6 +1273,25 @@ thread_through_normal_block (edge e,
 				      backedge_seen_p);
 	  return 1;
 	}
+
+      if (TREE_CODE (cond) != SSA_NAME
+	  || e->dest->loop_father != e->src->loop_father
+	  || loop_depth (e->dest->loop_father) == 0)
+	return 0;
+
+      /* When COND cannot be simplified, try to find paths from a control
+	 statement back through the PHI nodes which would affect that control
+	 statement.  */
+      vec<basic_block, va_gc> *bb_path;
+      vec_alloc (bb_path, n_basic_blocks_for_fn (cfun));
+      vec_safe_push (bb_path, e->dest);
+      hash_set<gimple> *visited_phis = new hash_set<gimple>;
+
+      max_threaded_paths = PARAM_VALUE (PARAM_MAX_FSM_THREAD_PATHS);
+      fsm_find_control_statement_thread_paths (cond, visited_phis, bb_path);
+
+      delete visited_phis;
+      vec_free (bb_path);
     }
   return 0;
 }
diff --git a/gcc/tree-ssa-threadupdate.c b/gcc/tree-ssa-threadupdate.c
index ca0b8bf..022f399 100644
--- a/gcc/tree-ssa-threadupdate.c
+++ b/gcc/tree-ssa-threadupdate.c
@@ -167,8 +167,9 @@ dump_jump_thread_path (FILE *dump_file, vec<jump_thread_edge *> path,
 		       bool registering)
 {
   fprintf (dump_file,
-	   "  %s jump thread: (%d, %d) incoming edge; ",
+	   "  %s%s jump thread: (%d, %d) incoming edge; ",
 	   (registering ? "Registering" : "Cancelling"),
+	   (path[0]->type == EDGE_START_FSM_THREAD ? " FSM" : ""),
 	   path[0]->e->src->index, path[0]->e->dest->index);
 
   for (unsigned int i = 1; i < path.length (); i++)
@@ -2317,6 +2318,155 @@ bb_ends_with_multiway_branch (basic_block bb ATTRIBUTE_UNUSED)
   return false;
 }
 
+/* Verify that the REGION is a Single Entry Multiple Exits region: make sure no
+   edge other than ENTRY is entering the REGION.  */
+
+DEBUG_FUNCTION void
+verify_seme (edge entry, basic_block *region, unsigned n_region)
+{
+  bitmap bbs = BITMAP_ALLOC (NULL);
+
+  for (unsigned i = 0; i < n_region; i++)
+    bitmap_set_bit (bbs, region[i]->index);
+
+  for (unsigned i = 0; i < n_region; i++)
+    {
+      edge e;
+      edge_iterator ei;
+      basic_block bb = region[i];
+
+      /* All predecessors other than ENTRY->src should be in the region.  */
+      for (ei = ei_start (bb->preds); (e = ei_safe_edge (ei)); ei_next (&ei))
+	if (e != entry)
+	  gcc_assert (bitmap_bit_p (bbs, e->src->index));
+    }
+
+  BITMAP_FREE (bbs);
+}
+
+/* Duplicates a Single Entry Multiple Exit REGION (set of N_REGION basic
+   blocks).  The ENTRY edge is redirected to the duplicate of the region.  If
+   REGION is not a Single Entry region, ignore any incoming edges other than
+   ENTRY: this makes the copied region a Single Entry region.
+
+   Remove the last conditional statement in the last basic block in the REGION,
+   and create a single fallthru edge pointing to the same destination as the
+   EXIT edge.
+
+   The new basic blocks are stored to REGION_COPY in the same order as they had
+   in REGION, provided that REGION_COPY is not NULL.
+
+   Returns false if it is unable to copy the region, true otherwise.  */
+
+static bool
+duplicate_seme_region (edge entry, edge exit,
+		       basic_block *region, unsigned n_region,
+		       basic_block *region_copy)
+{
+  unsigned i;
+  bool free_region_copy = false, copying_header = false;
+  struct loop *loop = entry->dest->loop_father;
+  edge exit_copy;
+  edge redirected;
+  int total_freq = 0, entry_freq = 0;
+  gcov_type total_count = 0, entry_count = 0;
+
+  if (!can_copy_bbs_p (region, n_region))
+    return false;
+
+  /* Some sanity checking.  Note that we do not check for all possible
+     misuses of the functions.  I.e. if you ask to copy something weird,
+     it will work, but the state of structures probably will not be
+     correct.  */
+  for (i = 0; i < n_region; i++)
+    {
+      /* We do not handle subloops, i.e. all the blocks must belong to the
+	 same loop.  */
+      if (region[i]->loop_father != loop)
+	return false;
+    }
+
+  initialize_original_copy_tables ();
+
+  if (copying_header)
+    set_loop_copy (loop, loop_outer (loop));
+  else
+    set_loop_copy (loop, loop);
+
+  if (!region_copy)
+    {
+      region_copy = XNEWVEC (basic_block, n_region);
+      free_region_copy = true;
+    }
+
+  if (entry->dest->count)
+    {
+      total_count = entry->dest->count;
+      entry_count = entry->count;
+      /* Fix up corner cases, to avoid division by zero or creation of negative
+	 frequencies.  */
+      if (entry_count > total_count)
+	entry_count = total_count;
+    }
+  else
+    {
+      total_freq = entry->dest->frequency;
+      entry_freq = EDGE_FREQUENCY (entry);
+      /* Fix up corner cases, to avoid division by zero or creation of negative
+	 frequencies.  */
+      if (total_freq == 0)
+	total_freq = 1;
+      else if (entry_freq > total_freq)
+	entry_freq = total_freq;
+    }
+
+  copy_bbs (region, n_region, region_copy, &exit, 1, &exit_copy, loop,
+	    split_edge_bb_loc (entry), 0);
+  if (total_count)
+    {
+      scale_bbs_frequencies_gcov_type (region, n_region,
+				       total_count - entry_count,
+				       total_count);
+      scale_bbs_frequencies_gcov_type (region_copy, n_region, entry_count,
+				       total_count);
+    }
+  else
+    {
+      scale_bbs_frequencies_int (region, n_region, total_freq - entry_freq,
+				 total_freq);
+      scale_bbs_frequencies_int (region_copy, n_region, entry_freq, total_freq);
+    }
+
+#ifdef ENABLE_CHECKING
+  /* Make sure no edge other than ENTRY is entering the copied region.  */
+  verify_seme (entry, region_copy, n_region);
+#endif
+
+  /* Remove the last branch in the jump thread path.  */
+  remove_ctrl_stmt_and_useless_edges (region_copy[n_region - 1], exit->dest);
+  edge e = make_edge (region_copy[n_region - 1], exit->dest, EDGE_FALLTHRU);
+
+  if (e)
+    {
+      rescan_loop_exit (e, true, false);
+      e->probability = REG_BR_PROB_BASE;
+      e->count = region_copy[n_region - 1]->count;
+    }
+
+  /* Redirect the entry and add the phi node arguments.  */
+  redirected = redirect_edge_and_branch (entry, get_bb_copy (entry->dest));
+  gcc_assert (redirected != NULL);
+  flush_pending_stmts (entry);
+
+  /* Add the other PHI node arguments.  */
+  add_phi_args_after_copy (region_copy, n_region, NULL);
+
+  if (free_region_copy)
+    free (region_copy);
+
+  free_original_copy_tables ();
+  return true;
+}
+
 /* Walk through all blocks and thread incoming edges to the appropriate
    outgoing edge for each edge pair recorded in THREADED_EDGES.
 
@@ -2343,6 +2493,55 @@ thread_through_all_blocks (bool may_peel_loop_headers)
   threaded_blocks = BITMAP_ALLOC (NULL);
   memset (&thread_stats, 0, sizeof (thread_stats));
 
+  for (i = 0; i < paths.length ();)
+    {
+      vec<jump_thread_edge *> *path = paths[i];
+      edge entry = (*path)[0]->e;
+
+      if ((*path)[0]->type != EDGE_START_FSM_THREAD
+	  /* Do not jump-thread twice from the same block.  */
+	  || bitmap_bit_p (threaded_blocks, entry->src->index))
+	{
+	  i++;
+	  continue;
+	}
+
+      unsigned len = path->length ();
+      edge exit = (*path)[len - 1]->e;
+      basic_block *region = XNEWVEC (basic_block, len - 1);
+
+      for (unsigned int j = 0; j < len - 1; j++)
+	region[j] = (*path)[j]->e->dest;
+
+      bool success = duplicate_seme_region (entry, exit, region,
+					    len - 1, NULL);
+      if (success)
+	{
+	  /* We do not update dominance info.  */
+	  free_dominance_info (CDI_DOMINATORS);
+	  bitmap_set_bit (threaded_blocks, entry->src->index);
+	}
+
+      delete_jump_thread_path (path);
+      paths.unordered_remove (i);
+    }
+
+  for (i = 0; i < paths.length ();)
+    {
+      vec<jump_thread_edge *> *path = paths[i];
+      edge entry = (*path)[0]->e;
+
+      /* Do not jump-thread twice from the same block.  */
+      if (bitmap_bit_p (threaded_blocks, entry->src->index))
+	{
+	  delete_jump_thread_path (path);
+	  paths.unordered_remove (i);
+	}
+      else
+	i++;
+    }
+
+  bitmap_clear (threaded_blocks);
+
   mark_threaded_blocks (threaded_blocks);
 
   initialize_original_copy_tables ();
diff --git a/gcc/tree-ssa-threadupdate.h b/gcc/tree-ssa-threadupdate.h
index 426aca5..42c3a9e 100644
--- a/gcc/tree-ssa-threadupdate.h
+++ b/gcc/tree-ssa-threadupdate.h
@@ -26,6 +26,7 @@ extern bool thread_through_all_blocks (bool);
 enum jump_thread_edge_type
 {
   EDGE_START_JUMP_THREAD,
+  EDGE_START_FSM_THREAD,
   EDGE_COPY_SRC_BLOCK,
   EDGE_COPY_SRC_JOINER_BLOCK,
   EDGE_NO_COPY_SRC_BLOCK
-- 
1.7.10.4


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-11-25 21:54                                   ` Sebastian Pop
@ 2014-12-01 21:06                                     ` Jeff Law
  2014-12-02 10:15                                       ` Richard Biener
                                                         ` (2 more replies)
  0 siblings, 3 replies; 54+ messages in thread
From: Jeff Law @ 2014-12-01 21:06 UTC (permalink / raw)
  To: Sebastian Pop; +Cc: Richard Biener, James Greenhalgh, Steve Ellcey, GCC Patches

On 11/25/14 14:16, Sebastian Pop wrote:
> Sebastian Pop wrote:
>> >I will bootstrap and regression test this patch on x86_64-linux and
>> >powerpc64-linux.  I will also run it on our internal benchmarks, coremark, and
>> >the llvm test-suite.
>> >
>> >I will also include a longer testcase that makes sure we do not regress on
>> >coremark.
> Done all the above.  Attached is the new patch with a new testcase.  I have also
> added verify_seme inspired by the recent patch adding verify_sese.
>
> Sebastian
>
>
> 0001-extend-jump-thread-for-finite-state-automata-PR-5474.patch
>
>
>  From ca222d5222fb976c7aa258d3e3c04e593f42f7a2 Mon Sep 17 00:00:00 2001
> From: Sebastian Pop<s.pop@samsung.com>
> Date: Fri, 26 Sep 2014 14:54:20 -0500
> Subject: [PATCH] extend jump thread for finite state automata PR 54742
>
> Adapted from a patch from James Greenhalgh.
>
> 	* params.def (max-fsm-thread-path-insns, max-fsm-thread-length,
> 	max-fsm-thread-paths): New.
>
> 	* doc/invoke.texi (max-fsm-thread-path-insns, max-fsm-thread-length,
> 	max-fsm-thread-paths): Documented.
>
> 	* tree-cfg.c (split_edge_bb_loc): Export.
> 	* tree-cfg.h (split_edge_bb_loc): Declared extern.
>
> 	* tree-ssa-threadedge.c (simplify_control_stmt_condition): Restore the
> 	original value of cond when simplification fails.
> 	(fsm_find_thread_path): New.
> 	(fsm_find_control_statement_thread_paths): New.
> 	(thread_through_normal_block): Call fsm_find_control_statement_thread_paths.
>
> 	* tree-ssa-threadupdate.c (dump_jump_thread_path): Pretty print
> 	EDGE_START_FSM_THREAD.
> 	(verify_seme): New.
> 	(duplicate_seme_region): New.
> 	(thread_through_all_blocks): Generate code for EDGE_START_FSM_THREAD edges
> 	calling gimple_duplicate_sese_region.
>
> 	* tree-ssa-threadupdate.h (jump_thread_edge_type): Add EDGE_START_FSM_THREAD.
>
> 	* testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c: New.
> 	* testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c: New.
> ---
>   gcc/doc/invoke.texi                              |   12 ++
>   gcc/params.def                                   |   15 ++
>   gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c |   43 +++++
>   gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c |  127 +++++++++++++
>   gcc/tree-cfg.c                                   |    2 +-
>   gcc/tree-cfg.h                                   |    1 +
>   gcc/tree-ssa-threadedge.c                        |  215 +++++++++++++++++++++-
>   gcc/tree-ssa-threadupdate.c                      |  201 +++++++++++++++++++-
>   gcc/tree-ssa-threadupdate.h                      |    1 +
>   9 files changed, 614 insertions(+), 3 deletions(-)
>   create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
>   create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c
>
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 89edddb..074183f 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -10624,6 +10624,18 @@ large and significantly increase compile time at optimization level
>   @option{-O1} and higher.  This parameter is a maximum nubmer of statements
>   in a single generated constructor.  Default value is 5000.
>
> +@item max-fsm-thread-path-insns
> +Maximum number of instructions to copy when duplicating blocks on a
> +finite state automaton jump thread path.  The default is 100.
> +
> +@item max-fsm-thread-length
> +Maximum number of basic blocks on a finite state automaton jump thread
> +path.  The default is 10.
> +
> +@item max-fsm-thread-paths
> +Maximum number of new jump thread paths to create for a finite state
> +automaton.  The default is 50.
Has there been any tuning on these defaults?  I don't have any strong 
opinions about what they ought to be; this is more to get any such 
information recorded on the lists for historical purposes.

I think it's worth a note in the debug dump anytime you abort threading 
when you hit a limit.

I'm a bit worried about the compile-time impact of all the recursion, 
but I'm willing to wait and see if it turns out to be a problem in practice.


> diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c
> index 8b0b7b8..c9fe212 100644
> --- a/gcc/tree-ssa-threadedge.c
> +++ b/gcc/tree-ssa-threadedge.c
> @@ -56,6 +56,8 @@ along with GCC; see the file COPYING3.  If not see
>   #include "params.h"
>   #include "tree-ssa-threadedge.h"
>   #include "builtins.h"
> +#include "cfg.h"
> +#include "cfganal.h"
>
>   /* To avoid code explosion due to jump threading, we limit the
>      number of statements we are going to copy.  This variable
> @@ -661,6 +663,7 @@ simplify_control_stmt_condition (edge e,
>        rather than use a relational operator.  These are simpler to handle.  */
>     if (TREE_CODE (cond) == SSA_NAME)
>       {
> +      tree original_lhs = cond;
>         cached_lhs = cond;
>
>         /* Get the variable's current value from the equivalence chains.
> @@ -689,6 +692,12 @@ simplify_control_stmt_condition (edge e,
>   	 pass specific callback to try and simplify it further.  */
>         if (cached_lhs && ! is_gimple_min_invariant (cached_lhs))
>           cached_lhs = (*simplify) (stmt, stmt);
> +
> +      /* We couldn't find an invariant.  But, callers of this
> +	 function may be able to do something useful with the
> +	 unmodified destination.  */
> +      if (!cached_lhs)
> +	cached_lhs = original_lhs;
>       }
>     else
>       cached_lhs = NULL;
Can't you just use COND rather than stuffing its value away into 
ORIGINAL_LHS?    CACHED_LHS may be better in some cases if it's an 
SSA_NAME (and it should be), but I doubt it matters in practice.

Or is it the case that you have to have the original condition -- 
without any context sensitive equivalences used to "simplify" the condition.
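The fallback being questioned here has a simple shape: try to reduce the
condition to an invariant, and if every simplification attempt fails, hand the
caller the original condition instead of nothing.  A minimal Python sketch of
that pattern (the names and the `simplify` callback are illustrative, not GCC
API):

```python
def simplify_control_condition(cond, simplify):
    """Model of the fallback under review: attempt to simplify COND to
    an invariant; when simplification fails (returns None), return the
    original condition so callers can still do something useful with it."""
    original = cond
    simplified = simplify(cond)  # returns None when no invariant is found
    return simplified if simplified is not None else original
```

With this shape, a caller such as the FSM path search always receives a
non-null condition to start from, which is the behavior the patch adds.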


> @@ -948,6 +957,188 @@ thread_around_empty_blocks (edge taken_edge,
>     return false;
>   }
>
> +/* Return true if there is at least one path from START_BB to END_BB.
> +   VISITED_BBS is used to make sure we don't fall into an infinite loop.  */
> +
> +static bool
> +fsm_find_thread_path (basic_block start_bb, basic_block end_bb,
> +		      vec<basic_block, va_gc> *&path,
> +		      hash_set<basic_block> *visited_bbs, int n_insns)
> +{
> +  if (start_bb == end_bb)
> +    {
> +      vec_safe_push (path, start_bb);
> +      return true;
> +    }
> +
> +  if (!visited_bbs->add (start_bb))
> +    {
> +      edge e;
> +      edge_iterator ei;
> +      FOR_EACH_EDGE (e, ei, start_bb->succs)
> +	if (fsm_find_thread_path (e->dest, end_bb, path, visited_bbs, n_insns))
> +	  {
> +	    vec_safe_push (path, start_bb);
> +	    return true;
> +	  }
> +    }
> +
> +  return false;
Update comment to indicate how PATH is used to return a path from 
START_BB to END_BB.
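For reference, the recursive search quoted above can be modeled compactly in
Python; this is a sketch under the assumption of a plain successor map, not
GCC's edge/basic-block API.  Note how PATH comes back in reverse order,
END_BB first and START_BB last, which is exactly what the updated comment
should state:

```python
def fsm_find_thread_path(start, end, succs, path, visited):
    """Depth-first search for a path from START to END over the SUCCS
    adjacency map.  On success the blocks are appended to PATH in
    reverse order (END first, START last).  VISITED guards against
    cycles, mirroring visited_bbs->add in the patch."""
    if start == end:
        path.append(start)
        return True
    if start not in visited:
        visited.add(start)  # only recurse through blocks seen for the first time
        for dest in succs.get(start, ()):
            if fsm_find_thread_path(dest, end, succs, path, visited):
                path.append(start)
                return True
    return False
```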



> +		  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
> +		       gsi_next (&gsi))
> +		    ++n_insns;
Probably don't want to count labels and GIMPLE_NOPs.  Probably do want 
to count non-virtual PHIs since those may end up as a copies or constant 
initializations.
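The counting rule being suggested, skip labels and no-ops, count real
statements, and count only non-virtual PHIs, can be sketched as follows.
This is an illustrative model over tagged statements, not the GIMPLE
iterator API:

```python
def count_copied_insns(stmts):
    """Estimate how many statements block duplication would copy.
    STMTS is a list of (kind, is_virtual) pairs.  Labels and no-ops
    cost nothing; virtual PHIs are skipped; non-virtual PHIs count,
    since they may become copies or constant initializations."""
    n = 0
    for kind, is_virtual in stmts:
        if kind in ("label", "nop"):
            continue
        if kind == "phi" and is_virtual:
            continue
        n += 1
    return n
```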

> +		      if (j == 0)
> +			kind = EDGE_START_FSM_THREAD;
> +		      else if (single_pred_p (e->src))
> +			kind = EDGE_NO_COPY_SRC_BLOCK;
> +		      else {
> +			kind = EDGE_COPY_SRC_JOINER_BLOCK;
> +			++joiners;
> +		      }
Presumably the mis-formatting was added when you tracked the # joiners. 
  AFAICT that is a write-only variable and ought to be removed.  Along 
with the braces on the final ELSE which should restore proper formatting.
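Once the write-only counter is gone, the classification reduces to a
three-way choice per edge on the path.  A hypothetical Python model of that
logic (edge kinds and the single-predecessor test are stand-ins for the GCC
enums and predicates):

```python
def classify_path_edges(path_edges, single_pred):
    """Assign a jump-thread edge kind to each edge on an FSM path:
    the first edge starts the thread, edges whose source block has a
    single predecessor need no copy, and everything else is a joiner
    block that must be copied."""
    kinds = []
    for j, e in enumerate(path_edges):
        if j == 0:
            kinds.append("EDGE_START_FSM_THREAD")
        elif single_pred(e):
            kinds.append("EDGE_NO_COPY_SRC_BLOCK")
        else:
            kinds.append("EDGE_COPY_SRC_JOINER_BLOCK")
    return kinds
```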


> @@ -2343,6 +2493,55 @@ thread_through_all_blocks (bool may_peel_loop_headers)
>     threaded_blocks = BITMAP_ALLOC (NULL);
>     memset (&thread_stats, 0, sizeof (thread_stats));
>
> +  for (i = 0; i < paths.length ();)
Comment before this loop.  I can see what you're doing, but I'm already 
very familiar with this code.  Basically what are you looking for in 
this loop and what do you do?

Overall I think this is very very close and I really like the overall 
direction.  There's a few minor issues noted above and with those 
addressed, I think we should be ready to go.

Looking further out....


Removing most of tree-ssa-threadupdate.c and using SEME duplication 
would be a huge step forward for making this code more understandable. I 
look forward to any work you do in this space in the future.

Similarly moving towards a backwards dataflow driven model is definitely 
on my long term plan for this code.  Ideally with some kind of knob that 
says "optimize the trivial jump threads you can find and do so very 
quickly" (say by restricting the lookup to a single block) and a more 
expensive version.

The simple version could run early which would solve some problems Jan 
has run into.  Running the simple version early would also help DOM/VRP.

Ideally I want to disentangle threading from VRP and DOM -- most 
threading opportunities are fairly simple to find and exploit.  Yet 
right now we have to run DOM or VRP which are insanely expensive.

Jeff

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-12-01 21:06                                     ` Jeff Law
@ 2014-12-02 10:15                                       ` Richard Biener
  2014-12-02 20:17                                         ` Jeff Law
  2014-12-04  8:38                                       ` Sebastian Pop
  2014-12-04 11:04                                       ` Sebastian Pop
  2 siblings, 1 reply; 54+ messages in thread
From: Richard Biener @ 2014-12-02 10:15 UTC (permalink / raw)
  To: Jeff Law; +Cc: Sebastian Pop, James Greenhalgh, Steve Ellcey, GCC Patches

On Mon, Dec 1, 2014 at 10:06 PM, Jeff Law <law@redhat.com> wrote:
> On 11/25/14 14:16, Sebastian Pop wrote:
>>
>> Sebastian Pop wrote:
>>>
>>> >I will bootstrap and regression test this patch on x86_64-linux and
>>> >powerpc64-linux.  I will also run it on our internal benchmarks,
>>> > coremark, and
>>> >the llvm test-suite.
>>> >
>>> >I will also include a longer testcase that makes sure we do not regress
>>> > on
>>> >coremark.
>>
>> Done all the above.  Attached is the new patch with a new testcase.  I
>> have also
>> added verify_seme inspired by the recent patch adding verify_sese.
>>
>> Sebastian
>>
>>
>> 0001-extend-jump-thread-for-finite-state-automata-PR-5474.patch
>>
>>
>>  From ca222d5222fb976c7aa258d3e3c04e593f42f7a2 Mon Sep 17 00:00:00 2001
>> From: Sebastian Pop<s.pop@samsung.com>
>> Date: Fri, 26 Sep 2014 14:54:20 -0500
>> Subject: [PATCH] extend jump thread for finite state automata PR 54742
>>
>> Adapted from a patch from James Greenhalgh.
>>
>>         * params.def (max-fsm-thread-path-insns, max-fsm-thread-length,
>>         max-fsm-thread-paths): New.
>>
>>         * doc/invoke.texi (max-fsm-thread-path-insns,
>> max-fsm-thread-length,
>>         max-fsm-thread-paths): Documented.
>>
>>         * tree-cfg.c (split_edge_bb_loc): Export.
>>         * tree-cfg.h (split_edge_bb_loc): Declared extern.
>>
>>         * tree-ssa-threadedge.c (simplify_control_stmt_condition): Restore
>> the
>>         original value of cond when simplification fails.
>>         (fsm_find_thread_path): New.
>>         (fsm_find_control_statement_thread_paths): New.
>>         (fsm_thread_through_normal_block): Call
>> find_control_statement_thread_paths.
>>
>>         * tree-ssa-threadupdate.c (dump_jump_thread_path): Pretty print
>>         EDGE_START_FSM_THREAD.
>>         (verify_seme): New.
>>         (duplicate_seme_region): New.
>>         (thread_through_all_blocks): Generate code for
>> EDGE_START_FSM_THREAD edges
>>         calling gimple_duplicate_sese_region.
>>
>>         * tree-ssa-threadupdate.h (jump_thread_edge_type): Add
>> EDGE_START_FSM_THREAD.
>>
>>         * testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c: New.
>>         * testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c: New.
>> ---
>>   gcc/doc/invoke.texi                              |   12 ++
>>   gcc/params.def                                   |   15 ++
>>   gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c |   43 +++++
>>   gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c |  127 +++++++++++++
>>   gcc/tree-cfg.c                                   |    2 +-
>>   gcc/tree-cfg.h                                   |    1 +
>>   gcc/tree-ssa-threadedge.c                        |  215
>> +++++++++++++++++++++-
>>   gcc/tree-ssa-threadupdate.c                      |  201
>> +++++++++++++++++++-
>>   gcc/tree-ssa-threadupdate.h                      |    1 +
>>   9 files changed, 614 insertions(+), 3 deletions(-)
>>   create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
>>   create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c
>>
>> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
>> index 89edddb..074183f 100644
>> --- a/gcc/doc/invoke.texi
>> +++ b/gcc/doc/invoke.texi
>> @@ -10624,6 +10624,18 @@ large and significantly increase compile time at
>> optimization level
>>   @option{-O1} and higher.  This parameter is a maximum nubmer of
>> statements
>>   in a single generated constructor.  Default value is 5000.
>>
>> +@item max-fsm-thread-path-insns
>> +Maximum number of instructions to copy when duplicating blocks on a
>> +finite state automaton jump thread path.  The default is 100.
>> +
>> +@item max-fsm-thread-length
>> +Maximum number of basic blocks on a finite state automaton jump thread
>> +path.  The default is 10.
>> +
>> +@item max-fsm-thread-paths
>> +Maximum number of new jump thread paths to create for a finite state
>> +automaton.  The default is 50.
>
> Has there been any tuning on these defaults?  I don't have any strong
> opinions about what they ought to be; this is more to get any such
> information recorded on the lists for historical purposes.
>
> I think it's worth a note in the debug dump anytime you abort threading when
> you hit a limit.
>
> I'm a bit worried about the compile-time impact of all the recursion, but
> I'm willing to wait and see if it turns out to be a problem in practice.

Please consider restricting it to -fexpensive-optimizations (-O2+).

Richard.

>
>> diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c
>> index 8b0b7b8..c9fe212 100644
>> --- a/gcc/tree-ssa-threadedge.c
>> +++ b/gcc/tree-ssa-threadedge.c
>> @@ -56,6 +56,8 @@ along with GCC; see the file COPYING3.  If not see
>>   #include "params.h"
>>   #include "tree-ssa-threadedge.h"
>>   #include "builtins.h"
>> +#include "cfg.h"
>> +#include "cfganal.h"
>>
>>   /* To avoid code explosion due to jump threading, we limit the
>>      number of statements we are going to copy.  This variable
>> @@ -661,6 +663,7 @@ simplify_control_stmt_condition (edge e,
>>        rather than use a relational operator.  These are simpler to
>> handle.  */
>>     if (TREE_CODE (cond) == SSA_NAME)
>>       {
>> +      tree original_lhs = cond;
>>         cached_lhs = cond;
>>
>>         /* Get the variable's current value from the equivalence chains.
>> @@ -689,6 +692,12 @@ simplify_control_stmt_condition (edge e,
>>          pass specific callback to try and simplify it further.  */
>>         if (cached_lhs && ! is_gimple_min_invariant (cached_lhs))
>>           cached_lhs = (*simplify) (stmt, stmt);
>> +
>> +      /* We couldn't find an invariant.  But, callers of this
>> +        function may be able to do something useful with the
>> +        unmodified destination.  */
>> +      if (!cached_lhs)
>> +       cached_lhs = original_lhs;
>>       }
>>     else
>>       cached_lhs = NULL;
>
> Can't you just use COND rather than stuffing its value away into
> ORIGINAL_LHS?    CACHED_LHS may be better in some cases if it's an SSA_NAME
> (and it should be), but I doubt it matters in practice.
>
> Or is it the case that you have to have the original condition -- without
> any context sensitive equivalences used to "simplify" the condition.
>
>
>> @@ -948,6 +957,188 @@ thread_around_empty_blocks (edge taken_edge,
>>     return false;
>>   }
>>
>> +/* Return true if there is at least one path from START_BB to END_BB.
>> +   VISITED_BBS is used to make sure we don't fall into an infinite loop.
>> */
>> +
>> +static bool
>> +fsm_find_thread_path (basic_block start_bb, basic_block end_bb,
>> +                     vec<basic_block, va_gc> *&path,
>> +                     hash_set<basic_block> *visited_bbs, int n_insns)
>> +{
>> +  if (start_bb == end_bb)
>> +    {
>> +      vec_safe_push (path, start_bb);
>> +      return true;
>> +    }
>> +
>> +  if (!visited_bbs->add (start_bb))
>> +    {
>> +      edge e;
>> +      edge_iterator ei;
>> +      FOR_EACH_EDGE (e, ei, start_bb->succs)
>> +       if (fsm_find_thread_path (e->dest, end_bb, path, visited_bbs,
>> n_insns))
>> +         {
>> +           vec_safe_push (path, start_bb);
>> +           return true;
>> +         }
>> +    }
>> +
>> +  return false;
>
> Update comment to indicate how PATH is used to return a path from START_BB
> to END_BB.
>
>
>
>> +                 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
>> +                      gsi_next (&gsi))
>> +                   ++n_insns;
>
> Probably don't want to count labels and GIMPLE_NOPs.  Probably do want to
> count non-virtual PHIs since those may end up as a copies or constant
> initializations.
>
>> +                     if (j == 0)
>> +                       kind = EDGE_START_FSM_THREAD;
>> +                     else if (single_pred_p (e->src))
>> +                       kind = EDGE_NO_COPY_SRC_BLOCK;
>> +                     else {
>> +                       kind = EDGE_COPY_SRC_JOINER_BLOCK;
>> +                       ++joiners;
>> +                     }
>
> Presumably the mis-formatting was added when you tracked the # joiners.
> AFAICT that is a write-only variable and ought to be removed.  Along with
> the braces on the final ELSE which should restore proper formatting.
>
>
>> @@ -2343,6 +2493,55 @@ thread_through_all_blocks (bool
>> may_peel_loop_headers)
>>     threaded_blocks = BITMAP_ALLOC (NULL);
>>     memset (&thread_stats, 0, sizeof (thread_stats));
>>
>> +  for (i = 0; i < paths.length ();)
>
> Comment before this loop.  I can see what you're doing, but I'm already very
> familiar with this code.  Basically what are you looking for in this loop
> and what do you do?
>
> Overall I think this is very very close and I really like the overall
> direction.  There's a few minor issues noted above and with those addressed,
> I think we should be ready to go.
>
> Looking further out....
>
>
> Removing most of tree-ssa-threadupdate.c and using SEME duplication would be
> a huge step forward for making this code more understandable. I look forward
> to any work you do in this space in the future.
>
> Similarly moving towards a backwards dataflow driven model is definitely on
> my long term plan for this code.  Ideally with some kind of knob that says
> "optimize the trivial jump threads you can find and do so very quickly" (say
> by restricting the lookup to a single block) and a more expensive version.
>
> The simple version could run early which would solve some problems Jan has
> run into.  Running the simple version early would also help DOM/VRP.
>
> Ideally I want to disentangle threading from VRP and DOM -- most threading
> opportunities are fairly simple to find and exploit.  Yet right now we have
> to run DOM or VRP which are insanely expensive.
>
> Jeff

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-12-02 10:15                                       ` Richard Biener
@ 2014-12-02 20:17                                         ` Jeff Law
  0 siblings, 0 replies; 54+ messages in thread
From: Jeff Law @ 2014-12-02 20:17 UTC (permalink / raw)
  To: Richard Biener; +Cc: Sebastian Pop, James Greenhalgh, Steve Ellcey, GCC Patches

On 12/02/14 03:15, Richard Biener wrote:
>>
>> I'm a bit worried about the compile-time impact of all the recursion, but
>> I'm willing to wait and see if it turns out to be a problem in practice.
>
> Please consider restricting it to -fexpensive-optimizations (-O2+).
Yea, let's go ahead and do that.

jeff

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-12-01 21:06                                     ` Jeff Law
  2014-12-02 10:15                                       ` Richard Biener
@ 2014-12-04  8:38                                       ` Sebastian Pop
  2014-12-04  9:14                                         ` Sebastian Pop
  2014-12-04 11:04                                       ` Sebastian Pop
  2 siblings, 1 reply; 54+ messages in thread
From: Sebastian Pop @ 2014-12-04  8:38 UTC (permalink / raw)
  To: Jeff Law; +Cc: Richard Biener, James Greenhalgh, Steve Ellcey, GCC Patches

[-- Attachment #1: Type: text/plain, Size: 5030 bytes --]

Jeff Law wrote:
> >+@item max-fsm-thread-path-insns
> >+Maximum number of instructions to copy when duplicating blocks on a
> >+finite state automaton jump thread path.  The default is 100.
> >+
> >+@item max-fsm-thread-length
> >+Maximum number of basic blocks on a finite state automaton jump thread
> >+path.  The default is 10.
> >+
> >+@item max-fsm-thread-paths
> >+Maximum number of new jump thread paths to create for a finite state
> >+automaton.  The default is 50.
> Has there been any tuning on these defaults?  I don't have any
> strong opinions about what they ought to be; this is more to get any
> such information recorded on the lists for historical purposes.

I have not tuned any of these defaults other than making sure that coremark is
still jump-threaded.  gcc.dg/tree-ssa/ssa-dom-thread-7.c is a test-case that
will check that we always optimize coremark.

> I think it's worth a note in the debug dump anytime you abort
> threading when you hit a limit.

Done.

> I'm a bit worried about the compile-time impact of all the
> recursion, but I'm willing to wait and see if it turns out to be a
> problem in practice.

Done, as Richi suggested, checking the flag_expensive_optimizations.

> >@@ -689,6 +692,12 @@ simplify_control_stmt_condition (edge e,
> >  	 pass specific callback to try and simplify it further.  */
> >        if (cached_lhs && ! is_gimple_min_invariant (cached_lhs))
> >          cached_lhs = (*simplify) (stmt, stmt);
> >+
> >+      /* We couldn't find an invariant.  But, callers of this
> >+	 function may be able to do something useful with the
> >+	 unmodified destination.  */
> >+      if (!cached_lhs)
> >+	cached_lhs = original_lhs;
> >      }
> >    else
> >      cached_lhs = NULL;
> Can't you just use COND rather than stuffing its value away into
> ORIGINAL_LHS?    CACHED_LHS may be better in some cases if it's an
> SSA_NAME (and it should be), but I doubt it matters in practice.
> 
> Or is it the case that you have to have the original condition --
> without any context sensitive equivalences used to "simplify" the
> condition.

I think we need to start the search for FSM jump threads with the original,
unsimplified condition.

> >+/* Return true if there is at least one path from START_BB to END_BB.
> >+   VISITED_BBS is used to make sure we don't fall into an infinite loop.  */
> >+
> >+static bool
> >+fsm_find_thread_path (basic_block start_bb, basic_block end_bb,
> >+		      vec<basic_block, va_gc> *&path,
> >+		      hash_set<basic_block> *visited_bbs, int n_insns)
> Update comment to indicate how PATH is used to return a path from
> START_BB to END_BB.

Done.

> >+		  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
> >+		       gsi_next (&gsi))
> >+		    ++n_insns;
> Probably don't want to count labels and GIMPLE_NOPs.  Probably do
> want to count non-virtual PHIs since those may end up as a copies or
> constant initializations.

Done.

> 
> >+		      if (j == 0)
> >+			kind = EDGE_START_FSM_THREAD;
> >+		      else if (single_pred_p (e->src))
> >+			kind = EDGE_NO_COPY_SRC_BLOCK;
> >+		      else {
> >+			kind = EDGE_COPY_SRC_JOINER_BLOCK;
> >+			++joiners;
> >+		      }
> Presumably the mis-formatting was added when you tracked the #
> joiners.  AFAICT that is a write-only variable and ought to be
> removed.  Along with the braces on the final ELSE which should
> restore proper formatting.

Done.

> 
> 
> >@@ -2343,6 +2493,55 @@ thread_through_all_blocks (bool may_peel_loop_headers)
> >    threaded_blocks = BITMAP_ALLOC (NULL);
> >    memset (&thread_stats, 0, sizeof (thread_stats));
> >
> >+  for (i = 0; i < paths.length ();)
> Comment before this loop.  I can see what you're doing, but I'm
> already very familiar with this code.  Basically what are you
> looking for in this loop and what do you do?

Done.

> Overall I think this is very very close and I really like the
> overall direction.  There's a few minor issues noted above and with
> those addressed, I think we should be ready to go.

Thanks for your careful review.  Please let me know if there still are things I
can improve in the attached patch.
The patch passes bootstrap on x86_64-linux and powerpc64-linux, and regtest,
except for a fail I have not seen in the past:

FAIL: gcc.c-torture/compile/pr27571.c   -Os  (internal compiler error)

I am still investigating why this fails: as far as I can see for now, this is
because in copying the FSM path we create an internal loop that is then
discovered by the loop verifier as a natural loop and is not yet in the existing
loop structures.  I will try to fix this in duplicate_seme by invalidating the
loop structure after we have generated code for all the FSM paths.  I will
submit an updated patch when it passes regtest.

> Removing most of tree-ssa-threadupdate.c and using SEME duplication
> would be a huge step forward for making this code more
> understandable. I look forward to any work you do in this space in
> the future.

I will clean up the patch and submit it for review (for stage 1).

Sebastian

[-- Attachment #2: 0001-extend-jump-thread-for-finite-state-automata-PR-5474.patch --]
[-- Type: text/x-diff, Size: 25213 bytes --]

From 80fe8b173a0ca913d7a51594f99c232885640b8c Mon Sep 17 00:00:00 2001
From: Sebastian Pop <s.pop@samsung.com>
Date: Fri, 26 Sep 2014 14:54:20 -0500
Subject: [PATCH] extend jump thread for finite state automata PR 54742

Adapted from a patch from James Greenhalgh.

	* params.def (max-fsm-thread-path-insns, max-fsm-thread-length,
	max-fsm-thread-paths): New.

	* doc/invoke.texi (max-fsm-thread-path-insns, max-fsm-thread-length,
	max-fsm-thread-paths): Documented.

	* tree-cfg.c (split_edge_bb_loc): Export.
	* tree-cfg.h (split_edge_bb_loc): Declared extern.

	* tree-ssa-threadedge.c (simplify_control_stmt_condition): Restore the
	original value of cond when simplification fails.
	(fsm_find_thread_path): New.
	(fsm_find_control_statement_thread_paths): New.
	(fsm_thread_through_normal_block): Call
	find_control_statement_thread_paths.

	* tree-ssa-threadupdate.c (dump_jump_thread_path): Pretty print
	EDGE_FSM_THREAD.
	(verify_seme): New.
	(duplicate_seme_region): New.
	(thread_through_all_blocks): Generate code for EDGE_FSM_THREAD edges
	calling duplicate_seme_region.

	* tree-ssa-threadupdate.h (jump_thread_edge_type): Add EDGE_FSM_THREAD.

	* testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c: New.
	* testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c: New.
---
 gcc/doc/invoke.texi                              |   12 +
 gcc/params.def                                   |   15 ++
 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c |   43 ++++
 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c |  127 +++++++++++
 gcc/tree-cfg.c                                   |    2 +-
 gcc/tree-cfg.h                                   |    1 +
 gcc/tree-ssa-threadedge.c                        |  264 +++++++++++++++++++++-
 gcc/tree-ssa-threadupdate.c                      |  202 ++++++++++++++++-
 gcc/tree-ssa-threadupdate.h                      |    1 +
 9 files changed, 664 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 89edddb..074183f 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -10624,6 +10624,18 @@ large and significantly increase compile time at optimization level
 @option{-O1} and higher.  This parameter is a maximum nubmer of statements
 in a single generated constructor.  Default value is 5000.
 
+@item max-fsm-thread-path-insns
+Maximum number of instructions to copy when duplicating blocks on a
+finite state automaton jump thread path.  The default is 100.
+
+@item max-fsm-thread-length
+Maximum number of basic blocks on a finite state automaton jump thread
+path.  The default is 10.
+
+@item max-fsm-thread-paths
+Maximum number of new jump thread paths to create for a finite state
+automaton.  The default is 50.
+
 @end table
 @end table
 
diff --git a/gcc/params.def b/gcc/params.def
index 9b21c07..edf3f53 100644
--- a/gcc/params.def
+++ b/gcc/params.def
@@ -1140,6 +1140,21 @@ DEFPARAM (PARAM_CHKP_MAX_CTOR_SIZE,
 	  "Maximum number of statements to be included into a single static "
 	  "constructor generated by Pointer Bounds Checker",
 	  5000, 100, 0)
+
+DEFPARAM (PARAM_MAX_FSM_THREAD_PATH_INSNS,
+	  "max-fsm-thread-path-insns",
+	  "Maximum number of instructions to copy when duplicating blocks on a finite state automaton jump thread path",
+	  100, 1, 999999)
+
+DEFPARAM (PARAM_MAX_FSM_THREAD_LENGTH,
+	  "max-fsm-thread-length",
+	  "Maximum number of basic blocks on a finite state automaton jump thread path",
+	  10, 1, 999999)
+
+DEFPARAM (PARAM_MAX_FSM_THREAD_PATHS,
+	  "max-fsm-thread-paths",
+	  "Maximum number of new jump thread paths to create for a finite state automaton",
+	  50, 1, 999999)
 /*
 
 Local variables:
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
new file mode 100644
index 0000000..bb34a74
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
@@ -0,0 +1,43 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-dom1-details" } */
+/* { dg-final { scan-tree-dump-times "FSM" 6 "dom1" } } */
+/* { dg-final { cleanup-tree-dump "dom1" } } */
+
+int sum0, sum1, sum2, sum3;
+int foo (char *s, char **ret)
+{
+  int state=0;
+  char c;
+
+  for (; *s && state != 4; s++)
+    {
+      c = *s;
+      if (c == '*')
+	{
+	  s++;
+	  break;
+	}
+      switch (state)
+	{
+	case 0:
+	  if (c == '+')
+	    state = 1;
+	  else if (c != '-')
+	    sum0+=c;
+	  break;
+	case 1:
+	  if (c == '+')
+	    state = 2;
+	  else if (c == '-')
+	    state = 0;
+	  else
+	    sum1+=c;
+	  break;
+	default:
+	  break;
+	}
+
+    }
+  *ret = s;
+  return state;
+}
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c
new file mode 100644
index 0000000..21474f0
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c
@@ -0,0 +1,127 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-dom1-details" } */
+/* { dg-final { scan-tree-dump-times "FSM" 19 "dom1" } } */
+/* { dg-final { cleanup-tree-dump "dom1" } } */
+
+enum STATE {
+  S0=0,
+  SI,
+  S1,
+  S2,
+  S3,
+  S4,
+  S5,
+  S6
+};
+
+int bar (enum STATE s);
+
+enum STATE foo (unsigned char **y, unsigned *c)
+{
+  unsigned char *x = *y;
+  unsigned char n;
+  enum STATE s = S0;
+
+  for( ; *x && s != SI; x++ )
+    {
+      n = *x;
+      if (n == 'x')
+	{
+	  x++;
+	  break;
+	}
+      switch(s)
+	{
+	case S0:
+	  if(bar(n))
+	    s = S3;
+	  else if( n == 'a' || n == 'b' )
+	    s = S1;
+	  else if( n == 'c' )
+	    s = S4;
+	  else
+	    {
+	      s = SI;
+	      c[SI]++;
+	    }
+	  c[S0]++;
+	  break;
+	case S1:
+	  if(bar(n))
+	    {
+	      s = S3;
+	      c[S1]++;
+	    }
+	  else if( n == 'c' )
+	    {
+	      s = S4;
+	      c[S1]++;
+	    }
+	  else
+	    {
+	      s = SI;
+	      c[S1]++;
+	    }
+	  break;
+	case S3:
+	  if( n == 'c' )
+	    {
+	      s = S4;
+	      c[S3]++;
+	    }
+	  else if(!bar(n))
+	    {
+	      s = SI;
+	      c[S3]++;
+	    }
+	  break;
+	case S4:
+	  if( n == 'E' || n == 'e' )
+	    {
+	      s = S2;
+	      c[S4]++;
+	    }
+	  else if(!bar(n))
+	    {
+	      s = SI;
+	      c[S4]++;
+	    }
+	  break;
+	case S2:
+	  if( n == 'a' || n == 'b' )
+	    {
+	      s = S5;
+	      c[S2]++;
+	    }
+	  else
+	    {
+	      s = SI;
+	      c[S2]++;
+	    }
+	  break;
+	case S5:
+	  if(bar(n))
+	    {
+	      s = S6;
+	      c[S5]++;
+	    }
+	  else
+	    {
+	      s = SI;
+	      c[S5]++;
+	    }
+	  break;
+	case S6:
+	  if(!bar(n))
+	    {
+	      s = SI;
+	      c[SI]++;
+	    }
+	  break;
+	default:
+	  break;
+	}
+    }
+  *y=x;
+  return s;
+}
diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
index 0a8d7a9..a4ac9d8 100644
--- a/gcc/tree-cfg.c
+++ b/gcc/tree-cfg.c
@@ -2666,7 +2666,7 @@ reinstall_phi_args (edge new_edge, edge old_edge)
    near its "logical" location.  This is of most help to humans looking
    at debugging dumps.  */
 
-static basic_block
+basic_block
 split_edge_bb_loc (edge edge_in)
 {
   basic_block dest = edge_in->dest;
diff --git a/gcc/tree-cfg.h b/gcc/tree-cfg.h
index d35e5ba..834fa71 100644
--- a/gcc/tree-cfg.h
+++ b/gcc/tree-cfg.h
@@ -67,6 +67,7 @@ extern void verify_gimple_in_cfg (struct function *, bool);
 extern tree gimple_block_label (basic_block);
 extern void add_phi_args_after_copy_bb (basic_block);
 extern void add_phi_args_after_copy (basic_block *, unsigned, edge);
+extern basic_block split_edge_bb_loc (edge);
 extern bool gimple_duplicate_sese_region (edge, edge, basic_block *, unsigned,
 					basic_block *, bool);
 extern bool gimple_duplicate_sese_tail (edge, edge, basic_block *, unsigned,
diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c
index 8b0b7b8..a6fb361 100644
--- a/gcc/tree-ssa-threadedge.c
+++ b/gcc/tree-ssa-threadedge.c
@@ -56,6 +56,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "params.h"
 #include "tree-ssa-threadedge.h"
 #include "builtins.h"
+#include "cfg.h"
+#include "cfganal.h"
 
 /* To avoid code explosion due to jump threading, we limit the
    number of statements we are going to copy.  This variable
@@ -661,6 +663,7 @@ simplify_control_stmt_condition (edge e,
      rather than use a relational operator.  These are simpler to handle.  */
   if (TREE_CODE (cond) == SSA_NAME)
     {
+      tree original_lhs = cond;
       cached_lhs = cond;
 
       /* Get the variable's current value from the equivalence chains.
@@ -689,6 +692,12 @@ simplify_control_stmt_condition (edge e,
 	 pass specific callback to try and simplify it further.  */
       if (cached_lhs && ! is_gimple_min_invariant (cached_lhs))
         cached_lhs = (*simplify) (stmt, stmt);
+
+      /* We couldn't find an invariant.  But, callers of this
+	 function may be able to do something useful with the
+	 unmodified destination.  */
+      if (!cached_lhs)
+	cached_lhs = original_lhs;
     }
   else
     cached_lhs = NULL;
@@ -948,6 +957,236 @@ thread_around_empty_blocks (edge taken_edge,
   return false;
 }
 
+/* Return true if the CFG contains at least one path from START_BB to END_BB.
+   When a path is found, record in PATH the blocks from END_BB to START_BB.
+   VISITED_BBS is used to make sure we don't fall into an infinite loop.  */
+
+static bool
+fsm_find_thread_path (basic_block start_bb, basic_block end_bb,
+		      vec<basic_block, va_gc> *&path,
+		      hash_set<basic_block> *visited_bbs, int n_insns)
+{
+  if (start_bb == end_bb)
+    {
+      vec_safe_push (path, start_bb);
+      return true;
+    }
+
+  if (!visited_bbs->add (start_bb))
+    {
+      edge e;
+      edge_iterator ei;
+      FOR_EACH_EDGE (e, ei, start_bb->succs)
+	if (fsm_find_thread_path (e->dest, end_bb, path, visited_bbs, n_insns))
+	  {
+	    vec_safe_push (path, start_bb);
+	    return true;
+	  }
+    }
+
+  return false;
+}
+
+static int max_threaded_paths;
+
+/* We trace the value of the variable EXPR back through any phi nodes looking
+   for places where it gets a constant value and save the path.  Stop after
+   having recorded MAX_PATHS jump threading paths.  */
+
+static void
+fsm_find_control_statement_thread_paths (tree expr,
+					 hash_set<gimple> *visited_phis,
+					 vec<basic_block, va_gc> *&path)
+{
+  tree var = SSA_NAME_VAR (expr);
+  gimple def_stmt = SSA_NAME_DEF_STMT (expr);
+  basic_block var_bb = gimple_bb (def_stmt);
+
+  if (var == NULL || var_bb == NULL)
+    return;
+
+  /* For the moment we assume that an SSA chain only contains phi nodes, and
+     eventually one of the phi arguments will be an integer constant.  In the
+     future, this could be extended to also handle simple assignments of
+     arithmetic operations.  */
+  if (gimple_code (def_stmt) != GIMPLE_PHI)
+    return;
+
+  /* Avoid infinite recursion.  */
+  if (visited_phis->add (def_stmt))
+    return;
+
+  gphi *phi = as_a <gphi *> (def_stmt);
+  int next_path_length = 0;
+  basic_block last_bb_in_path = path->last ();
+
+  /* Following the chain of SSA_NAME definitions, we jumped from a definition in
+     LAST_BB_IN_PATH to a definition in VAR_BB.  When these basic blocks are
+     different, append to PATH the blocks from LAST_BB_IN_PATH to VAR_BB.  */
+  if (var_bb != last_bb_in_path)
+    {
+      edge e;
+      int e_count = 0;
+      edge_iterator ei;
+      vec<basic_block, va_gc> *next_path;
+      vec_alloc (next_path, n_basic_blocks_for_fn (cfun));
+
+      FOR_EACH_EDGE (e, ei, last_bb_in_path->preds)
+	{
+	  hash_set<basic_block> *visited_bbs = new hash_set<basic_block>;
+
+	  if (fsm_find_thread_path (var_bb, e->src, next_path, visited_bbs, 0))
+	    ++e_count;
+
+	  delete visited_bbs;
+
+	  /* If there is more than one path, stop.  */
+	  if (e_count > 1)
+	    {
+	      vec_free (next_path);
+	      return;
+	    }
+	}
+
+      /* Append all the nodes from NEXT_PATH to PATH.  */
+      vec_safe_splice (path, next_path);
+      next_path_length = next_path->length ();
+      vec_free (next_path);
+    }
+
+  gcc_assert (path->last () == var_bb);
+
+  /* Iterate over the arguments of PHI.  */
+  unsigned int i;
+  for (i = 0; i < gimple_phi_num_args (phi); i++)
+    {
+      tree arg = gimple_phi_arg_def (phi, i);
+      basic_block bbi = gimple_phi_arg_edge (phi, i)->src;
+
+      /* Skip edges pointing outside the current loop.  */
+      if (!arg || var_bb->loop_father != bbi->loop_father)
+	continue;
+
+      if (TREE_CODE (arg) == SSA_NAME)
+	{
+	  vec_safe_push (path, bbi);
+	  /* Recursively follow SSA_NAMEs looking for a constant definition.  */
+	  fsm_find_control_statement_thread_paths (arg, visited_phis, path);
+	  path->pop ();
+	  continue;
+	}
+
+      if (TREE_CODE (arg) != INTEGER_CST)
+	continue;
+
+      int path_length = path->length ();
+      /* A path with less than 2 basic blocks should not be jump-threaded.  */
+      if (path_length < 2)
+	continue;
+
+      if (path_length > PARAM_VALUE (PARAM_MAX_FSM_THREAD_LENGTH))
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "FSM jump-thread path not considered: "
+		     "the number of basic blocks on the path "
+		     "exceeds PARAM_MAX_FSM_THREAD_LENGTH.\n");
+	  continue;
+	}
+
+      if (max_threaded_paths <= 0)
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "FSM jump-thread path not considered: "
+		     "the number of previously recorded FSM paths to thread "
+		     "exceeds PARAM_MAX_FSM_THREAD_PATHS.\n");
+	  continue;
+	}
+
+      /* Add BBI to the path.  */
+      vec_safe_push (path, bbi);
+      ++path_length;
+
+      int n_insns = 0;
+      gimple_stmt_iterator gsi;
+      int j;
+      loop_p loop = (*path)[0]->loop_father;
+      bool path_crosses_loops = false;
+
+      /* Count the number of instructions on the path: as these instructions
+	 will have to be duplicated, we will not record the path if there are
+	 too many instructions on the path.  Also check that all the blocks in
+	 the path belong to a single loop.  */
+      for (j = 1; j < path_length - 1; j++)
+	{
+	  basic_block bb = (*path)[j];
+
+	  if (bb->loop_father != loop)
+	    {
+	      path_crosses_loops = true;
+	      break;
+	    }
+
+	  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+	    {
+	      gimple stmt = gsi_stmt (gsi);
+	      /* Do not count empty statements and labels.  */
+	      if (gimple_code (stmt) != GIMPLE_NOP
+		  && gimple_code (stmt) != GIMPLE_LABEL
+		  && !is_gimple_debug (stmt))
+		++n_insns;
+	    }
+	}
+
+      if (path_crosses_loops)
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "FSM jump-thread path not considered: "
+		     "the path crosses loops.\n");
+	  path->pop ();
+	  continue;
+	}
+
+      if (n_insns >= PARAM_VALUE (PARAM_MAX_FSM_THREAD_PATH_INSNS))
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "FSM jump-thread path not considered: "
+		     "the number of instructions on the path "
+		     "exceeds PARAM_MAX_FSM_THREAD_PATH_INSNS.\n");
+	  path->pop ();
+	  continue;
+	}
+
+      vec<jump_thread_edge *> *jump_thread_path
+	= new vec<jump_thread_edge *> ();
+
+      /* Record the edges between the blocks in PATH.  */
+      for (j = 0; j < path_length - 1; j++)
+	{
+	  edge e = find_edge ((*path)[path_length - j - 1],
+			      (*path)[path_length - j - 2]);
+	  gcc_assert (e);
+	  jump_thread_edge *x = new jump_thread_edge (e, EDGE_FSM_THREAD);
+	  jump_thread_path->safe_push (x);
+	}
+
+      /* Add the edge taken when the control variable has value ARG.  */
+      edge taken_edge = find_taken_edge ((*path)[0], arg);
+      jump_thread_edge *x
+	= new jump_thread_edge (taken_edge, EDGE_NO_COPY_SRC_BLOCK);
+      jump_thread_path->safe_push (x);
+
+      register_jump_thread (jump_thread_path);
+      --max_threaded_paths;
+
+      /* Remove BBI from the path.  */
+      path->pop ();
+    }
+
+  /* Remove all the nodes that we added from NEXT_PATH.  */
+  if (next_path_length)
+    vec_safe_truncate (path, (path->length () - next_path_length));
+}
+
 /* We are exiting E->src, see if E->dest ends with a conditional
    jump which has a known value when reached via E.
 
@@ -1033,7 +1272,10 @@ thread_through_normal_block (edge e,
       cond = simplify_control_stmt_condition (e, stmt, dummy_cond, simplify,
 					      handle_dominating_asserts);
 
-      if (cond && is_gimple_min_invariant (cond))
+      if (!cond)
+	return 0;
+
+      if (is_gimple_min_invariant (cond))
 	{
 	  edge taken_edge = find_taken_edge (e->dest, cond);
 	  basic_block dest = (taken_edge ? taken_edge->dest : NULL);
@@ -1079,6 +1321,26 @@ thread_through_normal_block (edge e,
 				      backedge_seen_p);
 	  return 1;
 	}
+
+      if (!flag_expensive_optimizations
+	  || TREE_CODE (cond) != SSA_NAME
+	  || e->dest->loop_father != e->src->loop_father
+	  || loop_depth (e->dest->loop_father) == 0)
+	return 0;
+
+      /* When COND cannot be simplified, try to find paths from a control
+	 statement back through the PHI nodes which would affect that control
+	 statement.  */
+      vec<basic_block, va_gc> *bb_path;
+      vec_alloc (bb_path, n_basic_blocks_for_fn (cfun));
+      vec_safe_push (bb_path, e->dest);
+      hash_set<gimple> *visited_phis = new hash_set<gimple>;
+
+      max_threaded_paths = PARAM_VALUE (PARAM_MAX_FSM_THREAD_PATHS);
+      fsm_find_control_statement_thread_paths (cond, visited_phis, bb_path);
+
+      delete visited_phis;
+      vec_free (bb_path);
     }
   return 0;
 }
diff --git a/gcc/tree-ssa-threadupdate.c b/gcc/tree-ssa-threadupdate.c
index ca0b8bf..1dbffee 100644
--- a/gcc/tree-ssa-threadupdate.c
+++ b/gcc/tree-ssa-threadupdate.c
@@ -167,8 +167,9 @@ dump_jump_thread_path (FILE *dump_file, vec<jump_thread_edge *> path,
 		       bool registering)
 {
   fprintf (dump_file,
-	   "  %s jump thread: (%d, %d) incoming edge; ",
+	   "  %s%s jump thread: (%d, %d) incoming edge; ",
 	   (registering ? "Registering" : "Cancelling"),
+	   (path[0]->type == EDGE_FSM_THREAD ? " FSM": ""),
 	   path[0]->e->src->index, path[0]->e->dest->index);
 
   for (unsigned int i = 1; i < path.length (); i++)
@@ -2317,6 +2318,155 @@ bb_ends_with_multiway_branch (basic_block bb ATTRIBUTE_UNUSED)
   return false;
 }
 
+/* Verify that the REGION is a Single Entry Multiple Exits region: make sure no
+   edge other than ENTRY is entering the REGION.  */
+
+DEBUG_FUNCTION void
+verify_seme (edge entry, basic_block *region, unsigned n_region)
+{
+  bitmap bbs = BITMAP_ALLOC (NULL);
+
+  for (unsigned i = 0; i < n_region; i++)
+    bitmap_set_bit (bbs, region[i]->index);
+
+  for (unsigned i = 0; i < n_region; i++)
+    {
+      edge e;
+      edge_iterator ei;
+      basic_block bb = region[i];
+
+      /* All predecessors other than ENTRY->src should be in the region.  */
+      for (ei = ei_start (bb->preds); (e = ei_safe_edge (ei)); ei_next (&ei))
+	if (e != entry)
+	  gcc_assert (bitmap_bit_p (bbs, e->src->index));
+    }
+
+  BITMAP_FREE (bbs);
+}
+
+/* Duplicates a Single Entry Multiple Exit REGION (set of N_REGION basic
+   blocks).  The ENTRY edge is redirected to the duplicate of the region.  If
+   REGION is not a Single Entry region, ignore any incoming edges other than
+   ENTRY: this makes the copied region a Single Entry region.
+
+   Remove the last conditional statement in the last basic block in the REGION,
+   and create a single fallthru edge pointing to the same destination as the
+   EXIT edge.
+
+   The new basic blocks are stored to REGION_COPY in the same order as they had
+   in REGION, provided that REGION_COPY is not NULL.
+
+   Returns false if it is unable to copy the region, true otherwise.  */
+
+static bool
+duplicate_seme_region (edge entry, edge exit,
+		       basic_block *region, unsigned n_region,
+		       basic_block *region_copy)
+{
+  unsigned i;
+  bool free_region_copy = false, copying_header = false;
+  struct loop *loop = entry->dest->loop_father;
+  edge exit_copy;
+  edge redirected;
+  int total_freq = 0, entry_freq = 0;
+  gcov_type total_count = 0, entry_count = 0;
+
+  if (!can_copy_bbs_p (region, n_region))
+    return false;
+
+  /* Some sanity checking.  Note that we do not check for all possible
+     misuses of the functions.  I.e. if you ask to copy something weird,
+     it will work, but the state of structures probably will not be
+     correct.  */
+  for (i = 0; i < n_region; i++)
+    {
+      /* We do not handle subloops, i.e. all the blocks must belong to the
+	 same loop.  */
+      if (region[i]->loop_father != loop)
+	return false;
+    }
+
+  initialize_original_copy_tables ();
+
+  if (copying_header)
+    set_loop_copy (loop, loop_outer (loop));
+  else
+    set_loop_copy (loop, loop);
+
+  if (!region_copy)
+    {
+      region_copy = XNEWVEC (basic_block, n_region);
+      free_region_copy = true;
+    }
+
+  if (entry->dest->count)
+    {
+      total_count = entry->dest->count;
+      entry_count = entry->count;
+      /* Fix up corner cases, to avoid division by zero or creation of negative
+	 frequencies.  */
+      if (entry_count > total_count)
+	entry_count = total_count;
+    }
+  else
+    {
+      total_freq = entry->dest->frequency;
+      entry_freq = EDGE_FREQUENCY (entry);
+      /* Fix up corner cases, to avoid division by zero or creation of negative
+	 frequencies.  */
+      if (total_freq == 0)
+	total_freq = 1;
+      else if (entry_freq > total_freq)
+	entry_freq = total_freq;
+    }
+
+  copy_bbs (region, n_region, region_copy, &exit, 1, &exit_copy, loop,
+	    split_edge_bb_loc (entry), 0);
+  if (total_count)
+    {
+      scale_bbs_frequencies_gcov_type (region, n_region,
+				       total_count - entry_count,
+				       total_count);
+      scale_bbs_frequencies_gcov_type (region_copy, n_region, entry_count,
+				       total_count);
+    }
+  else
+    {
+      scale_bbs_frequencies_int (region, n_region, total_freq - entry_freq,
+				 total_freq);
+      scale_bbs_frequencies_int (region_copy, n_region, entry_freq, total_freq);
+    }
+
+#ifdef ENABLE_CHECKING
+  /* Make sure no edge other than ENTRY is entering the copied region.  */
+  verify_seme (entry, region_copy, n_region);
+#endif
+
+  /* Remove the last branch in the jump thread path.  */
+  remove_ctrl_stmt_and_useless_edges (region_copy[n_region - 1], exit->dest);
+  edge e = make_edge (region_copy[n_region - 1], exit->dest, EDGE_FALLTHRU);
+
+  if (e) {
+    rescan_loop_exit (e, true, false);
+    e->probability = REG_BR_PROB_BASE;
+    e->count = region_copy[n_region - 1]->count;
+  }
+
+  /* Redirect the entry and add the phi node arguments.  */
+  redirected = redirect_edge_and_branch (entry, get_bb_copy (entry->dest));
+  gcc_assert (redirected != NULL);
+  flush_pending_stmts (entry);
+
+  /* Add the other PHI node arguments.  */
+  add_phi_args_after_copy (region_copy, n_region, NULL);
+
+  if (free_region_copy)
+    free (region_copy);
+
+  free_original_copy_tables ();
+  return true;
+}
+
 /* Walk through all blocks and thread incoming edges to the appropriate
    outgoing edge for each edge pair recorded in THREADED_EDGES.
 
@@ -2343,6 +2493,56 @@ thread_through_all_blocks (bool may_peel_loop_headers)
   threaded_blocks = BITMAP_ALLOC (NULL);
   memset (&thread_stats, 0, sizeof (thread_stats));
 
+  /* Jump-thread all FSM threads before other jump-threads.  */
+  for (i = 0; i < paths.length ();)
+    {
+      vec<jump_thread_edge *> *path = paths[i];
+      edge entry = (*path)[0]->e;
+
+      if ((*path)[0]->type != EDGE_FSM_THREAD
+	  /* Do not jump-thread twice from the same block.  */
+	  || bitmap_bit_p (threaded_blocks, entry->src->index)) {
+	i++;
+	continue;
+      }
+
+      unsigned len = path->length ();
+      edge exit = (*path)[len - 1]->e;
+      basic_block *region = XNEWVEC (basic_block, len - 1);
+
+      for (unsigned int j = 0; j < len - 1; j++)
+	region[j] = (*path)[j]->e->dest;
+
+      if (duplicate_seme_region (entry, exit, region, len - 1, NULL))
+	{
+	  /* We do not update dominance info.  */
+	  free_dominance_info (CDI_DOMINATORS);
+	  bitmap_set_bit (threaded_blocks, entry->src->index);
+	}
+
+      delete_jump_thread_path (path);
+      paths.unordered_remove (i);
+    }
+
+  /* Remove from PATHS all the jump-threads starting with an edge already
+     jump-threaded.  */
+  for (i = 0; i < paths.length ();)
+    {
+      vec<jump_thread_edge *> *path = paths[i];
+      edge entry = (*path)[0]->e;
+
+      /* Do not jump-thread twice from the same block.  */
+      if (bitmap_bit_p (threaded_blocks, entry->src->index))
+	{
+	  delete_jump_thread_path (path);
+	  paths.unordered_remove (i);
+	}
+      else
+	i++;
+    }
+
+  bitmap_clear (threaded_blocks);
+
   mark_threaded_blocks (threaded_blocks);
 
   initialize_original_copy_tables ();
diff --git a/gcc/tree-ssa-threadupdate.h b/gcc/tree-ssa-threadupdate.h
index 426aca5..22c5bce 100644
--- a/gcc/tree-ssa-threadupdate.h
+++ b/gcc/tree-ssa-threadupdate.h
@@ -26,6 +26,7 @@ extern bool thread_through_all_blocks (bool);
 enum jump_thread_edge_type
 {
   EDGE_START_JUMP_THREAD,
+  EDGE_FSM_THREAD,
   EDGE_COPY_SRC_BLOCK,
   EDGE_COPY_SRC_JOINER_BLOCK,
   EDGE_NO_COPY_SRC_BLOCK
-- 
1.7.10.4
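
[Editorial note: the pattern this pass targets is a coremark-style state machine, a switch inside a loop whose controlling variable is assigned a known constant on every path back to the loop header, so each assignment can be jump-threaded straight to its target case.  A minimal sketch of that shape -- hypothetical code, not taken from the patch or its testsuite:]

```c
#include <assert.h>

/* A switch-in-a-loop state machine.  On every path back to the loop
   header the value of ST is a compile-time constant, so a jump-threading
   pass can redirect each assignment straight to the matching case label
   instead of re-dispatching through the switch on the next iteration.  */
enum state { S0, S1, DONE };

/* Count the characters consumed before the machine reaches DONE
   (or the whole string when DONE is never reached).  */
int
fsm_run (const char *s)
{
  enum state st = S0;
  int n = 0;

  for (; *s && st != DONE; s++, n++)
    switch (st)
      {
      case S0:
        st = (*s == 'a') ? S1 : S0;  /* ST is constant on each outcome.  */
        break;
      case S1:
        st = (*s == 'b') ? DONE : S0;
        break;
      default:
        break;
      }
  return n;
}
```

[Compiled at -O2 by a GCC build containing this patch, -fdump-tree-dom1-details should report "FSM" jump-thread paths for loops of this shape, which is what the ssa-dom-thread-6.c and ssa-dom-thread-7.c tests check.]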


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-12-04  8:38                                       ` Sebastian Pop
@ 2014-12-04  9:14                                         ` Sebastian Pop
  2014-12-04 11:00                                           ` Sebastian Pop
  2014-12-05 20:08                                           ` Jeff Law
  0 siblings, 2 replies; 54+ messages in thread
From: Sebastian Pop @ 2014-12-04  9:14 UTC (permalink / raw)
  To: Jeff Law; +Cc: Richard Biener, James Greenhalgh, Steve Ellcey, GCC Patches

Sebastian Pop wrote:
> a fail I have not seen in the past:
> 
> FAIL: gcc.c-torture/compile/pr27571.c   -Os  (internal compiler error)
> 
> I am still investigating why this fails: as far as I can see for now this is
> because in copying the FSM path we create an internal loop that is then
> discovered by the loop verifier as a natural loop and is not yet in the existing
loop structures.  I will try to fix this in duplicate_seme by invalidating the
loop structure after we have generated code for all the FSM paths.  I will submit an
> updated patch when it passes regtest.

We need at least this patch to fix the fail:

@@ -2518,6 +2518,7 @@ thread_through_all_blocks (bool may_peel_loop_headers)
      if (duplicate_seme_region (entry, exit, region, len - 1, NULL))
        {
          /* We do not update dominance info.  */
          free_dominance_info (CDI_DOMINATORS);
          bitmap_set_bit (threaded_blocks, entry->src->index);
+         retval = true;
        }

And this will trigger in the end of the code gen function:

 if (retval)
    loops_state_set (LOOPS_NEED_FIXUP);

That will fix the loop structures.  I'm testing this patch on top of the one I
have just sent out.

Sebastian


* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-12-04  9:14                                         ` Sebastian Pop
@ 2014-12-04 11:00                                           ` Sebastian Pop
  2014-12-05 20:08                                           ` Jeff Law
  1 sibling, 0 replies; 54+ messages in thread
From: Sebastian Pop @ 2014-12-04 11:00 UTC (permalink / raw)
  To: Jeff Law; +Cc: Richard Biener, James Greenhalgh, Steve Ellcey, GCC Patches

Sebastian Pop wrote:
> Sebastian Pop wrote:
> > a fail I have not seen in the past:
> > 
> > FAIL: gcc.c-torture/compile/pr27571.c   -Os  (internal compiler error)
> > 
> > I am still investigating why this fails: as far as I can see for now this is
> > because in copying the FSM path we create an internal loop that is then
> > discovered by the loop verifier as a natural loop and is not yet in the existing
> loop structures.  I will try to fix this in duplicate_seme by invalidating the
> loop structure after we have generated code for all the FSM paths.  I will submit an
> > updated patch when it passes regtest.
> 
> We need at least this patch to fix the fail:
> 
> @@ -2518,6 +2518,7 @@ thread_through_all_blocks (bool may_peel_loop_headers)
>       if (duplicate_seme_region (entry, exit, region, len - 1, NULL))
>         {
>           /* We do not update dominance info.  */
>           free_dominance_info (CDI_DOMINATORS);
>           bitmap_set_bit (threaded_blocks, entry->src->index);
> +         retval = true;
>         }
> 
> And this will trigger in the end of the code gen function:
> 
>  if (retval)
>     loops_state_set (LOOPS_NEED_FIXUP);
> 
> That will fix the loop structures.  I'm testing this patch on top of the one I
> have just sent out.

This passed bootstrap and regression test on x86_64-linux.


* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-12-01 21:06                                     ` Jeff Law
  2014-12-02 10:15                                       ` Richard Biener
  2014-12-04  8:38                                       ` Sebastian Pop
@ 2014-12-04 11:04                                       ` Sebastian Pop
  2014-12-04 14:30                                         ` Sebastian Pop
  2 siblings, 1 reply; 54+ messages in thread
From: Sebastian Pop @ 2014-12-04 11:04 UTC (permalink / raw)
  To: Jeff Law; +Cc: Richard Biener, James Greenhalgh, Steve Ellcey, GCC Patches

Jeff Law wrote:
> I'm a bit worried about compile-time impacts of all the
> recursion

I will also restrict the recursion to the loop in which we look for the FSM
thread, like this:

diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c
index a6fb361..9a153bb 100644
--- a/gcc/tree-ssa-threadedge.c
+++ b/gcc/tree-ssa-threadedge.c
@@ -959,13 +959,17 @@ thread_around_empty_blocks (edge taken_edge,
 
 /* Return true if the CFG contains at least one path from START_BB to END_BB.
    When a path is found, record in PATH the blocks from END_BB to START_BB.
-   VISITED_BBS is used to make sure we don't fall into an infinite loop.  */
+   VISITED_BBS is used to make sure we don't fall into an infinite loop.  Bound
+   the recursion to basic blocks belonging to LOOP.  */
 
 static bool
 fsm_find_thread_path (basic_block start_bb, basic_block end_bb,
                      vec<basic_block, va_gc> *&path,
-                     hash_set<basic_block> *visited_bbs, int n_insns)
+                     hash_set<basic_block> *visited_bbs, loop_p loop)
 {
+  if (loop != start_bb->loop_father)
+    return false;
+
   if (start_bb == end_bb)
     {
       vec_safe_push (path, start_bb);
@@ -977,7 +981,7 @@ fsm_find_thread_path (basic_block start_bb, basic_block end_bb,
       edge e;
       edge_iterator ei;
       FOR_EACH_EDGE (e, ei, start_bb->succs)
-       if (fsm_find_thread_path (e->dest, end_bb, path, visited_bbs, n_insns))
+       if (fsm_find_thread_path (e->dest, end_bb, path, visited_bbs, loop))
          {
            vec_safe_push (path, start_bb);
            return true;
@@ -1035,7 +1039,8 @@ fsm_find_control_statement_thread_paths (tree expr,
        {
          hash_set<basic_block> *visited_bbs = new hash_set<basic_block>;
 
-         if (fsm_find_thread_path (var_bb, e->src, next_path, visited_bbs, 0))
+         if (fsm_find_thread_path (var_bb, e->src, next_path, visited_bbs,
+                                   e->src->loop_father))
            ++e_count;
 
          delete visited_bbs;
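
[Editorial note: in isolation, the bounded search above is a plain depth-first path enumeration that refuses to step outside a region.  A generic sketch of the idea in C -- a toy graph, not GCC internals:]

```c
#include <assert.h>
#include <stdbool.h>

enum { N = 5 };                 /* nodes in the toy directed graph */

/* Edges: 0->1, 1->2, 2->0 (a back edge), 2->3, 3->4.  */
static const bool edge[N][N] = {
  [0][1] = true, [1][2] = true, [2][0] = true, [2][3] = true, [3][4] = true,
};

/* Return true if a path FROM -> ... -> TO exists using only nodes with
   IN_REGION set.  VISITED guards against cycles; the region test
   (standing in for the loop_father check in the patch) both bounds the
   recursion and keeps every discovered path inside one region.  */
static bool
find_path (int from, int to, const bool *in_region, bool *visited)
{
  if (!in_region[from])
    return false;               /* refuse to leave the region */
  if (from == to)
    return true;
  if (visited[from])
    return false;               /* already explored: avoid cycling */
  visited[from] = true;
  for (int next = 0; next < N; next++)
    if (edge[from][next] && find_path (next, to, in_region, visited))
      return true;
  return false;
}
```

[The 2->0 back edge is harmless here thanks to VISITED alone; in a large CFG the region test is what keeps the walk from exploring the whole function.]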


* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-12-04 11:04                                       ` Sebastian Pop
@ 2014-12-04 14:30                                         ` Sebastian Pop
  2014-12-05 20:12                                           ` Jeff Law
  0 siblings, 1 reply; 54+ messages in thread
From: Sebastian Pop @ 2014-12-04 14:30 UTC (permalink / raw)
  To: Jeff Law; +Cc: Richard Biener, James Greenhalgh, Steve Ellcey, GCC Patches

[-- Attachment #1: Type: text/plain, Size: 342 bytes --]

Sebastian Pop wrote:
> Jeff Law wrote:
> > I'm a bit worried about compile-time impacts of all the
> > recursion
> 
> I will also restrict the recursion to the loop in which we look for the FSM
> thread.

The attached patch includes this change.  It passed bootstrap and regression
test on x86_64-linux.  Ok to commit?

Thanks,
Sebastian

[-- Attachment #2: 0001-extend-jump-thread-for-finite-state-automata-PR-5474.patch --]
[-- Type: text/x-diff, Size: 25566 bytes --]

From 0e3312921c07c0e0dd5c1bf5f24050b2336475ef Mon Sep 17 00:00:00 2001
From: Sebastian Pop <s.pop@samsung.com>
Date: Fri, 26 Sep 2014 14:54:20 -0500
Subject: [PATCH] extend jump thread for finite state automata PR 54742

Adapted from a patch from James Greenhalgh.

	* params.def (max-fsm-thread-path-insns, max-fsm-thread-length,
	max-fsm-thread-paths): New.

	* doc/invoke.texi (max-fsm-thread-path-insns, max-fsm-thread-length,
	max-fsm-thread-paths): Documented.

	* tree-cfg.c (split_edge_bb_loc): Export.
	* tree-cfg.h (split_edge_bb_loc): Declared extern.

	* tree-ssa-threadedge.c (simplify_control_stmt_condition): Restore the
	original value of cond when simplification fails.
	(fsm_find_thread_path): New.
	(fsm_find_control_statement_thread_paths): New.
	(fsm_thread_through_normal_block): Call
	find_control_statement_thread_paths.

	* tree-ssa-threadupdate.c (dump_jump_thread_path): Pretty print
	EDGE_FSM_THREAD.
	(verify_seme): New.
	(duplicate_seme_region): New.
	(thread_through_all_blocks): Generate code for EDGE_FSM_THREAD edges
	calling duplicate_seme_region.

	* tree-ssa-threadupdate.h (jump_thread_edge_type): Add EDGE_FSM_THREAD.

	* testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c: New.
	* testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c: New.
---
 gcc/doc/invoke.texi                              |   12 +
 gcc/params.def                                   |   15 ++
 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c |   43 ++++
 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c |  127 ++++++++++
 gcc/tree-cfg.c                                   |    2 +-
 gcc/tree-cfg.h                                   |    1 +
 gcc/tree-ssa-threadedge.c                        |  277 +++++++++++++++++++++-
 gcc/tree-ssa-threadupdate.c                      |  203 +++++++++++++++-
 gcc/tree-ssa-threadupdate.h                      |    1 +
 9 files changed, 678 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 89edddb..074183f 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -10624,6 +10624,18 @@ large and significantly increase compile time at optimization level
 @option{-O1} and higher.  This parameter is a maximum number of statements
 in a single generated constructor.  Default value is 5000.
 
+@item max-fsm-thread-path-insns
+Maximum number of instructions to copy when duplicating blocks on a
+finite state automaton jump thread path.  The default is 100.
+
+@item max-fsm-thread-length
+Maximum number of basic blocks on a finite state automaton jump thread
+path.  The default is 10.
+
+@item max-fsm-thread-paths
+Maximum number of new jump thread paths to create for a finite state
+automaton.  The default is 50.
+
 @end table
 @end table
 
diff --git a/gcc/params.def b/gcc/params.def
index 9b21c07..edf3f53 100644
--- a/gcc/params.def
+++ b/gcc/params.def
@@ -1140,6 +1140,21 @@ DEFPARAM (PARAM_CHKP_MAX_CTOR_SIZE,
 	  "Maximum number of statements to be included into a single static "
 	  "constructor generated by Pointer Bounds Checker",
 	  5000, 100, 0)
+
+DEFPARAM (PARAM_MAX_FSM_THREAD_PATH_INSNS,
+	  "max-fsm-thread-path-insns",
+	  "Maximum number of instructions to copy when duplicating blocks on a finite state automaton jump thread path",
+	  100, 1, 999999)
+
+DEFPARAM (PARAM_MAX_FSM_THREAD_LENGTH,
+	  "max-fsm-thread-length",
+	  "Maximum number of basic blocks on a finite state automaton jump thread path",
+	  10, 1, 999999)
+
+DEFPARAM (PARAM_MAX_FSM_THREAD_PATHS,
+	  "max-fsm-thread-paths",
+	  "Maximum number of new jump thread paths to create for a finite state automaton",
+	  50, 1, 999999)
 /*
 
 Local variables:
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
new file mode 100644
index 0000000..bb34a74
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
@@ -0,0 +1,43 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-dom1-details" } */
+/* { dg-final { scan-tree-dump-times "FSM" 6 "dom1" } } */
+/* { dg-final { cleanup-tree-dump "dom1" } } */
+
+int sum0, sum1, sum2, sum3;
+int foo (char *s, char **ret)
+{
+  int state=0;
+  char c;
+
+  for (; *s && state != 4; s++)
+    {
+      c = *s;
+      if (c == '*')
+	{
+	  s++;
+	  break;
+	}
+      switch (state)
+	{
+	case 0:
+	  if (c == '+')
+	    state = 1;
+	  else if (c != '-')
+	    sum0+=c;
+	  break;
+	case 1:
+	  if (c == '+')
+	    state = 2;
+	  else if (c == '-')
+	    state = 0;
+	  else
+	    sum1+=c;
+	  break;
+	default:
+	  break;
+	}
+
+    }
+  *ret = s;
+  return state;
+}
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c
new file mode 100644
index 0000000..21474f0
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c
@@ -0,0 +1,127 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-dom1-details" } */
+/* { dg-final { scan-tree-dump-times "FSM" 19 "dom1" } } */
+/* { dg-final { cleanup-tree-dump "dom1" } } */
+
+enum STATE {
+  S0=0,
+  SI,
+  S1,
+  S2,
+  S3,
+  S4,
+  S5,
+  S6
+};
+
+int bar (enum STATE s);
+
+enum STATE foo (unsigned char **y, unsigned *c)
+{
+  unsigned char *x = *y;
+  unsigned char n;
+  enum STATE s = S0;
+
+  for( ; *x && s != SI; x++ )
+    {
+      n = *x;
+      if (n == 'x')
+	{
+	  x++;
+	  break;
+	}
+      switch(s)
+	{
+	case S0:
+	  if(bar(n))
+	    s = S3;
+	  else if( n == 'a' || n == 'b' )
+	    s = S1;
+	  else if( n == 'c' )
+	    s = S4;
+	  else
+	    {
+	      s = SI;
+	      c[SI]++;
+	    }
+	  c[S0]++;
+	  break;
+	case S1:
+	  if(bar(n))
+	    {
+	      s = S3;
+	      c[S1]++;
+	    }
+	  else if( n == 'c' )
+	    {
+	      s = S4;
+	      c[S1]++;
+	    }
+	  else
+	    {
+	      s = SI;
+	      c[S1]++;
+	    }
+	  break;
+	case S3:
+	  if( n == 'c' )
+	    {
+	      s = S4;
+	      c[S3]++;
+	    }
+	  else if(!bar(n))
+	    {
+	      s = SI;
+	      c[S3]++;
+	    }
+	  break;
+	case S4:
+	  if( n == 'E' || n == 'e' )
+	    {
+	      s = S2;
+	      c[S4]++;
+	    }
+	  else if(!bar(n))
+	    {
+	      s = SI;
+	      c[S4]++;
+	    }
+	  break;
+	case S2:
+	  if( n == 'a' || n == 'b' )
+	    {
+	      s = S5;
+	      c[S2]++;
+	    }
+	  else
+	    {
+	      s = SI;
+	      c[S2]++;
+	    }
+	  break;
+	case S5:
+	  if(bar(n))
+	    {
+	      s = S6;
+	      c[S5]++;
+	    }
+	  else
+	    {
+	      s = SI;
+	      c[S5]++;
+	    }
+	  break;
+	case S6:
+	  if(!bar(n))
+	    {
+	      s = SI;
+	      c[SI]++;
+	    }
+	  break;
+	default:
+	  break;
+	}
+    }
+  *y=x;
+  return s;
+}
diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
index 0a8d7a9..a4ac9d8 100644
--- a/gcc/tree-cfg.c
+++ b/gcc/tree-cfg.c
@@ -2666,7 +2666,7 @@ reinstall_phi_args (edge new_edge, edge old_edge)
    near its "logical" location.  This is of most help to humans looking
    at debugging dumps.  */
 
-static basic_block
+basic_block
 split_edge_bb_loc (edge edge_in)
 {
   basic_block dest = edge_in->dest;
diff --git a/gcc/tree-cfg.h b/gcc/tree-cfg.h
index d35e5ba..834fa71 100644
--- a/gcc/tree-cfg.h
+++ b/gcc/tree-cfg.h
@@ -67,6 +67,7 @@ extern void verify_gimple_in_cfg (struct function *, bool);
 extern tree gimple_block_label (basic_block);
 extern void add_phi_args_after_copy_bb (basic_block);
 extern void add_phi_args_after_copy (basic_block *, unsigned, edge);
+extern basic_block split_edge_bb_loc (edge);
 extern bool gimple_duplicate_sese_region (edge, edge, basic_block *, unsigned,
 					basic_block *, bool);
 extern bool gimple_duplicate_sese_tail (edge, edge, basic_block *, unsigned,
diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c
index 8b0b7b8..29b20c8 100644
--- a/gcc/tree-ssa-threadedge.c
+++ b/gcc/tree-ssa-threadedge.c
@@ -56,6 +56,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "params.h"
 #include "tree-ssa-threadedge.h"
 #include "builtins.h"
+#include "cfg.h"
+#include "cfganal.h"
 
 /* To avoid code explosion due to jump threading, we limit the
    number of statements we are going to copy.  This variable
@@ -661,6 +663,7 @@ simplify_control_stmt_condition (edge e,
      rather than use a relational operator.  These are simpler to handle.  */
   if (TREE_CODE (cond) == SSA_NAME)
     {
+      tree original_lhs = cond;
       cached_lhs = cond;
 
       /* Get the variable's current value from the equivalence chains.
@@ -689,6 +692,12 @@ simplify_control_stmt_condition (edge e,
 	 pass specific callback to try and simplify it further.  */
       if (cached_lhs && ! is_gimple_min_invariant (cached_lhs))
         cached_lhs = (*simplify) (stmt, stmt);
+
+      /* We couldn't find an invariant.  But, callers of this
+	 function may be able to do something useful with the
+	 unmodified destination.  */
+      if (!cached_lhs)
+	cached_lhs = original_lhs;
     }
   else
     cached_lhs = NULL;
@@ -948,6 +957,249 @@ thread_around_empty_blocks (edge taken_edge,
   return false;
 }
 
+/* Return true if the CFG contains at least one path from START_BB to END_BB.
+   When a path is found, record in PATH the blocks from END_BB to START_BB.
+   VISITED_BBS is used to make sure we don't fall into an infinite loop.  Bound
+   the recursion to basic blocks belonging to LOOP.  */
+
+static bool
+fsm_find_thread_path (basic_block start_bb, basic_block end_bb,
+		      vec<basic_block, va_gc> *&path,
+		      hash_set<basic_block> *visited_bbs, loop_p loop)
+{
+  if (loop != start_bb->loop_father)
+    return false;
+
+  if (start_bb == end_bb)
+    {
+      vec_safe_push (path, start_bb);
+      return true;
+    }
+
+  if (!visited_bbs->add (start_bb))
+    {
+      edge e;
+      edge_iterator ei;
+      FOR_EACH_EDGE (e, ei, start_bb->succs)
+	if (fsm_find_thread_path (e->dest, end_bb, path, visited_bbs, loop))
+	  {
+	    vec_safe_push (path, start_bb);
+	    return true;
+	  }
+    }
+
+  return false;
+}
+
+static int max_threaded_paths;
+
+/* We trace the value of the variable EXPR back through any phi nodes looking
+   for places where it gets a constant value and save the path.  Stop after
+   having recorded MAX_PATHS jump threading paths.  */
+
+static void
+fsm_find_control_statement_thread_paths (tree expr,
+					 hash_set<gimple> *visited_phis,
+					 vec<basic_block, va_gc> *&path)
+{
+  tree var = SSA_NAME_VAR (expr);
+  gimple def_stmt = SSA_NAME_DEF_STMT (expr);
+  basic_block var_bb = gimple_bb (def_stmt);
+
+  if (var == NULL || var_bb == NULL)
+    return;
+
+  /* For the moment we assume that an SSA chain only contains phi nodes, and
+     eventually one of the phi arguments will be an integer constant.  In the
+     future, this could be extended to also handle simple assignments of
+     arithmetic operations.  */
+  if (gimple_code (def_stmt) != GIMPLE_PHI)
+    return;
+
+  /* Avoid infinite recursion.  */
+  if (visited_phis->add (def_stmt))
+    return;
+
+  gphi *phi = as_a <gphi *> (def_stmt);
+  int next_path_length = 0;
+  basic_block last_bb_in_path = path->last ();
+
+  /* Following the chain of SSA_NAME definitions, we jumped from a definition in
+     LAST_BB_IN_PATH to a definition in VAR_BB.  When these basic blocks are
+     different, append to PATH the blocks from LAST_BB_IN_PATH to VAR_BB.  */
+  if (var_bb != last_bb_in_path)
+    {
+      edge e;
+      int e_count = 0;
+      edge_iterator ei;
+      vec<basic_block, va_gc> *next_path;
+      vec_alloc (next_path, n_basic_blocks_for_fn (cfun));
+
+      FOR_EACH_EDGE (e, ei, last_bb_in_path->preds)
+	{
+	  hash_set<basic_block> *visited_bbs = new hash_set<basic_block>;
+
+	  if (fsm_find_thread_path (var_bb, e->src, next_path, visited_bbs,
+				    e->src->loop_father))
+	    ++e_count;
+
+	  delete visited_bbs;
+
+	  /* If there is more than one path, stop.  */
+	  if (e_count > 1)
+	    {
+	      vec_free (next_path);
+	      return;
+	    }
+	}
+
+      /* Stop if we have not found a path: this could occur when the recursion
+	 is stopped by one of the bounds.  */
+      if (e_count == 0)
+	{
+	  vec_free (next_path);
+	  return;
+	}
+
+      /* Append all the nodes from NEXT_PATH to PATH.  */
+      vec_safe_splice (path, next_path);
+      next_path_length = next_path->length ();
+      vec_free (next_path);
+    }
+
+  gcc_assert (path->last () == var_bb);
+
+  /* Iterate over the arguments of PHI.  */
+  unsigned int i;
+  for (i = 0; i < gimple_phi_num_args (phi); i++)
+    {
+      tree arg = gimple_phi_arg_def (phi, i);
+      basic_block bbi = gimple_phi_arg_edge (phi, i)->src;
+
+      /* Skip edges pointing outside the current loop.  */
+      if (!arg || var_bb->loop_father != bbi->loop_father)
+	continue;
+
+      if (TREE_CODE (arg) == SSA_NAME)
+	{
+	  vec_safe_push (path, bbi);
+	  /* Recursively follow SSA_NAMEs looking for a constant definition.  */
+	  fsm_find_control_statement_thread_paths (arg, visited_phis, path);
+	  path->pop ();
+	  continue;
+	}
+
+      if (TREE_CODE (arg) != INTEGER_CST)
+	continue;
+
+      int path_length = path->length ();
+      /* A path with less than 2 basic blocks should not be jump-threaded.  */
+      if (path_length < 2)
+	continue;
+
+      if (path_length > PARAM_VALUE (PARAM_MAX_FSM_THREAD_LENGTH))
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "FSM jump-thread path not considered: "
+		     "the number of basic blocks on the path "
+		     "exceeds PARAM_MAX_FSM_THREAD_LENGTH.\n");
+	  continue;
+	}
+
+      if (max_threaded_paths <= 0)
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "FSM jump-thread path not considered: "
+		     "the number of previously recorded FSM paths to thread "
+		     "exceeds PARAM_MAX_FSM_THREAD_PATHS.\n");
+	  continue;
+	}
+
+      /* Add BBI to the path.  */
+      vec_safe_push (path, bbi);
+      ++path_length;
+
+      int n_insns = 0;
+      gimple_stmt_iterator gsi;
+      int j;
+      loop_p loop = (*path)[0]->loop_father;
+      bool path_crosses_loops = false;
+
+      /* Count the number of instructions on the path: as these instructions
+	 will have to be duplicated, we will not record the path if there are
+	 too many instructions on the path.  Also check that all the blocks in
+	 the path belong to a single loop.  */
+      for (j = 1; j < path_length - 1; j++)
+	{
+	  basic_block bb = (*path)[j];
+
+	  if (bb->loop_father != loop)
+	    {
+	      path_crosses_loops = true;
+	      break;
+	    }
+
+	  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+	    {
+	      gimple stmt = gsi_stmt (gsi);
+	      /* Do not count empty statements and labels.  */
+	      if (gimple_code (stmt) != GIMPLE_NOP
+		  && gimple_code (stmt) != GIMPLE_LABEL
+		  && !is_gimple_debug (stmt))
+		++n_insns;
+	    }
+	}
+
+      if (path_crosses_loops)
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "FSM jump-thread path not considered: "
+		     "the path crosses loops.\n");
+	  path->pop ();
+	  continue;
+	}
+
+      if (n_insns >= PARAM_VALUE (PARAM_MAX_FSM_THREAD_PATH_INSNS))
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "FSM jump-thread path not considered: "
+		     "the number of instructions on the path "
+		     "exceeds PARAM_MAX_FSM_THREAD_PATH_INSNS.\n");
+	  path->pop ();
+	  continue;
+	}
+
+      vec<jump_thread_edge *> *jump_thread_path
+	= new vec<jump_thread_edge *> ();
+
+      /* Record the edges between the blocks in PATH.  */
+      for (j = 0; j < path_length - 1; j++)
+	{
+	  edge e = find_edge ((*path)[path_length - j - 1],
+			      (*path)[path_length - j - 2]);
+	  gcc_assert (e);
+	  jump_thread_edge *x = new jump_thread_edge (e, EDGE_FSM_THREAD);
+	  jump_thread_path->safe_push (x);
+	}
+
+      /* Add the edge taken when the control variable has value ARG.  */
+      edge taken_edge = find_taken_edge ((*path)[0], arg);
+      jump_thread_edge *x
+	= new jump_thread_edge (taken_edge, EDGE_NO_COPY_SRC_BLOCK);
+      jump_thread_path->safe_push (x);
+
+      register_jump_thread (jump_thread_path);
+      --max_threaded_paths;
+
+      /* Remove BBI from the path.  */
+      path->pop ();
+    }
+
+  /* Remove all the nodes that we added from NEXT_PATH.  */
+  if (next_path_length)
+    vec_safe_truncate (path, (path->length () - next_path_length));
+}
+
 /* We are exiting E->src, see if E->dest ends with a conditional
    jump which has a known value when reached via E.
 
@@ -1033,7 +1285,10 @@ thread_through_normal_block (edge e,
       cond = simplify_control_stmt_condition (e, stmt, dummy_cond, simplify,
 					      handle_dominating_asserts);
 
-      if (cond && is_gimple_min_invariant (cond))
+      if (!cond)
+	return 0;
+
+      if (is_gimple_min_invariant (cond))
 	{
 	  edge taken_edge = find_taken_edge (e->dest, cond);
 	  basic_block dest = (taken_edge ? taken_edge->dest : NULL);
@@ -1079,6 +1334,26 @@ thread_through_normal_block (edge e,
 				      backedge_seen_p);
 	  return 1;
 	}
+
+      if (!flag_expensive_optimizations
+	  || TREE_CODE (cond) != SSA_NAME
+	  || e->dest->loop_father != e->src->loop_father
+	  || loop_depth (e->dest->loop_father) == 0)
+	return 0;
+
+      /* When COND cannot be simplified, try to find paths from a control
+	 statement back through the PHI nodes which would affect that control
+	 statement.  */
+      vec<basic_block, va_gc> *bb_path;
+      vec_alloc (bb_path, n_basic_blocks_for_fn (cfun));
+      vec_safe_push (bb_path, e->dest);
+      hash_set<gimple> *visited_phis = new hash_set<gimple>;
+
+      max_threaded_paths = PARAM_VALUE (PARAM_MAX_FSM_THREAD_PATHS);
+      fsm_find_control_statement_thread_paths (cond, visited_phis, bb_path);
+
+      delete visited_phis;
+      vec_free (bb_path);
     }
   return 0;
 }
diff --git a/gcc/tree-ssa-threadupdate.c b/gcc/tree-ssa-threadupdate.c
index ca0b8bf..4f83a2e 100644
--- a/gcc/tree-ssa-threadupdate.c
+++ b/gcc/tree-ssa-threadupdate.c
@@ -167,8 +167,9 @@ dump_jump_thread_path (FILE *dump_file, vec<jump_thread_edge *> path,
 		       bool registering)
 {
   fprintf (dump_file,
-	   "  %s jump thread: (%d, %d) incoming edge; ",
+	   "  %s%s jump thread: (%d, %d) incoming edge; ",
 	   (registering ? "Registering" : "Cancelling"),
+	   (path[0]->type == EDGE_FSM_THREAD ? " FSM": ""),
 	   path[0]->e->src->index, path[0]->e->dest->index);
 
   for (unsigned int i = 1; i < path.length (); i++)
@@ -2317,6 +2318,155 @@ bb_ends_with_multiway_branch (basic_block bb ATTRIBUTE_UNUSED)
   return false;
 }
 
+/* Verify that the REGION is a Single Entry Multiple Exits region: make sure no
+   edge other than ENTRY is entering the REGION.  */
+
+DEBUG_FUNCTION void
+verify_seme (edge entry, basic_block *region, unsigned n_region)
+{
+  bitmap bbs = BITMAP_ALLOC (NULL);
+
+  for (unsigned i = 0; i < n_region; i++)
+    bitmap_set_bit (bbs, region[i]->index);
+
+  for (unsigned i = 0; i < n_region; i++)
+    {
+      edge e;
+      edge_iterator ei;
+      basic_block bb = region[i];
+
+      /* All predecessors other than ENTRY->src should be in the region.  */
+      for (ei = ei_start (bb->preds); (e = ei_safe_edge (ei)); ei_next (&ei))
+	if (e != entry)
+	  gcc_assert (bitmap_bit_p (bbs, e->src->index));
+    }
+
+  BITMAP_FREE (bbs);
+}
+
+/* Duplicates a Single Entry Multiple Exit REGION (set of N_REGION basic
+   blocks).  The ENTRY edge is redirected to the duplicate of the region.  If
+   REGION is not a Single Entry region, ignore any incoming edges other than
+   ENTRY: this makes the copied region a Single Entry region.
+
+   Remove the last conditional statement in the last basic block in the REGION,
+   and create a single fallthru edge pointing to the same destination as the
+   EXIT edge.
+
+   The new basic blocks are stored to REGION_COPY in the same order as they had
+   in REGION, provided that REGION_COPY is not NULL.
+
+   Returns false if it is unable to copy the region, true otherwise.  */
+
+static bool
+duplicate_seme_region (edge entry, edge exit,
+		       basic_block *region, unsigned n_region,
+		       basic_block *region_copy)
+{
+  unsigned i;
+  bool free_region_copy = false, copying_header = false;
+  struct loop *loop = entry->dest->loop_father;
+  edge exit_copy;
+  edge redirected;
+  int total_freq = 0, entry_freq = 0;
+  gcov_type total_count = 0, entry_count = 0;
+
+  if (!can_copy_bbs_p (region, n_region))
+    return false;
+
+  /* Some sanity checking.  Note that we do not check for all possible
+     misuses of the functions.  I.e. if you ask to copy something weird,
+     it will work, but the state of structures probably will not be
+     correct.  */
+  for (i = 0; i < n_region; i++)
+    {
+      /* We do not handle subloops, i.e. all the blocks must belong to the
+	 same loop.  */
+      if (region[i]->loop_father != loop)
+	return false;
+    }
+
+  initialize_original_copy_tables ();
+
+  if (copying_header)
+    set_loop_copy (loop, loop_outer (loop));
+  else
+    set_loop_copy (loop, loop);
+
+  if (!region_copy)
+    {
+      region_copy = XNEWVEC (basic_block, n_region);
+      free_region_copy = true;
+    }
+
+  if (entry->dest->count)
+    {
+      total_count = entry->dest->count;
+      entry_count = entry->count;
+      /* Fix up corner cases, to avoid division by zero or creation of negative
+	 frequencies.  */
+      if (entry_count > total_count)
+	entry_count = total_count;
+    }
+  else
+    {
+      total_freq = entry->dest->frequency;
+      entry_freq = EDGE_FREQUENCY (entry);
+      /* Fix up corner cases, to avoid division by zero or creation of negative
+	 frequencies.  */
+      if (total_freq == 0)
+	total_freq = 1;
+      else if (entry_freq > total_freq)
+	entry_freq = total_freq;
+    }
+
+  copy_bbs (region, n_region, region_copy, &exit, 1, &exit_copy, loop,
+	    split_edge_bb_loc (entry), 0);
+  if (total_count)
+    {
+      scale_bbs_frequencies_gcov_type (region, n_region,
+				       total_count - entry_count,
+				       total_count);
+      scale_bbs_frequencies_gcov_type (region_copy, n_region, entry_count,
+				       total_count);
+    }
+  else
+    {
+      scale_bbs_frequencies_int (region, n_region, total_freq - entry_freq,
+				 total_freq);
+      scale_bbs_frequencies_int (region_copy, n_region, entry_freq, total_freq);
+    }
+
+#ifdef ENABLE_CHECKING
+  /* Make sure no edge other than ENTRY is entering the copied region.  */
+  verify_seme (entry, region_copy, n_region);
+#endif
+
+  /* Remove the last branch in the jump thread path.  */
+  remove_ctrl_stmt_and_useless_edges (region_copy[n_region - 1], exit->dest);
+  edge e = make_edge (region_copy[n_region - 1], exit->dest, EDGE_FALLTHRU);
+
+  if (e) {
+    rescan_loop_exit (e, true, false);
+    e->probability = REG_BR_PROB_BASE;
+    e->count = region_copy[n_region - 1]->count;
+  }
+
+  /* Redirect the entry and add the phi node arguments.  */
+  redirected = redirect_edge_and_branch (entry, get_bb_copy (entry->dest));
+  gcc_assert (redirected != NULL);
+  flush_pending_stmts (entry);
+
+  /* Add the other PHI node arguments.  */
+  add_phi_args_after_copy (region_copy, n_region, NULL);
+
+  if (free_region_copy)
+    free (region_copy);
+
+  free_original_copy_tables ();
+  return true;
+}
+
 /* Walk through all blocks and thread incoming edges to the appropriate
    outgoing edge for each edge pair recorded in THREADED_EDGES.
 
@@ -2343,6 +2493,57 @@ thread_through_all_blocks (bool may_peel_loop_headers)
   threaded_blocks = BITMAP_ALLOC (NULL);
   memset (&thread_stats, 0, sizeof (thread_stats));
 
+  /* Jump-thread all FSM threads before other jump-threads.  */
+  for (i = 0; i < paths.length ();)
+    {
+      vec<jump_thread_edge *> *path = paths[i];
+      edge entry = (*path)[0]->e;
+
+      if ((*path)[0]->type != EDGE_FSM_THREAD
+	  /* Do not jump-thread twice from the same block.  */
+	  || bitmap_bit_p (threaded_blocks, entry->src->index)) {
+	i++;
+	continue;
+      }
+
+      unsigned len = path->length ();
+      edge exit = (*path)[len - 1]->e;
+      basic_block *region = XNEWVEC (basic_block, len - 1);
+
+      for (unsigned int j = 0; j < len - 1; j++)
+	region[j] = (*path)[j]->e->dest;
+
+      if (duplicate_seme_region (entry, exit, region, len - 1, NULL))
+	{
+	  /* We do not update dominance info.  */
+	  free_dominance_info (CDI_DOMINATORS);
+	  bitmap_set_bit (threaded_blocks, entry->src->index);
+	  retval = true;
+	}
+
+      delete_jump_thread_path (path);
+      paths.unordered_remove (i);
+    }
+
+  /* Remove from PATHS all the jump-threads starting with an edge already
+     jump-threaded.  */
+  for (i = 0; i < paths.length ();)
+    {
+      vec<jump_thread_edge *> *path = paths[i];
+      edge entry = (*path)[0]->e;
+
+      /* Do not jump-thread twice from the same block.  */
+      if (bitmap_bit_p (threaded_blocks, entry->src->index))
+	{
+	  delete_jump_thread_path (path);
+	  paths.unordered_remove (i);
+	}
+      else
+	i++;
+    }
+
+  bitmap_clear (threaded_blocks);
+
   mark_threaded_blocks (threaded_blocks);
 
   initialize_original_copy_tables ();
diff --git a/gcc/tree-ssa-threadupdate.h b/gcc/tree-ssa-threadupdate.h
index 426aca5..22c5bce 100644
--- a/gcc/tree-ssa-threadupdate.h
+++ b/gcc/tree-ssa-threadupdate.h
@@ -26,6 +26,7 @@ extern bool thread_through_all_blocks (bool);
 enum jump_thread_edge_type
 {
   EDGE_START_JUMP_THREAD,
+  EDGE_FSM_THREAD,
   EDGE_COPY_SRC_BLOCK,
   EDGE_COPY_SRC_JOINER_BLOCK,
   EDGE_NO_COPY_SRC_BLOCK
-- 
1.7.10.4


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-12-04  9:14                                         ` Sebastian Pop
  2014-12-04 11:00                                           ` Sebastian Pop
@ 2014-12-05 20:08                                           ` Jeff Law
  1 sibling, 0 replies; 54+ messages in thread
From: Jeff Law @ 2014-12-05 20:08 UTC (permalink / raw)
  To: Sebastian Pop; +Cc: Richard Biener, James Greenhalgh, Steve Ellcey, GCC Patches

On 12/04/14 02:14, Sebastian Pop wrote:
> Sebastian Pop wrote:
>> a fail I have not seen in the past:
>>
>> FAIL: gcc.c-torture/compile/pr27571.c   -Os  (internal compiler error)
>>
>> I am still investigating why this fails: as far as I can see for now this is
>> because in copying the FSM path we create an internal loop that is then
>> discovered by the loop verifier as a natural loop and is not yet in the existing
> loop structures.  I will try to fix this in duplicate_seme by invalidating the
>> loop structure after we code generated all the FSM paths.  I will submit an
>> updated patch when it passes regtest.
>
> We need at least this patch to fix the fail:
>
> @@ -2518,6 +2518,7 @@ thread_through_all_blocks (bool may_peel_loop_headers)
>        if (duplicate_seme_region (entry, exit, region, len - 1, NULL))
>          {
>            /* We do not update dominance info.  */
>            free_dominance_info (CDI_DOMINATORS);
>            bitmap_set_bit (threaded_blocks, entry->src->index);
> +         retval = true;
>          }
>
> And this will trigger in the end of the code gen function:
>
>   if (retval)
>      loops_state_set (LOOPS_NEED_FIXUP);
>
> That will fix the loop structures.  I'm testing this patch on top of the one I
> have just sent out.
That looks correct to me.

Jeff

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-12-04 14:30                                         ` Sebastian Pop
@ 2014-12-05 20:12                                           ` Jeff Law
  2014-12-06 13:47                                             ` Sebastian Pop
  0 siblings, 1 reply; 54+ messages in thread
From: Jeff Law @ 2014-12-05 20:12 UTC (permalink / raw)
  To: Sebastian Pop; +Cc: Richard Biener, James Greenhalgh, Steve Ellcey, GCC Patches

On 12/04/14 07:29, Sebastian Pop wrote:
> Sebastian Pop wrote:
>> Jeff Law wrote:
>>> I'm a bit worried about compile-time impacts of all the
>>> recursion
>>
>> I will also restrict the recursion to the loop in which we look for the FSM
>> thread.
>
> The attached patch includes this change.  It passed bootstrap and regression
> test on x86_64-linux.  Ok to commit?
OK to commit.  Thanks for your patience.

Can you follow-up with a change which throttles this optimization when 
-Os is in effect.  You can check optimize_function_for_size_p (cfun) and 
simply avoid the backward traversal or you could allow it in that case 
if the amount of copying is suitably small.  Your call.



jeff

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-12-05 20:12                                           ` Jeff Law
@ 2014-12-06 13:47                                             ` Sebastian Pop
  2014-12-06 19:21                                               ` Sebastian Pop
  2014-12-08 19:02                                               ` Jeff Law
  0 siblings, 2 replies; 54+ messages in thread
From: Sebastian Pop @ 2014-12-06 13:47 UTC (permalink / raw)
  To: Jeff Law; +Cc: Richard Biener, James Greenhalgh, Steve Ellcey, GCC Patches

[-- Attachment #1: Type: text/plain, Size: 1042 bytes --]

Jeff Law wrote:
> OK to commit.  Thanks for your patience.
> 
> Can you follow-up with a change which throttles this optimization
> when -Os is in effect.  You can check optimize_function_for_size_p
> (cfun) and simply avoid the backward traversal or you could allow it
> in that case if the amount of copying is suitably small.  Your call.

I think it does not make sense to duplicate paths at -Os, so I disabled FSM
jump-threading when optimizing for size, like this:

diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c
index 29b20c8..ce70311 100644
--- a/gcc/tree-ssa-threadedge.c
+++ b/gcc/tree-ssa-threadedge.c
@@ -1335,8 +1335,9 @@ thread_through_normal_block (edge e,
          return 1;
        }
 
       if (!flag_expensive_optimizations
+         || optimize_function_for_size_p (cfun)
          || TREE_CODE (cond) != SSA_NAME
          || e->dest->loop_father != e->src->loop_father
          || loop_depth (e->dest->loop_father) == 0)
        return 0;

I will regstrap and commit the attached patch.

Sebastian

[-- Attachment #2: 0001-extend-jump-thread-for-finite-state-automata.patch --]
[-- Type: text/x-diff, Size: 27513 bytes --]

From 1e2efaa2e3121170a938cd479d979b55c37cc4a4 Mon Sep 17 00:00:00 2001
From: Sebastian Pop <s.pop@samsung.com>
Date: Fri, 26 Sep 2014 14:54:20 -0500
Subject: [PATCH] extend jump thread for finite state automata

	PR tree-optimization/54742
	* params.def (max-fsm-thread-path-insns, max-fsm-thread-length,
	max-fsm-thread-paths): New.

	* doc/invoke.texi (max-fsm-thread-path-insns, max-fsm-thread-length,
	max-fsm-thread-paths): Documented.

	* tree-cfg.c (split_edge_bb_loc): Export.
	* tree-cfg.h (split_edge_bb_loc): Declared extern.

	* tree-ssa-threadedge.c (simplify_control_stmt_condition): Restore the
	original value of cond when simplification fails.
	(fsm_find_thread_path): New.
	(fsm_find_control_statement_thread_paths): New.
	(thread_through_normal_block): Call
	fsm_find_control_statement_thread_paths.

	* tree-ssa-threadupdate.c (dump_jump_thread_path): Pretty print
	EDGE_FSM_THREAD.
	(verify_seme): New.
	(duplicate_seme_region): New.
	(thread_through_all_blocks): Generate code for EDGE_FSM_THREAD edges
	calling duplicate_seme_region.

	* tree-ssa-threadupdate.h (jump_thread_edge_type): Add EDGE_FSM_THREAD.

	* testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c: New test.
	* testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c: New test.
---
 gcc/ChangeLog                                    |   29 +++
 gcc/doc/invoke.texi                              |   12 +
 gcc/params.def                                   |   15 ++
 gcc/testsuite/ChangeLog                          |    8 +
 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c |   43 ++++
 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c |  127 ++++++++++
 gcc/tree-cfg.c                                   |    2 +-
 gcc/tree-cfg.h                                   |    1 +
 gcc/tree-ssa-threadedge.c                        |  278 +++++++++++++++++++++-
 gcc/tree-ssa-threadupdate.c                      |  203 +++++++++++++++-
 gcc/tree-ssa-threadupdate.h                      |    1 +
 11 files changed, 716 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index b340b51..6cfd339 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,32 @@
+2014-12-06  James Greenhalgh  <james.greenhalgh@arm.com>
+	    Sebastian Pop  <s.pop@samsung.com>
+	    Brian Rzycki  <b.rzycki@samsung.com>
+
+	PR tree-optimization/54742
+	* params.def (max-fsm-thread-path-insns, max-fsm-thread-length,
+	max-fsm-thread-paths): New.
+
+	* doc/invoke.texi (max-fsm-thread-path-insns, max-fsm-thread-length,
+	max-fsm-thread-paths): Documented.
+
+	* tree-cfg.c (split_edge_bb_loc): Export.
+	* tree-cfg.h (split_edge_bb_loc): Declared extern.
+
+	* tree-ssa-threadedge.c (simplify_control_stmt_condition): Restore the
+	original value of cond when simplification fails.
+	(fsm_find_thread_path): New.
+	(fsm_find_control_statement_thread_paths): New.
+	(thread_through_normal_block): Call
+	fsm_find_control_statement_thread_paths.
+
+	* tree-ssa-threadupdate.c (dump_jump_thread_path): Pretty print
+	EDGE_FSM_THREAD.
+	(verify_seme): New.
+	(duplicate_seme_region): New.
+	(thread_through_all_blocks): Generate code for EDGE_FSM_THREAD edges
+	calling duplicate_seme_region.
+
+	* tree-ssa-threadupdate.h (jump_thread_edge_type): Add EDGE_FSM_THREAD.
+
 2014-12-06  H.J. Lu  <hongjiu.lu@intel.com>
 
 	PR target/64200
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 82f0794..70d1336 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -10623,6 +10623,18 @@ large and significantly increase compile time at optimization level
 @option{-O1} and higher.  This parameter is a maximum number of statements
 in a single generated constructor.  Default value is 5000.
 
+@item max-fsm-thread-path-insns
+Maximum number of instructions to copy when duplicating blocks on a
+finite state automaton jump thread path.  The default is 100.
+
+@item max-fsm-thread-length
+Maximum number of basic blocks on a finite state automaton jump thread
+path.  The default is 10.
+
+@item max-fsm-thread-paths
+Maximum number of new jump thread paths to create for a finite state
+automaton.  The default is 50.
+
 @end table
 @end table
 
diff --git a/gcc/params.def b/gcc/params.def
index 9b21c07..edf3f53 100644
--- a/gcc/params.def
+++ b/gcc/params.def
@@ -1140,6 +1140,21 @@ DEFPARAM (PARAM_CHKP_MAX_CTOR_SIZE,
 	  "Maximum number of statements to be included into a single static "
 	  "constructor generated by Pointer Bounds Checker",
 	  5000, 100, 0)
+
+DEFPARAM (PARAM_MAX_FSM_THREAD_PATH_INSNS,
+	  "max-fsm-thread-path-insns",
+	  "Maximum number of instructions to copy when duplicating blocks on a finite state automaton jump thread path",
+	  100, 1, 999999)
+
+DEFPARAM (PARAM_MAX_FSM_THREAD_LENGTH,
+	  "max-fsm-thread-length",
+	  "Maximum number of basic blocks on a finite state automaton jump thread path",
+	  10, 1, 999999)
+
+DEFPARAM (PARAM_MAX_FSM_THREAD_PATHS,
+	  "max-fsm-thread-paths",
+	  "Maximum number of new jump thread paths to create for a finite state automaton",
+	  50, 1, 999999)
 /*
 
 Local variables:
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index abeacd0..4c89397 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,11 @@
+2014-12-06  James Greenhalgh  <james.greenhalgh@arm.com>
+	    Sebastian Pop  <s.pop@samsung.com>
+	    Brian Rzycki  <b.rzycki@samsung.com>
+
+	PR tree-optimization/54742
+	* gcc.dg/tree-ssa/ssa-dom-thread-6.c: New test.
+	* gcc.dg/tree-ssa/ssa-dom-thread-7.c: New test.
+
 2014-12-06  Marek Polacek  <polacek@redhat.com>
 
 	PR tree-optimization/64183
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
new file mode 100644
index 0000000..bb34a74
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
@@ -0,0 +1,43 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-dom1-details" } */
+/* { dg-final { scan-tree-dump-times "FSM" 6 "dom1" } } */
+/* { dg-final { cleanup-tree-dump "dom1" } } */
+
+int sum0, sum1, sum2, sum3;
+int foo (char *s, char **ret)
+{
+  int state=0;
+  char c;
+
+  for (; *s && state != 4; s++)
+    {
+      c = *s;
+      if (c == '*')
+	{
+	  s++;
+	  break;
+	}
+      switch (state)
+	{
+	case 0:
+	  if (c == '+')
+	    state = 1;
+	  else if (c != '-')
+	    sum0+=c;
+	  break;
+	case 1:
+	  if (c == '+')
+	    state = 2;
+	  else if (c == '-')
+	    state = 0;
+	  else
+	    sum1+=c;
+	  break;
+	default:
+	  break;
+	}
+
+    }
+  *ret = s;
+  return state;
+}
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c
new file mode 100644
index 0000000..21474f0
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c
@@ -0,0 +1,127 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-dom1-details" } */
+/* { dg-final { scan-tree-dump-times "FSM" 19 "dom1" } } */
+/* { dg-final { cleanup-tree-dump "dom1" } } */
+
+enum STATE {
+  S0=0,
+  SI,
+  S1,
+  S2,
+  S3,
+  S4,
+  S5,
+  S6
+};
+
+int bar (enum STATE s);
+
+enum STATE foo (unsigned char **y, unsigned *c)
+{
+  unsigned char *x = *y;
+  unsigned char n;
+  enum STATE s = S0;
+
+  for( ; *x && s != SI; x++ )
+    {
+      n = *x;
+      if (n == 'x')
+	{
+	  x++;
+	  break;
+	}
+      switch(s)
+	{
+	case S0:
+	  if(bar(n))
+	    s = S3;
+	  else if( n == 'a' || n == 'b' )
+	    s = S1;
+	  else if( n == 'c' )
+	    s = S4;
+	  else
+	    {
+	      s = SI;
+	      c[SI]++;
+	    }
+	  c[S0]++;
+	  break;
+	case S1:
+	  if(bar(n))
+	    {
+	      s = S3;
+	      c[S1]++;
+	    }
+	  else if( n == 'c' )
+	    {
+	      s = S4;
+	      c[S1]++;
+	    }
+	  else
+	    {
+	      s = SI;
+	      c[S1]++;
+	    }
+	  break;
+	case S3:
+	  if( n == 'c' )
+	    {
+	      s = S4;
+	      c[S3]++;
+	    }
+	  else if(!bar(n))
+	    {
+	      s = SI;
+	      c[S3]++;
+	    }
+	  break;
+	case S4:
+	  if( n == 'E' || n == 'e' )
+	    {
+	      s = S2;
+	      c[S4]++;
+	    }
+	  else if(!bar(n))
+	    {
+	      s = SI;
+	      c[S4]++;
+	    }
+	  break;
+	case S2:
+	  if( n == 'a' || n == 'b' )
+	    {
+	      s = S5;
+	      c[S2]++;
+	    }
+	  else
+	    {
+	      s = SI;
+	      c[S2]++;
+	    }
+	  break;
+	case S5:
+	  if(bar(n))
+	    {
+	      s = S6;
+	      c[S5]++;
+	    }
+	  else
+	    {
+	      s = SI;
+	      c[S5]++;
+	    }
+	  break;
+	case S6:
+	  if(!bar(n))
+	    {
+	      s = SI;
+	      c[SI]++;
+	    }
+	  break;
+	default:
+	  break;
+	}
+    }
+  *y=x;
+  return s;
+}
diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
index fbbe9c8..6aca58d 100644
--- a/gcc/tree-cfg.c
+++ b/gcc/tree-cfg.c
@@ -2666,7 +2666,7 @@ reinstall_phi_args (edge new_edge, edge old_edge)
    near its "logical" location.  This is of most help to humans looking
    at debugging dumps.  */
 
-static basic_block
+basic_block
 split_edge_bb_loc (edge edge_in)
 {
   basic_block dest = edge_in->dest;
diff --git a/gcc/tree-cfg.h b/gcc/tree-cfg.h
index d35e5ba..834fa71 100644
--- a/gcc/tree-cfg.h
+++ b/gcc/tree-cfg.h
@@ -67,6 +67,7 @@ extern void verify_gimple_in_cfg (struct function *, bool);
 extern tree gimple_block_label (basic_block);
 extern void add_phi_args_after_copy_bb (basic_block);
 extern void add_phi_args_after_copy (basic_block *, unsigned, edge);
+extern basic_block split_edge_bb_loc (edge);
 extern bool gimple_duplicate_sese_region (edge, edge, basic_block *, unsigned,
 					basic_block *, bool);
 extern bool gimple_duplicate_sese_tail (edge, edge, basic_block *, unsigned,
diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c
index 8b0b7b8..ce70311 100644
--- a/gcc/tree-ssa-threadedge.c
+++ b/gcc/tree-ssa-threadedge.c
@@ -56,6 +56,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "params.h"
 #include "tree-ssa-threadedge.h"
 #include "builtins.h"
+#include "cfg.h"
+#include "cfganal.h"
 
 /* To avoid code explosion due to jump threading, we limit the
    number of statements we are going to copy.  This variable
@@ -661,6 +663,7 @@ simplify_control_stmt_condition (edge e,
      rather than use a relational operator.  These are simpler to handle.  */
   if (TREE_CODE (cond) == SSA_NAME)
     {
+      tree original_lhs = cond;
       cached_lhs = cond;
 
       /* Get the variable's current value from the equivalence chains.
@@ -689,6 +692,12 @@ simplify_control_stmt_condition (edge e,
 	 pass specific callback to try and simplify it further.  */
       if (cached_lhs && ! is_gimple_min_invariant (cached_lhs))
         cached_lhs = (*simplify) (stmt, stmt);
+
+      /* We couldn't find an invariant.  But, callers of this
+	 function may be able to do something useful with the
+	 unmodified destination.  */
+      if (!cached_lhs)
+	cached_lhs = original_lhs;
     }
   else
     cached_lhs = NULL;
@@ -948,6 +957,249 @@ thread_around_empty_blocks (edge taken_edge,
   return false;
 }
 
+/* Return true if the CFG contains at least one path from START_BB to END_BB.
+   When a path is found, record in PATH the blocks from END_BB to START_BB.
+   VISITED_BBS is used to make sure we don't fall into an infinite loop.  Bound
+   the recursion to basic blocks belonging to LOOP.  */
+
+static bool
+fsm_find_thread_path (basic_block start_bb, basic_block end_bb,
+		      vec<basic_block, va_gc> *&path,
+		      hash_set<basic_block> *visited_bbs, loop_p loop)
+{
+  if (loop != start_bb->loop_father)
+    return false;
+
+  if (start_bb == end_bb)
+    {
+      vec_safe_push (path, start_bb);
+      return true;
+    }
+
+  if (!visited_bbs->add (start_bb))
+    {
+      edge e;
+      edge_iterator ei;
+      FOR_EACH_EDGE (e, ei, start_bb->succs)
+	if (fsm_find_thread_path (e->dest, end_bb, path, visited_bbs, loop))
+	  {
+	    vec_safe_push (path, start_bb);
+	    return true;
+	  }
+    }
+
+  return false;
+}
+
+static int max_threaded_paths;
+
+/* We trace the value of the variable EXPR back through any phi nodes looking
+   for places where it gets a constant value and save the path.  Stop after
+   having recorded MAX_PATHS jump threading paths.  */
+
+static void
+fsm_find_control_statement_thread_paths (tree expr,
+					 hash_set<gimple> *visited_phis,
+					 vec<basic_block, va_gc> *&path)
+{
+  tree var = SSA_NAME_VAR (expr);
+  gimple def_stmt = SSA_NAME_DEF_STMT (expr);
+  basic_block var_bb = gimple_bb (def_stmt);
+
+  if (var == NULL || var_bb == NULL)
+    return;
+
+  /* For the moment we assume that an SSA chain only contains phi nodes, and
+     eventually one of the phi arguments will be an integer constant.  In the
+     future, this could be extended to also handle simple assignments of
+     arithmetic operations.  */
+  if (gimple_code (def_stmt) != GIMPLE_PHI)
+    return;
+
+  /* Avoid infinite recursion.  */
+  if (visited_phis->add (def_stmt))
+    return;
+
+  gphi *phi = as_a <gphi *> (def_stmt);
+  int next_path_length = 0;
+  basic_block last_bb_in_path = path->last ();
+
+  /* Following the chain of SSA_NAME definitions, we jumped from a definition in
+     LAST_BB_IN_PATH to a definition in VAR_BB.  When these basic blocks are
+     different, append to PATH the blocks from LAST_BB_IN_PATH to VAR_BB.  */
+  if (var_bb != last_bb_in_path)
+    {
+      edge e;
+      int e_count = 0;
+      edge_iterator ei;
+      vec<basic_block, va_gc> *next_path;
+      vec_alloc (next_path, n_basic_blocks_for_fn (cfun));
+
+      FOR_EACH_EDGE (e, ei, last_bb_in_path->preds)
+	{
+	  hash_set<basic_block> *visited_bbs = new hash_set<basic_block>;
+
+	  if (fsm_find_thread_path (var_bb, e->src, next_path, visited_bbs,
+				    e->src->loop_father))
+	    ++e_count;
+
+	  delete visited_bbs;
+
+	  /* If there is more than one path, stop.  */
+	  if (e_count > 1)
+	    {
+	      vec_free (next_path);
+	      return;
+	    }
+	}
+
+      /* Stop if we have not found a path: this could occur when the recursion
+	 is stopped by one of the bounds.  */
+      if (e_count == 0)
+	{
+	  vec_free (next_path);
+	  return;
+	}
+
+      /* Append all the nodes from NEXT_PATH to PATH.  */
+      vec_safe_splice (path, next_path);
+      next_path_length = next_path->length ();
+      vec_free (next_path);
+    }
+
+  gcc_assert (path->last () == var_bb);
+
+  /* Iterate over the arguments of PHI.  */
+  unsigned int i;
+  for (i = 0; i < gimple_phi_num_args (phi); i++)
+    {
+      tree arg = gimple_phi_arg_def (phi, i);
+      basic_block bbi = gimple_phi_arg_edge (phi, i)->src;
+
+      /* Skip edges pointing outside the current loop.  */
+      if (!arg || var_bb->loop_father != bbi->loop_father)
+	continue;
+
+      if (TREE_CODE (arg) == SSA_NAME)
+	{
+	  vec_safe_push (path, bbi);
+	  /* Recursively follow SSA_NAMEs looking for a constant definition.  */
+	  fsm_find_control_statement_thread_paths (arg, visited_phis, path);
+	  path->pop ();
+	  continue;
+	}
+
+      if (TREE_CODE (arg) != INTEGER_CST)
+	continue;
+
+      int path_length = path->length ();
+      /* A path with less than 2 basic blocks should not be jump-threaded.  */
+      if (path_length < 2)
+	continue;
+
+      if (path_length > PARAM_VALUE (PARAM_MAX_FSM_THREAD_LENGTH))
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "FSM jump-thread path not considered: "
+		     "the number of basic blocks on the path "
+		     "exceeds PARAM_MAX_FSM_THREAD_LENGTH.\n");
+	  continue;
+	}
+
+      if (max_threaded_paths <= 0)
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "FSM jump-thread path not considered: "
+		     "the number of previously recorded FSM paths to thread "
+		     "exceeds PARAM_MAX_FSM_THREAD_PATHS.\n");
+	  continue;
+	}
+
+      /* Add BBI to the path.  */
+      vec_safe_push (path, bbi);
+      ++path_length;
+
+      int n_insns = 0;
+      gimple_stmt_iterator gsi;
+      int j;
+      loop_p loop = (*path)[0]->loop_father;
+      bool path_crosses_loops = false;
+
+      /* Count the number of instructions on the path: as these instructions
+	 will have to be duplicated, we will not record the path if there are
+	 too many instructions on the path.  Also check that all the blocks in
+	 the path belong to a single loop.  */
+      for (j = 1; j < path_length - 1; j++)
+	{
+	  basic_block bb = (*path)[j];
+
+	  if (bb->loop_father != loop)
+	    {
+	      path_crosses_loops = true;
+	      break;
+	    }
+
+	  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+	    {
+	      gimple stmt = gsi_stmt (gsi);
+	      /* Do not count empty statements and labels.  */
+	      if (gimple_code (stmt) != GIMPLE_NOP
+		  && gimple_code (stmt) != GIMPLE_LABEL
+		  && !is_gimple_debug (stmt))
+		++n_insns;
+	    }
+	}
+
+      if (path_crosses_loops)
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "FSM jump-thread path not considered: "
+		     "the path crosses loops.\n");
+	  path->pop ();
+	  continue;
+	}
+
+      if (n_insns >= PARAM_VALUE (PARAM_MAX_FSM_THREAD_PATH_INSNS))
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "FSM jump-thread path not considered: "
+		     "the number of instructions on the path "
+		     "exceeds PARAM_MAX_FSM_THREAD_PATH_INSNS.\n");
+	  path->pop ();
+	  continue;
+	}
+
+      vec<jump_thread_edge *> *jump_thread_path
+	= new vec<jump_thread_edge *> ();
+
+      /* Record the edges between the blocks in PATH.  */
+      for (j = 0; j < path_length - 1; j++)
+	{
+	  edge e = find_edge ((*path)[path_length - j - 1],
+			      (*path)[path_length - j - 2]);
+	  gcc_assert (e);
+	  jump_thread_edge *x = new jump_thread_edge (e, EDGE_FSM_THREAD);
+	  jump_thread_path->safe_push (x);
+	}
+
+      /* Add the edge taken when the control variable has value ARG.  */
+      edge taken_edge = find_taken_edge ((*path)[0], arg);
+      jump_thread_edge *x
+	= new jump_thread_edge (taken_edge, EDGE_NO_COPY_SRC_BLOCK);
+      jump_thread_path->safe_push (x);
+
+      register_jump_thread (jump_thread_path);
+      --max_threaded_paths;
+
+      /* Remove BBI from the path.  */
+      path->pop ();
+    }
+
+  /* Remove all the nodes that we added from NEXT_PATH.  */
+  if (next_path_length)
+    vec_safe_truncate (path, (path->length () - next_path_length));
+}
+
 /* We are exiting E->src, see if E->dest ends with a conditional
    jump which has a known value when reached via E.
 
@@ -1033,7 +1285,10 @@ thread_through_normal_block (edge e,
       cond = simplify_control_stmt_condition (e, stmt, dummy_cond, simplify,
 					      handle_dominating_asserts);
 
-      if (cond && is_gimple_min_invariant (cond))
+      if (!cond)
+	return 0;
+
+      if (is_gimple_min_invariant (cond))
 	{
 	  edge taken_edge = find_taken_edge (e->dest, cond);
 	  basic_block dest = (taken_edge ? taken_edge->dest : NULL);
@@ -1079,6 +1334,27 @@ thread_through_normal_block (edge e,
 				      backedge_seen_p);
 	  return 1;
 	}
+
+      if (!flag_expensive_optimizations
+	  || optimize_function_for_size_p (cfun)
+	  || TREE_CODE (cond) != SSA_NAME
+	  || e->dest->loop_father != e->src->loop_father
+	  || loop_depth (e->dest->loop_father) == 0)
+	return 0;
+
+      /* When COND cannot be simplified, try to find paths from a control
+	 statement back through the PHI nodes which would affect that control
+	 statement.  */
+      vec<basic_block, va_gc> *bb_path;
+      vec_alloc (bb_path, n_basic_blocks_for_fn (cfun));
+      vec_safe_push (bb_path, e->dest);
+      hash_set<gimple> *visited_phis = new hash_set<gimple>;
+
+      max_threaded_paths = PARAM_VALUE (PARAM_MAX_FSM_THREAD_PATHS);
+      fsm_find_control_statement_thread_paths (cond, visited_phis, bb_path);
+
+      delete visited_phis;
+      vec_free (bb_path);
     }
   return 0;
 }
diff --git a/gcc/tree-ssa-threadupdate.c b/gcc/tree-ssa-threadupdate.c
index a8243ae..12f83ba 100644
--- a/gcc/tree-ssa-threadupdate.c
+++ b/gcc/tree-ssa-threadupdate.c
@@ -167,8 +167,9 @@ dump_jump_thread_path (FILE *dump_file, vec<jump_thread_edge *> path,
 		       bool registering)
 {
   fprintf (dump_file,
-	   "  %s jump thread: (%d, %d) incoming edge; ",
+	   "  %s%s jump thread: (%d, %d) incoming edge; ",
 	   (registering ? "Registering" : "Cancelling"),
+	   (path[0]->type == EDGE_FSM_THREAD ? " FSM": ""),
 	   path[0]->e->src->index, path[0]->e->dest->index);
 
   for (unsigned int i = 1; i < path.length (); i++)
@@ -2317,6 +2318,155 @@ bb_ends_with_multiway_branch (basic_block bb ATTRIBUTE_UNUSED)
   return false;
 }
 
+/* Verify that the REGION is a Single Entry Multiple Exits region: make sure no
+   edge other than ENTRY is entering the REGION.  */
+
+DEBUG_FUNCTION void
+verify_seme (edge entry, basic_block *region, unsigned n_region)
+{
+  bitmap bbs = BITMAP_ALLOC (NULL);
+
+  for (unsigned i = 0; i < n_region; i++)
+    bitmap_set_bit (bbs, region[i]->index);
+
+  for (unsigned i = 0; i < n_region; i++)
+    {
+      edge e;
+      edge_iterator ei;
+      basic_block bb = region[i];
+
+      /* All predecessors other than ENTRY->src should be in the region.  */
+      for (ei = ei_start (bb->preds); (e = ei_safe_edge (ei)); ei_next (&ei))
+	if (e != entry)
+	  gcc_assert (bitmap_bit_p (bbs, e->src->index));
+    }
+
+  BITMAP_FREE (bbs);
+}
+
+/* Duplicates a Single Entry Multiple Exit REGION (set of N_REGION basic
+   blocks).  The ENTRY edge is redirected to the duplicate of the region.  If
+   REGION is not a Single Entry region, ignore any incoming edges other than
+   ENTRY: this makes the copied region a Single Entry region.
+
+   Remove the last conditional statement in the last basic block in the REGION,
+   and create a single fallthru edge pointing to the same destination as the
+   EXIT edge.
+
+   The new basic blocks are stored to REGION_COPY in the same order as they had
+   in REGION, provided that REGION_COPY is not NULL.
+
+   Returns false if it is unable to copy the region, true otherwise.  */
+
+static bool
+duplicate_seme_region (edge entry, edge exit,
+		       basic_block *region, unsigned n_region,
+		       basic_block *region_copy)
+{
+  unsigned i;
+  bool free_region_copy = false, copying_header = false;
+  struct loop *loop = entry->dest->loop_father;
+  edge exit_copy;
+  edge redirected;
+  int total_freq = 0, entry_freq = 0;
+  gcov_type total_count = 0, entry_count = 0;
+
+  if (!can_copy_bbs_p (region, n_region))
+    return false;
+
+  /* Some sanity checking.  Note that we do not check for all possible
+     misuses of the functions.  I.e. if you ask to copy something weird,
+     it will work, but the state of structures probably will not be
+     correct.  */
+  for (i = 0; i < n_region; i++)
+    {
+      /* We do not handle subloops, i.e. all the blocks must belong to the
+	 same loop.  */
+      if (region[i]->loop_father != loop)
+	return false;
+    }
+
+  initialize_original_copy_tables ();
+
+  if (copying_header)
+    set_loop_copy (loop, loop_outer (loop));
+  else
+    set_loop_copy (loop, loop);
+
+  if (!region_copy)
+    {
+      region_copy = XNEWVEC (basic_block, n_region);
+      free_region_copy = true;
+    }
+
+  if (entry->dest->count)
+    {
+      total_count = entry->dest->count;
+      entry_count = entry->count;
+      /* Fix up corner cases, to avoid division by zero or creation of negative
+	 frequencies.  */
+      if (entry_count > total_count)
+	entry_count = total_count;
+    }
+  else
+    {
+      total_freq = entry->dest->frequency;
+      entry_freq = EDGE_FREQUENCY (entry);
+      /* Fix up corner cases, to avoid division by zero or creation of negative
+	 frequencies.  */
+      if (total_freq == 0)
+	total_freq = 1;
+      else if (entry_freq > total_freq)
+	entry_freq = total_freq;
+    }
+
+  copy_bbs (region, n_region, region_copy, &exit, 1, &exit_copy, loop,
+	    split_edge_bb_loc (entry), 0);
+  if (total_count)
+    {
+      scale_bbs_frequencies_gcov_type (region, n_region,
+				       total_count - entry_count,
+				       total_count);
+      scale_bbs_frequencies_gcov_type (region_copy, n_region, entry_count,
+				       total_count);
+    }
+  else
+    {
+      scale_bbs_frequencies_int (region, n_region, total_freq - entry_freq,
+				 total_freq);
+      scale_bbs_frequencies_int (region_copy, n_region, entry_freq, total_freq);
+    }
+
+#ifdef ENABLE_CHECKING
+  /* Make sure no edge other than ENTRY is entering the copied region.  */
+  verify_seme (entry, region_copy, n_region);
+#endif
+
+  /* Remove the last branch in the jump thread path.  */
+  remove_ctrl_stmt_and_useless_edges (region_copy[n_region - 1], exit->dest);
+  edge e = make_edge (region_copy[n_region - 1], exit->dest, EDGE_FALLTHRU);
+
+  if (e) {
+    rescan_loop_exit (e, true, false);
+    e->probability = REG_BR_PROB_BASE;
+    e->count = region_copy[n_region - 1]->count;
+  }
+
+  /* Redirect the entry and add the phi node arguments.  */
+  redirected = redirect_edge_and_branch (entry, get_bb_copy (entry->dest));
+  gcc_assert (redirected != NULL);
+  flush_pending_stmts (entry);
+
+  /* Add the other PHI node arguments.  */
+  add_phi_args_after_copy (region_copy, n_region, NULL);
+
+  if (free_region_copy)
+    free (region_copy);
+
+  free_original_copy_tables ();
+  return true;
+}
+
 /* Walk through all blocks and thread incoming edges to the appropriate
    outgoing edge for each edge pair recorded in THREADED_EDGES.
 
@@ -2343,6 +2493,57 @@ thread_through_all_blocks (bool may_peel_loop_headers)
   threaded_blocks = BITMAP_ALLOC (NULL);
   memset (&thread_stats, 0, sizeof (thread_stats));
 
+  /* Jump-thread all FSM threads before other jump-threads.  */
+  for (i = 0; i < paths.length ();)
+    {
+      vec<jump_thread_edge *> *path = paths[i];
+      edge entry = (*path)[0]->e;
+
+      if ((*path)[0]->type != EDGE_FSM_THREAD
+	  /* Do not jump-thread twice from the same block.  */
+	  || bitmap_bit_p (threaded_blocks, entry->src->index)) {
+	i++;
+	continue;
+      }
+
+      unsigned len = path->length ();
+      edge exit = (*path)[len - 1]->e;
+      basic_block *region = XNEWVEC (basic_block, len - 1);
+
+      for (unsigned int j = 0; j < len - 1; j++)
+	region[j] = (*path)[j]->e->dest;
+
+      if (duplicate_seme_region (entry, exit, region, len - 1, NULL))
+	{
+	  /* We do not update dominance info.  */
+	  free_dominance_info (CDI_DOMINATORS);
+	  bitmap_set_bit (threaded_blocks, entry->src->index);
+	  retval = true;
+	}
+
+      delete_jump_thread_path (path);
+      paths.unordered_remove (i);
+    }
+
+  /* Remove from PATHS all the jump-threads starting with an edge already
+     jump-threaded.  */
+  for (i = 0; i < paths.length ();)
+    {
+      vec<jump_thread_edge *> *path = paths[i];
+      edge entry = (*path)[0]->e;
+
+      /* Do not jump-thread twice from the same block.  */
+      if (bitmap_bit_p (threaded_blocks, entry->src->index))
+	{
+	  delete_jump_thread_path (path);
+	  paths.unordered_remove (i);
+	}
+      else
+	i++;
+    }
+
+  bitmap_clear (threaded_blocks);
+
   mark_threaded_blocks (threaded_blocks);
 
   initialize_original_copy_tables ();
diff --git a/gcc/tree-ssa-threadupdate.h b/gcc/tree-ssa-threadupdate.h
index 426aca5..22c5bce 100644
--- a/gcc/tree-ssa-threadupdate.h
+++ b/gcc/tree-ssa-threadupdate.h
@@ -26,6 +26,7 @@ extern bool thread_through_all_blocks (bool);
 enum jump_thread_edge_type
 {
   EDGE_START_JUMP_THREAD,
+  EDGE_FSM_THREAD,
   EDGE_COPY_SRC_BLOCK,
   EDGE_COPY_SRC_JOINER_BLOCK,
   EDGE_NO_COPY_SRC_BLOCK
-- 
1.7.10.4


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-12-06 13:47                                             ` Sebastian Pop
@ 2014-12-06 19:21                                               ` Sebastian Pop
  2014-12-08 21:50                                                 ` Steve Ellcey
  2014-12-08 19:02                                               ` Jeff Law
  1 sibling, 1 reply; 54+ messages in thread
From: Sebastian Pop @ 2014-12-06 19:21 UTC (permalink / raw)
  To: Jeff Law; +Cc: Richard Biener, James Greenhalgh, Steve Ellcey, GCC Patches

Sebastian Pop wrote:
> Jeff Law wrote:
> > OK to commit.  Thanks for your patience.
> > 
> > Can you follow-up with a change which throttles this optimization
> > when -Os is in effect.  You can check optimize_function_for_size_p
> > (cfun) and simply avoid the backward traversal or you could allow it
> > in that case if the amount of copying is suitably small.  Your call.
> 
> I think it does not make sense to duplicate paths at -Os: I disabled the FSM
> jump-threading when optimizing for size like this.
> 
> diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c
> index 29b20c8..ce70311 100644
> --- a/gcc/tree-ssa-threadedge.c
> +++ b/gcc/tree-ssa-threadedge.c
> @@ -1335,8 +1335,9 @@ thread_through_normal_block (edge e,
>           return 1;
>         }
>  
>        if (!flag_expensive_optimizations
> +         || optimize_function_for_size_p (cfun)
>           || TREE_CODE (cond) != SSA_NAME
>           || e->dest->loop_father != e->src->loop_father
>           || loop_depth (e->dest->loop_father) == 0)
>         return 0;
> 
> I will regstrap and commit the attached patch.

Bootstrapped and regression tested on x86_64-linux.
Committed r218451.

Sebastian


* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-12-06 13:47                                             ` Sebastian Pop
  2014-12-06 19:21                                               ` Sebastian Pop
@ 2014-12-08 19:02                                               ` Jeff Law
  1 sibling, 0 replies; 54+ messages in thread
From: Jeff Law @ 2014-12-08 19:02 UTC (permalink / raw)
  To: Sebastian Pop; +Cc: Richard Biener, James Greenhalgh, Steve Ellcey, GCC Patches

On 12/06/14 06:47, Sebastian Pop wrote:
> Jeff Law wrote:
>> OK to commit.  Thanks for your patience.
>>
>> Can you follow-up with a change which throttles this optimization
>> when -Os is in effect.  You can check optimize_function_for_size_p
>> (cfun) and simply avoid the backward traversal or you could allow it
>> in that case if the amount of copying is suitably small.  Your call.
>
> I think it does not make sense to duplicate paths at -Os: I disabled the FSM
> jump-threading when optimizing for size like this.
>
> diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c
> index 29b20c8..ce70311 100644
> --- a/gcc/tree-ssa-threadedge.c
> +++ b/gcc/tree-ssa-threadedge.c
> @@ -1335,8 +1335,9 @@ thread_through_normal_block (edge e,
>            return 1;
>          }
>
>         if (!flag_expensive_optimizations
> +         || optimize_function_for_size_p (cfun)
>            || TREE_CODE (cond) != SSA_NAME
>            || e->dest->loop_father != e->src->loop_father
>            || loop_depth (e->dest->loop_father) == 0)
>          return 0;
>
> I will regstrap and commit the attached patch.
Looks good to me.  Thanks for taking care of it.

Jeff


* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-12-06 19:21                                               ` Sebastian Pop
@ 2014-12-08 21:50                                                 ` Steve Ellcey
  2014-12-09 13:14                                                   ` Richard Biener
  0 siblings, 1 reply; 54+ messages in thread
From: Steve Ellcey @ 2014-12-08 21:50 UTC (permalink / raw)
  To: Sebastian Pop; +Cc: Jeff Law, Richard Biener, James Greenhalgh, GCC Patches

On Sat, 2014-12-06 at 19:21 +0000, Sebastian Pop wrote:

> > I think it does not make sense to duplicate paths at -Os: I disabled the FSM
> > jump-threading when optimizing for size like this.
> > 
> > diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c
> > index 29b20c8..ce70311 100644
> > --- a/gcc/tree-ssa-threadedge.c
> > +++ b/gcc/tree-ssa-threadedge.c
> > @@ -1335,8 +1335,9 @@ thread_through_normal_block (edge e,
> >           return 1;
> >         }
> >  
> >        if (!flag_expensive_optimizations
> > +         || optimize_function_for_size_p (cfun)
> >           || TREE_CODE (cond) != SSA_NAME
> >           || e->dest->loop_father != e->src->loop_father
> >           || loop_depth (e->dest->loop_father) == 0)
> >         return 0;
> > 
> > I will regstrap and commit the attached patch.
> 
> Bootstrapped and regression tested on x86_64-linux.
> Committed r218451.
> 
> Sebastian

Thanks for getting all this checked in, Sebastian.  I have tested it on
coremark and I am getting the speed-up that I expect.  But I am a little
confused about turning off jump threading.  I am getting the
optimization on coremark with -O3 and that is great and if I use '-O3
-fno-expensive-optimizations' then I don't see this part of the jump
threading, also great.  But I was surprised that if I just used '-O3
-fno-thread-jumps' then I still see this optimization.  Is that
expected?  Should this test also check flag_thread_jumps?  Or should
that be getting checked somewhere else?

Steve Ellcey



* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-12-08 21:50                                                 ` Steve Ellcey
@ 2014-12-09 13:14                                                   ` Richard Biener
  2014-12-09 17:38                                                     ` Sebastian Pop
  0 siblings, 1 reply; 54+ messages in thread
From: Richard Biener @ 2014-12-09 13:14 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Sebastian Pop, Jeff Law, James Greenhalgh, GCC Patches

On Mon, Dec 8, 2014 at 10:49 PM, Steve Ellcey <sellcey@mips.com> wrote:
> On Sat, 2014-12-06 at 19:21 +0000, Sebastian Pop wrote:
>
>> > I think it does not make sense to duplicate paths at -Os: I disabled the FSM
>> > jump-threading when optimizing for size like this.
>> >
>> > diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c
>> > index 29b20c8..ce70311 100644
>> > --- a/gcc/tree-ssa-threadedge.c
>> > +++ b/gcc/tree-ssa-threadedge.c
>> > @@ -1335,8 +1335,9 @@ thread_through_normal_block (edge e,
>> >           return 1;
>> >         }
>> >
>> >        if (!flag_expensive_optimizations
>> > +         || optimize_function_for_size_p (cfun)
>> >           || TREE_CODE (cond) != SSA_NAME
>> >           || e->dest->loop_father != e->src->loop_father
>> >           || loop_depth (e->dest->loop_father) == 0)
>> >         return 0;
>> >
>> > I will regstrap and commit the attached patch.
>>
>> Bootstrapped and regression tested on x86_64-linux.
>> Committed r218451.
>>
>> Sebastian
>
> Thanks for getting all this checked in Sebastian, I have tested it on
> coremark and I am getting the speed up that I expect.  But I am a little
> confused about turning off jump threading.  I am getting the
> optimization on coremark with -O3 and that is great and if I use '-O3
> -fno-expensive-optimizations' then I don't see this part of the jump
> threading, also great.  But I was surprised that if I just used '-O3
> -fno-thread-jumps' then I still see this optimization.  Is that
> expected?  Should this test also check flag_thread_jumps?  Or should
> that be getting checked somewhere else?

-fthread-jumps is an RTL optimization flag and ignored on GIMPLE.

Richard.

> Steve Ellcey
>
>


* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-12-09 13:14                                                   ` Richard Biener
@ 2014-12-09 17:38                                                     ` Sebastian Pop
  2014-12-09 18:39                                                       ` Jeff Law
  0 siblings, 1 reply; 54+ messages in thread
From: Sebastian Pop @ 2014-12-09 17:38 UTC (permalink / raw)
  To: Richard Biener; +Cc: Steve Ellcey, Jeff Law, James Greenhalgh, GCC Patches

Richard Biener wrote:
> On Mon, Dec 8, 2014 at 10:49 PM, Steve Ellcey <sellcey@mips.com> wrote:
> > expected?  Should this test also check flag_thread_jumps?  Or should
> > that be getting checked somewhere else?
> 
> -fthread-jumps is an RTL optimization flag and ignored on GIMPLE.

Does it make sense to add a -f[no-]tree-thread-jumps to enable/disable the tree
jump threading?  I could also add -f[no-]tree-fsm-thread-jumps.  Opinions?

On the llvm test-suite, I have seen one ICE with my fsm jump-thread patch.
This patch fixes the problem:

diff --git a/gcc/tree-ssa-threadupdate.c b/gcc/tree-ssa-threadupdate.c
index 12f83ba..f8c736e 100644
--- a/gcc/tree-ssa-threadupdate.c
+++ b/gcc/tree-ssa-threadupdate.c
@@ -2564,6 +2564,7 @@ thread_through_all_blocks (bool may_peel_loop_headers)
   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
     {
       if (!loop->header
+        || !loop_latch_edge (loop)
           || !bitmap_bit_p (threaded_blocks, loop->header->index))
           continue;
 
      retval |= thread_through_loop_header (loop, may_peel_loop_headers);

Ok to commit after regstrap?

Thanks,
Sebastian


* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-12-09 17:38                                                     ` Sebastian Pop
@ 2014-12-09 18:39                                                       ` Jeff Law
  2014-12-09 19:43                                                         ` Richard Biener
  2014-12-09 20:43                                                         ` Mike Stump
  0 siblings, 2 replies; 54+ messages in thread
From: Jeff Law @ 2014-12-09 18:39 UTC (permalink / raw)
  To: Sebastian Pop, Richard Biener; +Cc: Steve Ellcey, James Greenhalgh, GCC Patches

On 12/09/14 10:38, Sebastian Pop wrote:
> Richard Biener wrote:
>> On Mon, Dec 8, 2014 at 10:49 PM, Steve Ellcey <sellcey@mips.com> wrote:
>>> expected?  Should this test also check flag_thread_jumps?  Or should
>>> that be getting checked somewhere else?
>>
>> -fthread-jumps is an RTL optimization flag and ignored on GIMPLE.
>
> Does it make sense to add a -f[no-]tree-thread-jumps to enable/disable the tree
> jump threading?  I could also add -f[no-]tree-fsm-thread-jumps.  Opinions?
Our option handling is a bit of a mess if we look at it from the user 
standpoint.  Given that the vast majority of jump threading happens on 
gimple, ISTM that -f[no-]thread-jumps ought to be controlling the gimple 
implementation.  One could easily argue that the user doesn't really 
care about where in the pipeline the optimization is implemented.

My vote would be to just make -fthread-jumps control both RTL and gimple 
jump threading.


>
> On the llvm test-suite, I have seen one ICE with my fsm jump-thread patch.
> This patch fixes the problem:
>
> diff --git a/gcc/tree-ssa-threadupdate.c b/gcc/tree-ssa-threadupdate.c
> index 12f83ba..f8c736e 100644
> --- a/gcc/tree-ssa-threadupdate.c
> +++ b/gcc/tree-ssa-threadupdate.c
> @@ -2564,6 +2564,7 @@ thread_through_all_blocks (bool may_peel_loop_headers)
>     FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
>       {
>         if (!loop->header
> +        || !loop_latch_edge (loop)
>             || !bitmap_bit_p (threaded_blocks, loop->header->index))
>             continue;
>
>        retval |= thread_through_loop_header (loop, may_peel_loop_headers);
>
> Ok to commit after regstrap?
This seems to be indicating that we have a loop with no edge from the
latch block to the header block.  I'd like to understand better how we
got into that state.

Also, a test for the GCC testsuite would be good.  I have no idea what 
license covers the LLVM testsuite.  But given a good analysis of the 
problem we may be able to write a suitable test independent of the LLVM 
test.

jeff


* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-12-09 18:39                                                       ` Jeff Law
@ 2014-12-09 19:43                                                         ` Richard Biener
  2014-12-09 19:48                                                           ` Jeff Law
  2014-12-15 21:28                                                           ` Sebastian Pop
  2014-12-09 20:43                                                         ` Mike Stump
  1 sibling, 2 replies; 54+ messages in thread
From: Richard Biener @ 2014-12-09 19:43 UTC (permalink / raw)
  To: Jeff Law, Sebastian Pop; +Cc: Steve Ellcey, James Greenhalgh, GCC Patches

On December 9, 2014 7:39:48 PM CET, Jeff Law <law@redhat.com> wrote:
>On 12/09/14 10:38, Sebastian Pop wrote:
>> Richard Biener wrote:
>>> On Mon, Dec 8, 2014 at 10:49 PM, Steve Ellcey <sellcey@mips.com> wrote:
>>>> expected?  Should this test also check flag_thread_jumps?  Or should
>>>> that be getting checked somewhere else?
>>>
>>> -fthread-jumps is an RTL optimization flag and ignored on GIMPLE.
>>
>> Does it make sense to add a -f[no-]tree-thread-jumps to enable/disable
>> the tree jump threading?  I could also add -f[no-]tree-fsm-thread-jumps.
>> Opinions?
>Our option handling is a bit of a mess if we look at it from the user
>standpoint.  Given that the vast majority of jump threading happens on
>gimple, ISTM that -f[no-]thread-jumps ought to be controlling the gimple
>implementation.  One could easily argue that the user doesn't really
>care about where in the pipeline the optimization is implemented.
>
>My vote would be to just make -fthread-jumps control both RTL and gimple
>jump threading.

Works for me.

>
>>
>> On the llvm test-suite, I have seen one ICE with my fsm jump-thread patch.
>> This patch fixes the problem:
>>
>> diff --git a/gcc/tree-ssa-threadupdate.c b/gcc/tree-ssa-threadupdate.c
>> index 12f83ba..f8c736e 100644
>> --- a/gcc/tree-ssa-threadupdate.c
>> +++ b/gcc/tree-ssa-threadupdate.c
>> @@ -2564,6 +2564,7 @@ thread_through_all_blocks (bool may_peel_loop_headers)
>>     FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
>>       {
>>         if (!loop->header
>> +        || !loop_latch_edge (loop)
>>             || !bitmap_bit_p (threaded_blocks, loop->header->index))
>>             continue;
>>
>>        retval |= thread_through_loop_header (loop, may_peel_loop_headers);
>>
>> Ok to commit after regstrap?
>This seems to be indicating that we have a loop with no edge from the
>latch block to the header block.  I'd like to know better how we got
>into that state.

It also returns NULL for loops with multiple latches, so the patch looks OK to me.

Thanks,
Richard.

>Also, a test for the GCC testsuite would be good.  I have no idea what
>license covers the LLVM testsuite.  But given a good analysis of the
>problem we may be able to write a suitable test independent of the LLVM
>test.
>
>jeff


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-12-09 19:43                                                         ` Richard Biener
@ 2014-12-09 19:48                                                           ` Jeff Law
  2014-12-15 21:28                                                           ` Sebastian Pop
  1 sibling, 0 replies; 54+ messages in thread
From: Jeff Law @ 2014-12-09 19:48 UTC (permalink / raw)
  To: Richard Biener, Sebastian Pop; +Cc: Steve Ellcey, James Greenhalgh, GCC Patches

On 12/09/14 12:43, Richard Biener wrote:
>> This seems to be indicating that we have a loop with no edge from the
>> latch block to the header block.  I'd like to know better how we got
>> into that state.
>
> It also returns NULL for loops with multiple latches, so the patch looks OK to me.
Ah, OK.

Jeff

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-12-09 18:39                                                       ` Jeff Law
  2014-12-09 19:43                                                         ` Richard Biener
@ 2014-12-09 20:43                                                         ` Mike Stump
  1 sibling, 0 replies; 54+ messages in thread
From: Mike Stump @ 2014-12-09 20:43 UTC (permalink / raw)
  To: Jeff Law
  Cc: Sebastian Pop, Richard Biener, Steve Ellcey, James Greenhalgh,
	GCC Patches

On Dec 9, 2014, at 10:39 AM, Jeff Law <law@redhat.com> wrote:
> Also, a test for the GCC testsuite would be good.  I have no idea what license covers the LLVM testsuite.  But given a good analysis of the problem we may be able to write a suitable test independent of the LLVM test.

So, the usual engineering rules should work just fine on it.  Delta reduce and submit.

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [Patch] Improving jump-thread pass for PR 54742
  2014-12-09 19:43                                                         ` Richard Biener
  2014-12-09 19:48                                                           ` Jeff Law
@ 2014-12-15 21:28                                                           ` Sebastian Pop
  1 sibling, 0 replies; 54+ messages in thread
From: Sebastian Pop @ 2014-12-15 21:28 UTC (permalink / raw)
  To: Richard Biener; +Cc: Jeff Law, Steve Ellcey, James Greenhalgh, GCC Patches

Richard Biener wrote:
> 
> >
> >>
> >> On the llvm test-suite, I have seen one ICE with my fsm jump-thread patch.
> >> This patch fixes the problem:
> >>
> >> diff --git a/gcc/tree-ssa-threadupdate.c b/gcc/tree-ssa-threadupdate.c
> >> index 12f83ba..f8c736e 100644
> >> --- a/gcc/tree-ssa-threadupdate.c
> >> +++ b/gcc/tree-ssa-threadupdate.c
> >> @@ -2564,6 +2564,7 @@ thread_through_all_blocks (bool may_peel_loop_headers)
> >>     FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
> >>       {
> >>         if (!loop->header
> >> +        || !loop_latch_edge (loop)
> >>             || !bitmap_bit_p (threaded_blocks, loop->header->index))
> >>             continue;
> >>
> >>        retval |= thread_through_loop_header (loop, may_peel_loop_headers);
> >>
> >> Ok to commit after regstrap?
> >This seems to be indicating that we have a loop with no edge from the
> >latch block to the header block.  I'd like to know better how we got
> >into that state.
> 
> It also returns NULL for loops with multiple latches, so the patch looks OK to me.

The bug I was seeing has been fixed by the patch for:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64284

Thanks,
Sebastian

^ permalink raw reply	[flat|nested] 54+ messages in thread

end of thread, other threads:[~2014-12-15 21:12 UTC | newest]

Thread overview: 54+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-08-19 20:40 [Patch] Switch elimination pass for PR 54742 Steve Ellcey
2014-08-20 17:04 ` James Greenhalgh
2014-08-20 20:29   ` Sebastian Pop
2014-08-21  8:53     ` Richard Biener
2014-08-22 20:13       ` Sebastian Pop
2014-08-21  8:58 ` Richard Biener
2014-08-21  9:41   ` James Greenhalgh
2014-08-21 10:30     ` Richard Biener
2014-08-25 17:35       ` Jeff Law
2014-09-26 20:14         ` Sebastian Pop
2014-10-26 21:34           ` [Patch] Improving jump-thread " Sebastian Pop
2014-11-11  1:40             ` Sebastian Pop
2014-11-17  9:29               ` James Greenhalgh
2014-11-18 19:36                 ` Steve Ellcey
2014-11-18 20:04                   ` Jeff Law
2014-11-17 12:47               ` Richard Biener
2014-11-18 22:29                 ` Sebastian Pop
2014-11-22 23:41                   ` Jeff Law
2014-11-24  0:06                     ` Sebastian Pop
2014-11-24 21:33                       ` Jeff Law
2014-11-24 22:28                         ` Sebastian Pop
2014-11-24 23:02                           ` Sebastian Pop
2014-11-24 23:18                           ` Jeff Law
2014-11-25  9:44                           ` Richard Biener
2014-11-25 11:03                           ` Markus Trippelsdorf
2014-11-24 23:25                       ` Jeff Law
2014-11-25  0:23                         ` Sebastian Pop
2014-11-25  3:11                           ` Sebastian Pop
2014-11-25  7:51                             ` Jeff Law
2014-11-25 16:41                               ` Jeff Law
2014-11-25 18:35                                 ` Sebastian Pop
2014-11-25 21:54                                   ` Sebastian Pop
2014-12-01 21:06                                     ` Jeff Law
2014-12-02 10:15                                       ` Richard Biener
2014-12-02 20:17                                         ` Jeff Law
2014-12-04  8:38                                       ` Sebastian Pop
2014-12-04  9:14                                         ` Sebastian Pop
2014-12-04 11:00                                           ` Sebastian Pop
2014-12-05 20:08                                           ` Jeff Law
2014-12-04 11:04                                       ` Sebastian Pop
2014-12-04 14:30                                         ` Sebastian Pop
2014-12-05 20:12                                           ` Jeff Law
2014-12-06 13:47                                             ` Sebastian Pop
2014-12-06 19:21                                               ` Sebastian Pop
2014-12-08 21:50                                                 ` Steve Ellcey
2014-12-09 13:14                                                   ` Richard Biener
2014-12-09 17:38                                                     ` Sebastian Pop
2014-12-09 18:39                                                       ` Jeff Law
2014-12-09 19:43                                                         ` Richard Biener
2014-12-09 19:48                                                           ` Jeff Law
2014-12-15 21:28                                                           ` Sebastian Pop
2014-12-09 20:43                                                         ` Mike Stump
2014-12-08 19:02                                               ` Jeff Law
2014-11-19 22:35             ` Jeff Law

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).