public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] Enhance ifcombine to recover non short circuit branches
@ 2013-10-16  9:37 Zhenqiang Chen
  2013-10-16 12:22 ` Marc Glisse
                   ` (2 more replies)
  0 siblings, 3 replies; 31+ messages in thread
From: Zhenqiang Chen @ 2013-10-16  9:37 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1852 bytes --]

Hi,

The patch enhances ifcombine pass to recover some non short circuit
branches. Basically, it will do the following transformation:

Case 1:
  if (cond1)
    if (cond2)
==>
  if (cond1 && cond2)

Case 2:
  if (cond1)
    goto L1
  if (cond2)
    goto L1
==>
  if (cond1 || cond2)
    goto L1

Case 3:
  if (cond1)
    goto L1
  else
    goto L2
L1:
  if (cond2)
    goto L2
==>
  if (invert (cond1) || cond2)
    goto L2

Case 4:
  if (cond1)
    goto L1
  if (cond2)
    goto L2
L1:
==>
  if (invert (cond1) && cond2)
    goto L2

Bootstrap on X86-64 and ARM.

Two new FAILs in regression tests:
gcc.dg/uninit-pred-8_b.c
gcc.dg/uninit-pred-9_b.c
uninit pass should be enhanced to handle more complex conditions. Will
submit a bug to track it and fix it later.

Is it OK for trunk?

Thanks!
-Zhenqiang

ChangeLog:
2013-10-16  Zhenqiang Chen  <zhenqiang.chen@linaro.org>

        * fold-const.c (simple_operand_p_2): Make it global.
        * tree.h (simple_operand_p_2): Declare it.
        * tree-ssa-ifcombine.c: Include rtl.h and tm_p.h.
        (bb_has_overhead_p, generate_condition_node,
        ifcombine_ccmp): New functions.
        (ifcombine_fold_ifandif): New function, extracted from
        ifcombine_ifandif.
        (ifcombine_ifandif): Call ifcombine_ccmp.
        (tree_ssa_ifcombine_bb): Skip optimized bb.

testsuite/ChangeLog
2013-10-16  Zhenqiang Chen  <zhenqiang.chen@linaro.org>

        * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-1.c: New test case.
        * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-2.c: New test case.
        * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-3.c: New test case.
        * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-4.c: New test case.
        * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-5.c: New test case.
        * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-6.c: New test case.
        * gcc.dg/tree-ssa/ssa-dom-thread-3.c: Updated.

[-- Attachment #2: ifcombine.patch --]
[-- Type: application/octet-stream, Size: 17391 bytes --]

diff --git a/gcc/fold-const.c b/gcc/fold-const.c
index c4c09b6..a21d229 100644
--- a/gcc/fold-const.c
+++ b/gcc/fold-const.c
@@ -110,7 +110,6 @@ static tree decode_field_reference (location_t, tree, HOST_WIDE_INT *,
 static int all_ones_mask_p (const_tree, int);
 static tree sign_bit_p (tree, const_tree);
 static int simple_operand_p (const_tree);
-static bool simple_operand_p_2 (tree);
 static tree range_binop (enum tree_code, tree, tree, int, tree, int);
 static tree range_predecessor (tree);
 static tree range_successor (tree);
@@ -3818,7 +3817,7 @@ simple_operand_p (const_tree exp)
    I addition to simple_operand_p, we assume that comparisons, conversions,
    and logic-not operations are simple, if their operands are simple, too.  */
 
-static bool
+bool
 simple_operand_p_2 (tree exp)
 {
   enum tree_code code;
diff --git a/gcc/tree.h b/gcc/tree.h
index d40d416..bb6fa55 100644
--- a/gcc/tree.h
+++ b/gcc/tree.h
@@ -4835,6 +4835,9 @@ extern bool is_tm_ending_fndecl (tree);
 extern void record_tm_replacement (tree, tree);
 extern void tm_malloc_replacement (tree);
 
+/* In fold-const.c.  */
+extern bool simple_operand_p_2 (tree);
+
 static inline bool
 is_tm_safe_or_pure (const_tree x)
 {
diff --git a/gcc/tree-ssa-ifcombine.c b/gcc/tree-ssa-ifcombine.c
index 268275e..5c4e1db 100644
--- a/gcc/tree-ssa-ifcombine.c
+++ b/gcc/tree-ssa-ifcombine.c
@@ -22,6 +22,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "system.h"
 #include "coretypes.h"
 #include "tm.h"
+#include "rtl.h"
+#include "tm_p.h"
 #include "tree.h"
 #include "basic-block.h"
 #include "tree-pretty-print.h"
@@ -294,6 +296,234 @@ recognize_bits_test (gimple cond, tree *name, tree *bits, bool inv)
   return true;
 }
 
+#ifndef LOGICAL_OP_NON_SHORT_CIRCUIT
+#define LOGICAL_OP_NON_SHORT_CIRCUIT \
+  (BRANCH_COST (optimize_function_for_speed_p (cfun), \
+		false) >= 2)
+#endif
+
+/* Verify if the basic block BB have overhead (more than one instruction).
+   Return true in this case, else false.  */
+
+static bool
+bb_has_overhead_p (basic_block bb, gimple inner_cond)
+{
+  gimple_stmt_iterator gsi;
+
+  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gimple stmt = gsi_stmt (gsi);
+      /* Skip label and debug.  */
+      if (gimple_code (stmt) == GIMPLE_LABEL
+	  || gimple_code (stmt) == GIMPLE_DEBUG)
+	continue;
+      /* BB has only one statement INNER_COND.
+	 TBD: If conditional compare is supported, it can have be a complex
+	 condition.  */
+      if (stmt == inner_cond)
+	return false;
+      else
+	return true;
+    }
+
+  return true;
+}
+
+/* Create a tree node with CODE to reprent the condition of COND.
+   Note: CODE is not the same as gimple_cond_code (cond).  */
+
+static tree
+generate_condition_node (gimple_stmt_iterator *gsi, gimple cond,
+		       enum tree_code code)
+{
+  tree t;
+  gimple tem;
+
+  if (code == NE_EXPR
+      && integer_zerop(gimple_cond_rhs (cond))
+      && TREE_CODE (TREE_TYPE (gimple_cond_lhs (cond))) == BOOLEAN_TYPE)
+    {
+      t = gimple_cond_lhs (cond);
+    }
+  else
+    {
+      t = make_ssa_name (boolean_type_node, NULL);
+      tem = gimple_build_assign_with_ops (code, t,
+					  gimple_cond_lhs (cond),
+					  gimple_cond_rhs (cond));
+      if (!tem)
+	return NULL_TREE;
+      gimple_set_location (tem, gimple_location (cond));
+      gsi_insert_before (gsi, tem, GSI_SAME_STMT);
+    }
+  return t;
+}
+
+/* Recover a non-short-circuit branch.  The inner if is specified by its
+   INNER_COND_BB, the outer by OUTER_COND_BB.
+   INNER_INV and OUT_INV indicate whether the conditions are inverted.
+   It tries to generate candidates for conditional compare.
+   Returns true if the edges to the common else basic-block were merged.
+   case 1:
+     if (cond1)
+       if (cond2)
+   to
+     if (cond1 && cond2)
+
+   case 2:
+     if (cond1)
+       goto L1
+     if (cond2)
+       goto L1
+   to
+     if (cond1 || cond2)
+       goto L1
+
+   case 3:
+     if (cond1)
+       goto L1
+     else
+       goto L2
+     L1:
+     if (cond2)
+       goto L2
+   to
+     if (invert (cond1) || cond2)
+       goto L2
+
+   case 4:
+     if (cond1)
+       goto L1
+     if (cond2)
+       goto L2
+     L1:
+   to
+     if (invert (cond1) && cond2)
+       goto L2
+*/
+
+static bool
+ifcombine_ccmp (gimple inner_cond, bool inner_inv,
+		gimple outer_cond, bool outer_inv)
+{
+  enum tree_code inner_cond_code = gimple_cond_code (inner_cond);
+  enum tree_code outer_cond_code = gimple_cond_code (outer_cond);
+  tree t0, t1, t2, t3;
+  gimple tem;
+  gimple_stmt_iterator gsi;
+
+  /* For sequence point consistancy, we need to check for trapping,
+     and side-effects.  The check is the same as it in
+     function fold_truth_andor of fold-const.c.  */
+  if (!simple_operand_p_2(gimple_cond_lhs (outer_cond))
+      || !simple_operand_p_2(gimple_cond_rhs (outer_cond))
+      || !simple_operand_p_2(gimple_cond_lhs (inner_cond))
+      || !simple_operand_p_2(gimple_cond_rhs (inner_cond)))
+    return false;
+
+  if (outer_inv != inner_inv)
+    {
+      enum machine_mode mode;
+      mode = TYPE_MODE (TREE_TYPE (gimple_cond_lhs (outer_cond)));
+      outer_cond_code = invert_tree_comparison (outer_cond_code,
+						HONOR_NANS (mode));
+    }
+  if (outer_cond_code == ERROR_MARK)
+    return false;
+
+  gsi = gsi_for_stmt (inner_cond);
+  t0 = generate_condition_node (&gsi, outer_cond, outer_cond_code);
+  if (t0 == NULL_TREE)
+    return false;
+  t1 = generate_condition_node (&gsi, inner_cond, inner_cond_code);
+  if (t1 == NULL_TREE)
+    return false;
+
+  t2 = make_ssa_name (boolean_type_node, NULL);
+  tem = gimple_build_assign_with_ops (inner_inv ? BIT_IOR_EXPR : BIT_AND_EXPR,
+				       t2, t0, t1);
+  gsi_insert_before (&gsi, tem, GSI_SAME_STMT);
+
+  t3 = fold_build2 (NE_EXPR, TREE_TYPE (t2), t2, integer_zero_node);
+  t3 = canonicalize_cond_expr_cond (t3);
+  if (!t3)
+    return false;
+
+  gimple_cond_set_condition_from_tree (inner_cond, t3);
+  update_stmt (inner_cond);
+
+  /* Leave CFG optimization to cfg_cleanup.  */
+  gimple_cond_set_condition_from_tree (outer_cond,
+				       outer_inv ? boolean_false_node
+						   : boolean_true_node);
+  update_stmt (outer_cond);
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "Recovering non short circuit.\n");
+    }
+
+  return true;
+}
+
+/* If-convert on a and pattern with a common else block.  The inner
+   if is specified by its INNER_COND_BB, the outer by OUTER_COND_BB.
+   INNER_INV, OUT_INV and RESULT_INV indicate whether the conditions
+   are inverted.  This convert depends on maybe_fold_and_comparisons.
+   Returns true if the edges to the common else basic-block were merged.  */
+
+static bool
+ifcombine_fold_ifandif (gimple inner_cond, bool inner_inv,
+			gimple outer_cond, bool outer_inv, bool result_inv)
+{
+  tree t;
+  enum tree_code inner_cond_code = gimple_cond_code (inner_cond);
+  enum tree_code outer_cond_code = gimple_cond_code (outer_cond);
+
+  /* Invert comparisons if necessary (and possible).  */
+  if (inner_inv)
+    inner_cond_code = invert_tree_comparison (inner_cond_code,
+      HONOR_NANS (TYPE_MODE (TREE_TYPE (gimple_cond_lhs (inner_cond)))));
+  if (inner_cond_code == ERROR_MARK)
+    return false;
+  if (outer_inv)
+    outer_cond_code = invert_tree_comparison (outer_cond_code,
+      HONOR_NANS (TYPE_MODE (TREE_TYPE (gimple_cond_lhs (outer_cond)))));
+  if (outer_cond_code == ERROR_MARK)
+    return false;
+  /* Don't return false so fast, try maybe_fold_or_comparisons?  */
+
+  if (!(t = maybe_fold_and_comparisons (inner_cond_code,
+					gimple_cond_lhs (inner_cond),
+					gimple_cond_rhs (inner_cond),
+					outer_cond_code,
+					gimple_cond_lhs (outer_cond),
+					gimple_cond_rhs (outer_cond))))
+    return false;
+  if (result_inv)
+    t = fold_build1 (TRUTH_NOT_EXPR, TREE_TYPE (t), t);
+  t = canonicalize_cond_expr_cond (t);
+  if (!t)
+    return false;
+  gimple_cond_set_condition_from_tree (inner_cond, t);
+  update_stmt (inner_cond);
+
+  /* Leave CFG optimization to cfg_cleanup.  */
+  gimple_cond_set_condition_from_tree (outer_cond,
+				       outer_inv ? boolean_false_node
+						   : boolean_true_node);
+  update_stmt (outer_cond);
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "optimizing two comparisons to ");
+      print_generic_expr (dump_file, t, 0);
+      fprintf (dump_file, "\n");
+    }
+
+  return true;
+}
+
 /* If-convert on a and pattern with a common else block.  The inner
    if is specified by its INNER_COND_BB, the outer by OUTER_COND_BB.
    inner_inv, outer_inv and result_inv indicate whether the conditions
@@ -457,55 +687,18 @@ ifcombine_ifandif (basic_block inner_cond_bb, bool inner_inv,
       return true;
     }
 
-  /* See if we have two comparisons that we can merge into one.  */
   else if (TREE_CODE_CLASS (gimple_cond_code (inner_cond)) == tcc_comparison
 	   && TREE_CODE_CLASS (gimple_cond_code (outer_cond)) == tcc_comparison)
     {
-      tree t;
-      enum tree_code inner_cond_code = gimple_cond_code (inner_cond);
-      enum tree_code outer_cond_code = gimple_cond_code (outer_cond);
-
-      /* Invert comparisons if necessary (and possible).  */
-      if (inner_inv)
-	inner_cond_code = invert_tree_comparison (inner_cond_code,
-	  HONOR_NANS (TYPE_MODE (TREE_TYPE (gimple_cond_lhs (inner_cond)))));
-      if (inner_cond_code == ERROR_MARK)
-	return false;
-      if (outer_inv)
-	outer_cond_code = invert_tree_comparison (outer_cond_code,
-	  HONOR_NANS (TYPE_MODE (TREE_TYPE (gimple_cond_lhs (outer_cond)))));
-      if (outer_cond_code == ERROR_MARK)
-	return false;
-      /* Don't return false so fast, try maybe_fold_or_comparisons?  */
-
-      if (!(t = maybe_fold_and_comparisons (inner_cond_code,
-					    gimple_cond_lhs (inner_cond),
-					    gimple_cond_rhs (inner_cond),
-					    outer_cond_code,
-					    gimple_cond_lhs (outer_cond),
-					    gimple_cond_rhs (outer_cond))))
-	return false;
-      if (result_inv)
-	t = fold_build1 (TRUTH_NOT_EXPR, TREE_TYPE (t), t);
-      t = canonicalize_cond_expr_cond (t);
-      if (!t)
-	return false;
-      gimple_cond_set_condition_from_tree (inner_cond, t);
-      update_stmt (inner_cond);
-
-      /* Leave CFG optimization to cfg_cleanup.  */
-      gimple_cond_set_condition_from_tree (outer_cond,
-	outer_inv ? boolean_false_node : boolean_true_node);
-      update_stmt (outer_cond);
-
-      if (dump_file)
-	{
-	  fprintf (dump_file, "optimizing two comparisons to ");
-	  print_generic_expr (dump_file, t, 0);
-	  fprintf (dump_file, "\n");
-	}
-
-      return true;
+      /* See if we have two comparisons that we can merge into one.  */
+      if (ifcombine_fold_ifandif (inner_cond, inner_inv,
+				  outer_cond, outer_inv, result_inv))
+	return true;
+
+      /* See if we have two comparisons that we can use NON_SHORT_CIRCUIT.  */
+      if ((LOGICAL_OP_NON_SHORT_CIRCUIT)
+	   && !bb_has_overhead_p (inner_cond_bb, inner_cond))
+	return ifcombine_ccmp (inner_cond, inner_inv, outer_cond, outer_inv);
     }
 
   return false;
@@ -519,10 +712,21 @@ static bool
 tree_ssa_ifcombine_bb (basic_block inner_cond_bb)
 {
   basic_block then_bb = NULL, else_bb = NULL;
+  gimple inner_cond;
 
   if (!recognize_if_then_else (inner_cond_bb, &then_bb, &else_bb))
     return false;
 
+  inner_cond = last_stmt (inner_cond_bb);
+  if (!inner_cond
+      || gimple_code (inner_cond) != GIMPLE_COND)
+    return false;
+
+  /* If innner_cond is already optimized, just return FALSE.  */
+  if (TREE_CODE (gimple_cond_lhs (inner_cond)) == INTEGER_CST
+      && TREE_CODE (gimple_cond_rhs (inner_cond)) == INTEGER_CST)
+    return false;
+
   /* Recognize && and || of two conditions with a common
      then/else block which entry edges we can merge.  That is:
        if (a || b)
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-3.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-3.c
index 0d53f50..6fbce47 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-3.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-3.c
@@ -42,7 +42,7 @@ expand_one_var (tree var, unsigned char toplevel, unsigned char really_expand)
     abort ();
 }
 /* We should thread the jump, through an intermediate block.  */
-/* { dg-final { scan-tree-dump-times "Threaded" 2 "dom1"} } */
-/* { dg-final { scan-tree-dump-times "Registering jump thread: \\(.*\\) incoming edge;  \\(.*\\) joiner;  \\(.*\\) nocopy;" 1 "dom1"} } */
+/* { dg-final { scan-tree-dump "Threaded" "dom1"} } */
+/* { dg-final { scan-tree-dump-times "Registering jump thread: \\(.*\\) incoming edge;" 1 "dom1"} } */
 /* { dg-final { cleanup-tree-dump "dom1" } } */
 
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-1.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-1.c
new file mode 100644
index 0000000..56c936d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-1.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { ! "m68k*-*-* mmix*-*-* mep*-*-* bfin*-*-* v850*-*-* picochip*-*-* moxie*-*-* cris*-*-* m32c*-*-* fr30*-*-* mcore*-*-* powerpc*-*-* xtensa*-*-*"} } } */
+
+/* { dg-options "-O2 -g -fdump-tree-optimized" } */
+/* { dg-additional-options "-mbranch-cost=2" { target avr-*-* } } */
+
+int t (int a, int b)
+{
+  if (a > 0)
+    if (b > 0)
+      return 0;
+  return 1;
+}
+/* { dg-final { scan-tree-dump "\&" "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-2.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-2.c
new file mode 100644
index 0000000..3273bcc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-2.c
@@ -0,0 +1,17 @@
+/* { dg-do compile { target { ! "m68k*-*-* mmix*-*-* mep*-*-* bfin*-*-* v850*-*-* picochip*-*-* moxie*-*-* cris*-*-* m32c*-*-* fr30*-*-* mcore*-*-* powerpc*-*-* xtensa*-*-*"} } } */
+
+/* { dg-options "-O2 -g -fdump-tree-optimized" } */
+/* { dg-additional-options "-mbranch-cost=2" { target avr-*-* } } */
+
+int t (int a, int b)
+{
+  if (a > 0)
+    goto L1;
+  if (b > 0)
+    goto L1;
+  return 0;
+L1:
+  return 1;
+}
+/* { dg-final { scan-tree-dump "\|" "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-3.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-3.c
new file mode 100644
index 0000000..500cb01
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-3.c
@@ -0,0 +1,20 @@
+/* { dg-do compile { target { ! "m68k*-*-* mmix*-*-* mep*-*-* bfin*-*-* v850*-*-* picochip*-*-* moxie*-*-* cris*-*-* m32c*-*-* fr30*-*-* mcore*-*-* powerpc*-*-* xtensa*-*-*"} } } */
+
+/* { dg-options "-O2 -g -fdump-tree-optimized" } */
+/* { dg-additional-options "-mbranch-cost=2" { target avr-*-* } } */
+
+int t (int a, int b)
+{
+  if (a > 0)
+    goto L1;
+  else
+    goto L2;
+L1:
+  if (b > 0)
+    goto L2;
+  return 5;
+L2:
+  return 6;
+}
+/* { dg-final { scan-tree-dump "\|" "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-4.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-4.c
new file mode 100644
index 0000000..8b87101
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-4.c
@@ -0,0 +1,18 @@
+/* { dg-do compile { target { ! "m68k*-*-* mmix*-*-* mep*-*-* bfin*-*-* v850*-*-* picochip*-*-* moxie*-*-* cris*-*-* m32c*-*-* fr30*-*-* mcore*-*-* powerpc*-*-* xtensa*-*-*"} } } */
+
+/* { dg-options "-O2 -g -fdump-tree-optimized" } */
+/* { dg-additional-options "-mbranch-cost=2" { target avr-*-* } } */
+
+int t (int a, int b)
+{
+  if (a > 0)
+    goto L1;
+  if (b > 0)
+    goto L2;
+L1:
+  return 0;
+L2:
+  return 1;
+}
+/* { dg-final { scan-tree-dump "\&" "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-5.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-5.c
new file mode 100644
index 0000000..2aa225b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-5.c
@@ -0,0 +1,13 @@
+/* { dg-do compile { target { ! "m68k*-*-* mmix*-*-* mep*-*-* bfin*-*-* v850*-*-* picochip*-*-* moxie*-*-* cris*-*-* m32c*-*-* fr30*-*-* mcore*-*-* powerpc*-*-* xtensa*-*-*"} } } */
+
+/* { dg-options "-O2 -g -fdump-tree-optimized" } */
+/* { dg-additional-options "-mbranch-cost=2" { target avr-*-* } } */
+
+int t (int a, int b, int c)
+{
+  if (a > 0 && b > 0 && c > 0)
+      return 0;
+  return 1;
+}
+/* { dg-final { scan-tree-dump-times "\&" 2 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-6.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-6.c
new file mode 100644
index 0000000..659e816
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-6.c
@@ -0,0 +1,13 @@
+/* { dg-do compile { target { ! "m68k*-*-* mmix*-*-* mep*-*-* bfin*-*-* v850*-*-* picochip*-*-* moxie*-*-* cris*-*-* m32c*-*-* fr30*-*-* mcore*-*-* powerpc*-*-* xtensa*-*-*"} } } */
+
+/* { dg-options "-O2 -g -fdump-tree-optimized" } */
+/* { dg-additional-options "-mbranch-cost=2" { target avr-*-* } } */
+
+int t (int a, int b, int c)
+{
+  if (a > 0 || b > 0 || c > 0)
+      return 0;
+  return 1;
+}
+/* { dg-final { scan-tree-dump-times "\\|" 2 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-10-16  9:37 [PATCH] Enhance ifcombine to recover non short circuit branches Zhenqiang Chen
@ 2013-10-16 12:22 ` Marc Glisse
  2013-10-17  2:50 ` Andrew Pinski
  2013-10-26 23:49 ` Andrew Pinski
  2 siblings, 0 replies; 31+ messages in thread
From: Marc Glisse @ 2013-10-16 12:22 UTC (permalink / raw)
  To: Zhenqiang Chen; +Cc: gcc-patches

On Wed, 16 Oct 2013, Zhenqiang Chen wrote:

> The patch enhances ifcombine pass to recover some non short circuit
> branches. Basically, it will do the following transformation:
>
> Case 1:
>  if (cond1)
>    if (cond2)
> ==>
>  if (cond1 && cond2)

Hello,

it might be more clear if you wrote (cond1 & cond2) so it can't be 
confused with a short-circuit?

Using branch probabilities to guide the decision would be nice, don't know 
how hard that would be though.

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-10-16  9:37 [PATCH] Enhance ifcombine to recover non short circuit branches Zhenqiang Chen
  2013-10-16 12:22 ` Marc Glisse
@ 2013-10-17  2:50 ` Andrew Pinski
  2013-10-17 11:14   ` Richard Biener
  2013-10-26 23:49 ` Andrew Pinski
  2 siblings, 1 reply; 31+ messages in thread
From: Andrew Pinski @ 2013-10-17  2:50 UTC (permalink / raw)
  To: Zhenqiang Chen; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2994 bytes --]

On Wed, Oct 16, 2013 at 2:12 AM, Zhenqiang Chen
<zhenqiang.chen@linaro.org> wrote:
> Hi,
>
> The patch enhances ifcombine pass to recover some non short circuit
> branches. Basically, it will do the following transformation:
>
> Case 1:
>   if (cond1)
>     if (cond2)
> ==>
>   if (cond1 && cond2)
>
> Case 2:
>   if (cond1)
>     goto L1
>   if (cond2)
>     goto L1
> ==>
>   if (cond1 || cond2)
>     goto L1
>
> Case 3:
>   if (cond1)
>     goto L1
>   else
>     goto L2
> L1:
>   if (cond2)
>     goto L2
> ==>
>   if (invert (cond1) || cond2)
>     goto L2
>
> Case 4:
>   if (cond1)
>     goto L1
>   if (cond2)
>     goto L2
> L1:
> ==>
>   if (invert (cond1) && cond2)
>     goto L2
>
> Bootstrap on X86-64 and ARM.
>
> Two new FAILs in regression tests:
> gcc.dg/uninit-pred-8_b.c
> gcc.dg/uninit-pred-9_b.c
> uninit pass should be enhanced to handle more complex conditions. Will
> submit a bug to track it and fix it later.
>
> Is it OK for trunk?

I had a much simpler change which did basically the same from 4.7 (I
can update it if people think this is a better approach).

Thanks,
Andrew Pinski


    2012-09-29  Andrew Pinski  <apinski@cavium.com>

        * tree-ssa-ifcombine.c: Include rtl.h and tm_p.h.
        (ifcombine_ifandif): Handle cases where
        maybe_fold_and_comparisons fails, combining the branches
        anyways.
        (tree_ssa_ifcombine): Inverse the order of
        the basic block walk, increases the number of combinings.
        * Makefile.in (tree-ssa-ifcombine.o): Update dependencies.

        * testsuite/gcc.dg/tree-ssa/phi-opt-2.c: Expect zero ifs as the compiler
        produces a & b now.
        * testsuite/gcc.dg/tree-ssa/phi-opt-9.c: Use a function call
        to prevent conditional move to be used.
        * testsuite/gcc.dg/tree-ssa/ssa-dom-thread-3.c: Remove check for
        "one or more intermediate".


>
> Thanks!
> -Zhenqiang
>
> ChangeLog:
> 2013-10-16  Zhenqiang Chen  <zhenqiang.chen@linaro.org>
>
>         * fold-const.c (simple_operand_p_2): Make it global.
>         * tree.h (simple_operand_p_2): Declare it.
>         * tree-ssa-ifcombine.c: Include rtl.h and tm_p.h.
>         (bb_has_overhead_p, generate_condition_node,
>         ifcombine_ccmp): New functions.
>         (ifcombine_fold_ifandif): New function, extracted from
>         ifcombine_ifandif.
>         (ifcombine_ifandif): Call ifcombine_ccmp.
>         (tree_ssa_ifcombine_bb): Skip optimized bb.
>
> testsuite/ChangeLog
> 2013-10-16  Zhenqiang Chen  <zhenqiang.chen@linaro.org>
>
>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-1.c: New test case.
>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-2.c: New test case.
>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-3.c: New test case.
>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-4.c: New test case.
>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-5.c: New test case.
>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-6.c: New test case.
>         * gcc.dg/tree-ssa/ssa-dom-thread-3.c: Updated.

[-- Attachment #2: doifcombine.diff.txt --]
[-- Type: text/plain, Size: 7462 bytes --]

commit 90434d0567109d9add1eb5583ae38df06cc6ffa1
Author: Andrew Pinski <apinski@cavium.com>
Date:   Sat Sep 29 01:29:12 2012 -0700

    2012-09-29  Andrew Pinski  <apinski@cavium.com>
    
    	* tree-ssa-ifcombine.c: Include rtl.h and tm_p.h.
    	(ifcombine_ifandif): Handle cases where
    	maybe_fold_and_comparisons fails, combining the branches
    	anyways.
    	(tree_ssa_ifcombine): Inverse the order of
    	the basic block walk, increases the number of combinings.
    	* Makefile.in (tree-ssa-ifcombine.o): Update dependencies.
    
    	* testsuite/gcc.dg/tree-ssa/phi-opt-2.c: Expect zero ifs as the compiler
    	produces a & b now.
    	* testsuite/gcc.dg/tree-ssa/phi-opt-9.c: Use a function call
    	to prevent conditional move to be used.
    	* testsuite/gcc.dg/tree-ssa/ssa-dom-thread-3.c: Remove check for
    	"one or more intermediate".

diff --git a/gcc/ChangeLog.CAVIUM b/gcc/ChangeLog.CAVIUM
index 4739ded..8413818 100644
--- a/gcc/ChangeLog.CAVIUM
+++ b/gcc/ChangeLog.CAVIUM
@@ -1,5 +1,22 @@
 2012-09-29  Andrew Pinski  <apinski@cavium.com>
 
+	* tree-ssa-ifcombine.c: Include rtl.h and tm_p.h.
+	(ifcombine_ifandif): Handle cases where
+	maybe_fold_and_comparisons fails, combining the branches
+	anyways.
+	(tree_ssa_ifcombine): Inverse the order of
+	the basic block walk, increases the number of combinings.
+	* Makefile.in (tree-ssa-ifcombine.o): Update dependencies.
+
+	* testsuite/gcc.dg/tree-ssa/phi-opt-2.c: Expect zero ifs as the compiler
+	produces a & b now.
+	* testsuite/gcc.dg/tree-ssa/phi-opt-9.c: Use a function call
+	to prevent conditional move to be used.
+	* testsuite/gcc.dg/tree-ssa/ssa-dom-thread-3.c: Remove check for
+	"one or more intermediate".
+
+2012-09-29  Andrew Pinski  <apinski@cavium.com>
+
 	Revert:
 	2012-07-23  Andrew Pinski  <apinski@cavium.com>
 
diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 172dde2..ef8cb98 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -2357,7 +2357,7 @@ tree-ssa-phiprop.o : tree-ssa-phiprop.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
    $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) $(DIAGNOSTIC_H) $(TIMEVAR_H) \
    langhooks.h $(FLAGS_H) tree-pretty-print.h gimple-pretty-print.h
 tree-ssa-ifcombine.o : tree-ssa-ifcombine.c $(CONFIG_H) $(SYSTEM_H) \
-   coretypes.h $(TM_H) $(TREE_H) $(BASIC_BLOCK_H) \
+   coretypes.h $(TM_H) $(RTL_H) $(TM_P_H) $(TREE_H) $(BASIC_BLOCK_H) \
    $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) $(DIAGNOSTIC_H) $(TIMEVAR_H) \
    tree-pretty-print.h
 tree-merge-const-bfstores.o : tree-merge-const-bfstores.c $(CONFIG_H) \
diff --git a/gcc/config/mips/mips.h b/gcc/config/mips/mips.h
index 6c680be..89c76d9 100644
--- a/gcc/config/mips/mips.h
+++ b/gcc/config/mips/mips.h
@@ -2428,7 +2428,6 @@ typedef struct mips_args {
    1 is the default; other values are interpreted relative to that.  */
 
 #define BRANCH_COST(speed_p, predictable_p) mips_branch_cost
-#define LOGICAL_OP_NON_SHORT_CIRCUIT 0
 
 /* If defined, modifies the length assigned to instruction INSN as a
    function of the context in which it is used.  LENGTH is an lvalue
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-2.c b/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-2.c
index 415c117..26952ac 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-2.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-2.c
@@ -13,10 +13,7 @@ _Bool f1(_Bool a, _Bool b)
   return 0;
 }
 
-
-/* There should be only one if, the outer one; the inner one
-   should have been changed to straight line code with the
-   value of b (except that we don't fold ! (b != 0) into b
-   which can be fixed in a different patch).  */
-/* { dg-final { scan-tree-dump-times "if" 1 "optimized"} } */
+/* There should be only no ifs; both of them should have
+   been changed into straight line code (a & b).  */
+/* { dg-final { scan-tree-dump-times "if" 0 "optimized"} } */
 /* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-9.c b/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-9.c
index dccea7b..a224788 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-9.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-9.c
@@ -2,13 +2,14 @@
 /* { dg-options "-O -fdump-tree-optimized" } */
 
 int g(int,int);
+int h(int);
 int f(int t, int c)
 {
   int d = 0;
   int e = 0;
   if (t)
     {
-      d = c+1;
+      d = h(c);
       e = t;
     }
   else d = 0, e = 0;
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-3.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-3.c
index d851bf2..8fb3ef8 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-3.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-3.c
@@ -42,6 +42,5 @@ expand_one_var (tree var, unsigned char toplevel, unsigned char really_expand)
 }
 /* We should thread the jump, through an intermediate block.  */
 /* { dg-final { scan-tree-dump-times "Threaded" 1 "dom1"} } */
-/* { dg-final { scan-tree-dump-times "one or more intermediate" 1 "dom1"} } */
 /* { dg-final { cleanup-tree-dump "dom1" } } */
 
diff --git a/gcc/tree-ssa-ifcombine.c b/gcc/tree-ssa-ifcombine.c
index 4e6075b..6e18ec8 100644
--- a/gcc/tree-ssa-ifcombine.c
+++ b/gcc/tree-ssa-ifcombine.c
@@ -22,6 +22,10 @@ along with GCC; see the file COPYING3.  If not see
 #include "system.h"
 #include "coretypes.h"
 #include "tm.h"
+/* rtl is needed only because arm back-end requires it for
+   BRANCH_COST.  */
+#include "rtl.h"
+#include "tm_p.h"
 #include "tree.h"
 #include "basic-block.h"
 #include "timevar.h"
@@ -30,6 +34,12 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-pass.h"
 #include "tree-dump.h"
 
+#ifndef LOGICAL_OP_NON_SHORT_CIRCUIT
+#define LOGICAL_OP_NON_SHORT_CIRCUIT \
+  (BRANCH_COST (optimize_function_for_speed_p (cfun), \
+                false) >= 2)
+#endif
+
 /* This pass combines COND_EXPRs to simplify control flow.  It
    currently recognizes bit tests and comparisons in chains that
    represent logical and or logical or of two COND_EXPRs.
@@ -486,7 +496,30 @@ ifcombine_ifandif (basic_block inner_cond_bb, bool inner_inv,
 					    outer_cond_code,
 					    gimple_cond_lhs (outer_cond),
 					    gimple_cond_rhs (outer_cond))))
-	return false;
+	{
+	  tree t1, t2;
+	  gimple_stmt_iterator gsi;
+	  if (!LOGICAL_OP_NON_SHORT_CIRCUIT)
+	    return false;
+	  /* Only do this optimization if the inner bb contains only the conditional. */
+	  if (!gsi_one_before_end_p (gsi_start_nondebug_bb (inner_cond_bb)))
+	    return false;
+	  t1 = fold_build2_loc (gimple_location (inner_cond),
+				inner_cond_code,
+				boolean_type_node,
+				gimple_cond_lhs (inner_cond),
+				gimple_cond_rhs (inner_cond));
+	  t2 = fold_build2_loc (gimple_location (outer_cond),
+				outer_cond_code,
+				boolean_type_node,
+				gimple_cond_lhs (outer_cond),
+				gimple_cond_rhs (outer_cond));
+	  t = fold_build2_loc (gimple_location (inner_cond), 
+			       TRUTH_AND_EXPR, boolean_type_node, t1, t2);
+	  gsi = gsi_for_stmt (inner_cond);
+	  t = force_gimple_operand_gsi (&gsi, t, true, NULL, true,
+					GSI_SAME_STMT);
+        }
       if (result_inv)
 	t = fold_build1 (TRUTH_NOT_EXPR, TREE_TYPE (t), t);
       t = canonicalize_cond_expr_cond (t);
@@ -629,7 +662,7 @@ tree_ssa_ifcombine (void)
   bbs = blocks_in_phiopt_order ();
   calculate_dominance_info (CDI_DOMINATORS);
 
-  for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; ++i)
+  for (i = n_basic_blocks - NUM_FIXED_BLOCKS -1; i >= 0; i--)
     {
       basic_block bb = bbs[i];
       gimple stmt = last_stmt (bb);

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-10-17  2:50 ` Andrew Pinski
@ 2013-10-17 11:14   ` Richard Biener
  2013-10-17 17:09     ` Jeff Law
  2013-10-18  2:37     ` Andrew Pinski
  0 siblings, 2 replies; 31+ messages in thread
From: Richard Biener @ 2013-10-17 11:14 UTC (permalink / raw)
  To: Andrew Pinski; +Cc: Zhenqiang Chen, gcc-patches

On Thu, Oct 17, 2013 at 4:14 AM, Andrew Pinski <pinskia@gmail.com> wrote:
> On Wed, Oct 16, 2013 at 2:12 AM, Zhenqiang Chen
> <zhenqiang.chen@linaro.org> wrote:
>> Hi,
>>
>> The patch enhances ifcombine pass to recover some non short circuit
>> branches. Basically, it will do the following transformation:
>>
>> Case 1:
>>   if (cond1)
>>     if (cond2)
>> ==>
>>   if (cond1 && cond2)
>>
>> Case 2:
>>   if (cond1)
>>     goto L1
>>   if (cond2)
>>     goto L1
>> ==>
>>   if (cond1 || cond2)
>>     goto L1
>>
>> Case 3:
>>   if (cond1)
>>     goto L1
>>   else
>>     goto L2
>> L1:
>>   if (cond2)
>>     goto L2
>> ==>
>>   if (invert (cond1) || cond2)
>>     goto L2
>>
>> Case 4:
>>   if (cond1)
>>     goto L1
>>   if (cond2)
>>     goto L2
>> L1:
>> ==>
>>   if (invert (cond1) && cond2)
>>     goto L2
>>
>> Bootstrap on X86-64 and ARM.
>>
>> Two new FAILs in regression tests:
>> gcc.dg/uninit-pred-8_b.c
>> gcc.dg/uninit-pred-9_b.c
>> uninit pass should be enhanced to handle more complex conditions. Will
>> submit a bug to track it and fix it later.
>>
>> Is it OK for trunk?
>
> I had a much simpler change which did basically the same from 4.7 (I
> can update it if people think this is a better approach).

I like that more (note you can now use is_gimple_condexpr as predicate
for force_gimple_operand).

With that we should be able to kill the fold-const.c transform?

Thanks,
Richard.

> Thanks,
> Andrew Pinski
>
>
>     2012-09-29  Andrew Pinski  <apinski@cavium.com>
>
>         * tree-ssa-ifcombine.c: Include rtl.h and tm_p.h.
>         (ifcombine_ifandif): Handle cases where
>         maybe_fold_and_comparisons fails, combining the branches
>         anyways.
>         (tree_ssa_ifcombine): Inverse the order of
>         the basic block walk, increases the number of combinings.
>         * Makefile.in (tree-ssa-ifcombine.o): Update dependencies.
>
>         * testsuite/gcc.dg/tree-ssa/phi-opt-2.c: Expect zero ifs as the compiler
>         produces a & b now.
>         * testsuite/gcc.dg/tree-ssa/phi-opt-9.c: Use a function call
>         to prevent conditional move to be used.
>         * testsuite/gcc.dg/tree-ssa/ssa-dom-thread-3.c: Remove check for
>         "one or more intermediate".
>
>
>>
>> Thanks!
>> -Zhenqiang
>>
>> ChangeLog:
>> 2013-10-16  Zhenqiang Chen  <zhenqiang.chen@linaro.org>
>>
>>         * fold-const.c (simple_operand_p_2): Make it global.
>>         * tree.h (simple_operand_p_2): Declare it.
>>         * tree-ssa-ifcombine.c: Include rtl.h and tm_p.h.
>>         (bb_has_overhead_p, generate_condition_node,
>>         ifcombine_ccmp): New functions.
>>         (ifcombine_fold_ifandif): New function, extracted from
>>         ifcombine_ifandif.
>>         (ifcombine_ifandif): Call ifcombine_ccmp.
>>         (tree_ssa_ifcombine_bb): Skip optimized bb.
>>
>> testsuite/ChangeLog
>> 2013-10-16  Zhenqiang Chen  <zhenqiang.chen@linaro.org>
>>
>>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-1.c: New test case.
>>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-2.c: New test case.
>>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-3.c: New test case.
>>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-4.c: New test case.
>>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-5.c: New test case.
>>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-6.c: New test case.
>>         * gcc.dg/tree-ssa/ssa-dom-thread-3.c: Updated.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-10-17 11:14   ` Richard Biener
@ 2013-10-17 17:09     ` Jeff Law
  2013-10-18  9:53       ` Zhenqiang Chen
  2013-10-18  2:37     ` Andrew Pinski
  1 sibling, 1 reply; 31+ messages in thread
From: Jeff Law @ 2013-10-17 17:09 UTC (permalink / raw)
  To: Richard Biener, Andrew Pinski; +Cc: Zhenqiang Chen, gcc-patches

On 10/17/13 05:03, Richard Biener wrote:
>>> Is it OK for trunk?
>>
>> I had a much simpler change which did basically the same from 4.7 (I
>> can update it if people think this is a better approach).
>
> I like that more (note you can now use is_gimple_condexpr as predicate
> for force_gimple_operand).
The obvious question is whether or not Andrew's simpler change picks up 
as many transformations as Zhenqiang's change.  If not are the things 
missed important.

Zhenqiang, can you do some testing of your change vs Andrew P.'s change?

>
> With that we should be able to kill the fold-const.c transform?
That would certainly be nice and an excellent follow-up for Zhenqiang.

jeff

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-10-17 11:14   ` Richard Biener
  2013-10-17 17:09     ` Jeff Law
@ 2013-10-18  2:37     ` Andrew Pinski
  2013-10-18  2:42       ` Andrew Pinski
  1 sibling, 1 reply; 31+ messages in thread
From: Andrew Pinski @ 2013-10-18  2:37 UTC (permalink / raw)
  To: Richard Biener; +Cc: Zhenqiang Chen, gcc-patches

On Thu, Oct 17, 2013 at 4:03 AM, Richard Biener
<richard.guenther@gmail.com> wrote:
> On Thu, Oct 17, 2013 at 4:14 AM, Andrew Pinski <pinskia@gmail.com> wrote:
>> On Wed, Oct 16, 2013 at 2:12 AM, Zhenqiang Chen
>> <zhenqiang.chen@linaro.org> wrote:
>>> Hi,
>>>
>>> The patch enhances ifcombine pass to recover some non short circuit
>>> branches. Basically, it will do the following transformation:
>>>
>>> Case 1:
>>>   if (cond1)
>>>     if (cond2)
>>> ==>
>>>   if (cond1 && cond2)
>>>
>>> Case 2:
>>>   if (cond1)
>>>     goto L1
>>>   if (cond2)
>>>     goto L1
>>> ==>
>>>   if (cond1 || cond2)
>>>     goto L1
>>>
>>> Case 3:
>>>   if (cond1)
>>>     goto L1
>>>   else
>>>     goto L2
>>> L1:
>>>   if (cond2)
>>>     goto L2
>>> ==>
>>>   if (invert (cond1) || cond2)
>>>     goto L2
>>>
>>> Case 4:
>>>   if (cond1)
>>>     goto L1
>>>   if (cond2)
>>>     goto L2
>>> L1:
>>> ==>
>>>   if (invert (cond1) && cond2)
>>>     goto L2
>>>
>>> Bootstrap on X86-64 and ARM.
>>>
>>> Two new FAILs in regression tests:
>>> gcc.dg/uninit-pred-8_b.c
>>> gcc.dg/uninit-pred-9_b.c
>>> uninit pass should be enhanced to handle more complex conditions. Will
>>> submit a bug to track it and fix it later.
>>>
>>> Is it OK for trunk?
>>
>> I had a much simpler change which did basically the same from 4.7 (I
>> can update it if people think this is a better approach).
>
> I like that more (note you can now use is_gimple_condexpr as predicate
> for force_gimple_operand).


Ok, with both this email and Jakub's email, I decided to port the
patch to the trunk but I ran into a bug in reassoc which I submitted
as PR 58775 (with a testcase which shows the issue without this
patch).


>
> With that we should be able to kill the fold-const.c transform?


I think so but I have never tested removing it.

Thanks,
Andrew Pinski


>
> Thanks,
> Richard.
>
>> Thanks,
>> Andrew Pinski
>>
>>
>>     2012-09-29  Andrew Pinski  <apinski@cavium.com>
>>
>>         * tree-ssa-ifcombine.c: Include rtl.h and tm_p.h.
>>         (ifcombine_ifandif): Handle cases where
>>         maybe_fold_and_comparisons fails, combining the branches
>>         anyways.
>>         (tree_ssa_ifcombine): Inverse the order of
>>         the basic block walk, increases the number of combinings.
>>         * Makefile.in (tree-ssa-ifcombine.o): Update dependencies.
>>
>>         * testsuite/gcc.dg/tree-ssa/phi-opt-2.c: Expect zero ifs as the compiler
>>         produces a & b now.
>>         * testsuite/gcc.dg/tree-ssa/phi-opt-9.c: Use a function call
>>         to prevent conditional move to be used.
>>         * testsuite/gcc.dg/tree-ssa/ssa-dom-thread-3.c: Remove check for
>>         "one or more intermediate".
>>
>>
>>>
>>> Thanks!
>>> -Zhenqiang
>>>
>>> ChangeLog:
>>> 2013-10-16  Zhenqiang Chen  <zhenqiang.chen@linaro.org>
>>>
>>>         * fold-const.c (simple_operand_p_2): Make it global.
>>>         * tree.h (simple_operand_p_2): Declare it.
>>>         * tree-ssa-ifcombine.c: Include rtl.h and tm_p.h.
>>>         (bb_has_overhead_p, generate_condition_node,
>>>         ifcombine_ccmp): New functions.
>>>         (ifcombine_fold_ifandif): New function, extracted from
>>>         ifcombine_ifandif.
>>>         (ifcombine_ifandif): Call ifcombine_ccmp.
>>>         (tree_ssa_ifcombine_bb): Skip optimized bb.
>>>
>>> testsuite/ChangeLog
>>> 2013-10-16  Zhenqiang Chen  <zhenqiang.chen@linaro.org>
>>>
>>>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-1.c: New test case.
>>>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-2.c: New test case.
>>>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-3.c: New test case.
>>>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-4.c: New test case.
>>>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-5.c: New test case.
>>>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-6.c: New test case.
>>>         * gcc.dg/tree-ssa/ssa-dom-thread-3.c: Updated.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-10-18  2:37     ` Andrew Pinski
@ 2013-10-18  2:42       ` Andrew Pinski
  2013-10-18 10:55         ` Zhenqiang Chen
  0 siblings, 1 reply; 31+ messages in thread
From: Andrew Pinski @ 2013-10-18  2:42 UTC (permalink / raw)
  To: Richard Biener; +Cc: Zhenqiang Chen, gcc-patches

On Thu, Oct 17, 2013 at 6:39 PM, Andrew Pinski <pinskia@gmail.com> wrote:
> On Thu, Oct 17, 2013 at 4:03 AM, Richard Biener
> <richard.guenther@gmail.com> wrote:
>> On Thu, Oct 17, 2013 at 4:14 AM, Andrew Pinski <pinskia@gmail.com> wrote:
>>> On Wed, Oct 16, 2013 at 2:12 AM, Zhenqiang Chen
>>> <zhenqiang.chen@linaro.org> wrote:
>>>> Hi,
>>>>
>>>> The patch enhances ifcombine pass to recover some non short circuit
>>>> branches. Basically, it will do the following transformation:
>>>>
>>>> Case 1:
>>>>   if (cond1)
>>>>     if (cond2)
>>>> ==>
>>>>   if (cond1 && cond2)
>>>>
>>>> Case 2:
>>>>   if (cond1)
>>>>     goto L1
>>>>   if (cond2)
>>>>     goto L1
>>>> ==>
>>>>   if (cond1 || cond2)
>>>>     goto L1
>>>>
>>>> Case 3:
>>>>   if (cond1)
>>>>     goto L1
>>>>   else
>>>>     goto L2
>>>> L1:
>>>>   if (cond2)
>>>>     goto L2
>>>> ==>
>>>>   if (invert (cond1) || cond2)
>>>>     goto L2
>>>>
>>>> Case 4:
>>>>   if (cond1)
>>>>     goto L1
>>>>   if (cond2)
>>>>     goto L2
>>>> L1:
>>>> ==>
>>>>   if (invert (cond1) && cond2)
>>>>     goto L2
>>>>
>>>> Bootstrap on X86-64 and ARM.
>>>>
>>>> Two new FAILs in regression tests:
>>>> gcc.dg/uninit-pred-8_b.c
>>>> gcc.dg/uninit-pred-9_b.c
>>>> uninit pass should be enhanced to handle more complex conditions. Will
>>>> submit a bug to track it and fix it later.
>>>>
>>>> Is it OK for trunk?
>>>
>>> I had a much simpler change which did basically the same from 4.7 (I
>>> can update it if people think this is a better approach).
>>
>> I like that more (note you can now use is_gimple_condexpr as predicate
>> for force_gimple_operand).
>
>
> Ok, with both this email and Jakub's email, I decided to port the
> patch to the trunk but I ran into a bug in reassoc which I submitted
> as PR 58775 (with a testcase which shows the issue without this
> patch).

I forgot to say that the tree-ssa-ifcombine.c hunk of the 4.7 patch
applies correctly.  Once PR 58775 is fixed, I will be testing and
submitting the full patch including with the testcases from Zhenqiang.

Thanks,
Andrew

>
>
>>
>> With that we should be able to kill the fold-const.c transform?
>
>
> I think so but I have never tested removing it.
>
> Thanks,
> Andrew Pinski
>
>
>>
>> Thanks,
>> Richard.
>>
>>> Thanks,
>>> Andrew Pinski
>>>
>>>
>>>     2012-09-29  Andrew Pinski  <apinski@cavium.com>
>>>
>>>         * tree-ssa-ifcombine.c: Include rtl.h and tm_p.h.
>>>         (ifcombine_ifandif): Handle cases where
>>>         maybe_fold_and_comparisons fails, combining the branches
>>>         anyways.
>>>         (tree_ssa_ifcombine): Inverse the order of
>>>         the basic block walk, increases the number of combinings.
>>>         * Makefile.in (tree-ssa-ifcombine.o): Update dependencies.
>>>
>>>         * testsuite/gcc.dg/tree-ssa/phi-opt-2.c: Expect zero ifs as the compiler
>>>         produces a & b now.
>>>         * testsuite/gcc.dg/tree-ssa/phi-opt-9.c: Use a function call
>>>         to prevent conditional move to be used.
>>>         * testsuite/gcc.dg/tree-ssa/ssa-dom-thread-3.c: Remove check for
>>>         "one or more intermediate".
>>>
>>>
>>>>
>>>> Thanks!
>>>> -Zhenqiang
>>>>
>>>> ChangeLog:
>>>> 2013-10-16  Zhenqiang Chen  <zhenqiang.chen@linaro.org>
>>>>
>>>>         * fold-const.c (simple_operand_p_2): Make it global.
>>>>         * tree.h (simple_operand_p_2): Declare it.
>>>>         * tree-ssa-ifcombine.c: Include rtl.h and tm_p.h.
>>>>         (bb_has_overhead_p, generate_condition_node,
>>>>         ifcombine_ccmp): New functions.
>>>>         (ifcombine_fold_ifandif): New function, extracted from
>>>>         ifcombine_ifandif.
>>>>         (ifcombine_ifandif): Call ifcombine_ccmp.
>>>>         (tree_ssa_ifcombine_bb): Skip optimized bb.
>>>>
>>>> testsuite/ChangeLog
>>>> 2013-10-16  Zhenqiang Chen  <zhenqiang.chen@linaro.org>
>>>>
>>>>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-1.c: New test case.
>>>>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-2.c: New test case.
>>>>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-3.c: New test case.
>>>>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-4.c: New test case.
>>>>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-5.c: New test case.
>>>>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-6.c: New test case.
>>>>         * gcc.dg/tree-ssa/ssa-dom-thread-3.c: Updated.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-10-17 17:09     ` Jeff Law
@ 2013-10-18  9:53       ` Zhenqiang Chen
  2013-10-18 10:04         ` Richard Biener
  2013-10-26 22:26         ` Andrew Pinski
  0 siblings, 2 replies; 31+ messages in thread
From: Zhenqiang Chen @ 2013-10-18  9:53 UTC (permalink / raw)
  To: Jeff Law; +Cc: Richard Biener, Andrew Pinski, gcc-patches

On 18 October 2013 00:58, Jeff Law <law@redhat.com> wrote:
> On 10/17/13 05:03, Richard Biener wrote:
>>>>
>>>> Is it OK for trunk?
>>>
>>>
>>> I had a much simpler change which did basically the same from 4.7 (I
>>> can update it if people think this is a better approach).
>>
>>
>> I like that more (note you can now use is_gimple_condexpr as predicate
>> for force_gimple_operand).
>
> The obvious question is whether or not Andrew's simpler change picks up as
> many transformations as Zhenqiang's change.  If not are the things missed
> important.
>
> Zhenqiang, can you do some testing of your change vs Andrew P.'s change?

Here is a rough compare:

1) Andrew P.'s change can not handle ssa-ifcombine-ccmp-3.c (included
in my patch). Root cause is that it does not skip "LABEL". The guard
to do this opt should be the same the bb_has_overhead_p in my patch.

2) Andrew P.'s change always generate TRUTH_AND_EXPR, which is not
efficient for "||". e.g. For ssa-ifcombine-ccmp-6.c, it will generate

  _3 = a_2(D) > 0;
  _5 = b_4(D) > 0;
  _6 = _3 | _5;
  _9 = c_7(D) <= 0;
  _10 = ~_6;
  _11 = _9 & _10;
  if (_11 == 0)

With my patch, it will generate

  _3 = a_2(D) > 0;
  _5 = b_4(D) > 0;
  _6 = _3 | _5;
  _9 = c_7(D) > 0;
  _10 = _6 | _9;
  if (_10 != 0)

3) The good thing of Andrew P.'s change is that "Inverse the order of
the basic block walk" so it can do combine recursively.

But I think we need some heuristic to control the number of ifs. Move
too much compares from
the inner_bb to outer_bb is not good.

4) Another good thing of Andrew P.'s change is that it reuses some
existing functions. So it looks much simple.

>>
>> With that we should be able to kill the fold-const.c transform?
>
> That would certainly be nice and an excellent follow-up for Zhenqiang.

That's my final goal to "kill the fold-const.c transform". I think we
may combine the two changes to make a "simple" and "good" patch.

Thanks!
-Zhenqiang

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-10-18  9:53       ` Zhenqiang Chen
@ 2013-10-18 10:04         ` Richard Biener
  2013-10-18 10:11           ` Zhenqiang Chen
  2013-10-18 15:45           ` Jeff Law
  2013-10-26 22:26         ` Andrew Pinski
  1 sibling, 2 replies; 31+ messages in thread
From: Richard Biener @ 2013-10-18 10:04 UTC (permalink / raw)
  To: Zhenqiang Chen; +Cc: Jeff Law, Andrew Pinski, gcc-patches

On Fri, Oct 18, 2013 at 11:21 AM, Zhenqiang Chen
<zhenqiang.chen@linaro.org> wrote:
> On 18 October 2013 00:58, Jeff Law <law@redhat.com> wrote:
>> On 10/17/13 05:03, Richard Biener wrote:
>>>>>
>>>>> Is it OK for trunk?
>>>>
>>>>
>>>> I had a much simpler change which did basically the same from 4.7 (I
>>>> can update it if people think this is a better approach).
>>>
>>>
>>> I like that more (note you can now use is_gimple_condexpr as predicate
>>> for force_gimple_operand).
>>
>> The obvious question is whether or not Andrew's simpler change picks up as
>> many transformations as Zhenqiang's change.  If not are the things missed
>> important.
>>
>> Zhenqiang, can you do some testing of your change vs Andrew P.'s change?
>
> Here is a rough compare:
>
> 1) Andrew P.'s change can not handle ssa-ifcombine-ccmp-3.c (included
> in my patch). Root cause is that it does not skip "LABEL". The guard
> to do this opt should be the same the bb_has_overhead_p in my patch.

I think we want a "proper" predicate in tree-cfg.c for this, like maybe
a subset of tree_forwarder_block_p or whatever it will end up looking
like (we need "effectively empty BB" elsewhere, for example in vectorization,
add a flag to allow a condition ending the BB and the predicate is done).

> 2) Andrew P.'s change always generate TRUTH_AND_EXPR, which is not
> efficient for "||". e.g. For ssa-ifcombine-ccmp-6.c, it will generate
>
>   _3 = a_2(D) > 0;
>   _5 = b_4(D) > 0;
>   _6 = _3 | _5;
>   _9 = c_7(D) <= 0;
>   _10 = ~_6;
>   _11 = _9 & _10;
>   if (_11 == 0)
>
> With my patch, it will generate
>
>   _3 = a_2(D) > 0;
>   _5 = b_4(D) > 0;
>   _6 = _3 | _5;
>   _9 = c_7(D) > 0;
>   _10 = _6 | _9;
>   if (_10 != 0)

But that seems like a missed simplification in predicate combining
which should be fixed more generally.

> 3) The good thing of Andrew P.'s change is that "Inverse the order of
> the basic block walk" so it can do combine recursively.
>
> But I think we need some heuristic to control the number of ifs. Move
> too much compares from
> the inner_bb to outer_bb is not good.

True, but that's what fold-const.c does, no?

> 4) Another good thing of Andrew P.'s change is that it reuses some
> existing functions. So it looks much simple.

Indeed - that's what I like about it.

>>>
>>> With that we should be able to kill the fold-const.c transform?
>>
>> That would certainly be nice and an excellent follow-up for Zhenqiang.
>
> That's my final goal to "kill the fold-const.c transform". I think we
> may combine the two changes to make a "simple" and "good" patch.

Thanks,
Richard.

> Thanks!
> -Zhenqiang

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-10-18 10:04         ` Richard Biener
@ 2013-10-18 10:11           ` Zhenqiang Chen
  2013-10-18 10:36             ` Richard Biener
  2013-10-18 15:45           ` Jeff Law
  1 sibling, 1 reply; 31+ messages in thread
From: Zhenqiang Chen @ 2013-10-18 10:11 UTC (permalink / raw)
  To: Richard Biener; +Cc: Jeff Law, Andrew Pinski, gcc-patches

On 18 October 2013 17:53, Richard Biener <richard.guenther@gmail.com> wrote:
> On Fri, Oct 18, 2013 at 11:21 AM, Zhenqiang Chen
> <zhenqiang.chen@linaro.org> wrote:
>> On 18 October 2013 00:58, Jeff Law <law@redhat.com> wrote:
>>> On 10/17/13 05:03, Richard Biener wrote:
>>>>>>
>>>>>> Is it OK for trunk?
>>>>>
>>>>>
>>>>> I had a much simpler change which did basically the same from 4.7 (I
>>>>> can update it if people think this is a better approach).
>>>>
>>>>
>>>> I like that more (note you can now use is_gimple_condexpr as predicate
>>>> for force_gimple_operand).
>>>
>>> The obvious question is whether or not Andrew's simpler change picks up as
>>> many transformations as Zhenqiang's change.  If not are the things missed
>>> important.
>>>
>>> Zhenqiang, can you do some testing of your change vs Andrew P.'s change?
>>
>> Here is a rough compare:
>>
>> 1) Andrew P.'s change can not handle ssa-ifcombine-ccmp-3.c (included
>> in my patch). Root cause is that it does not skip "LABEL". The guard
>> to do this opt should be the same the bb_has_overhead_p in my patch.
>
> I think we want a "proper" predicate in tree-cfg.c for this, like maybe
> a subset of tree_forwarder_block_p or whatever it will end up looking
> like (we need "effectively empty BB" elsewhere, for example in vectorization,
> add a flag to allow a condition ending the BB and the predicate is done).
>
>> 2) Andrew P.'s change always generate TRUTH_AND_EXPR, which is not
>> efficient for "||". e.g. For ssa-ifcombine-ccmp-6.c, it will generate
>>
>>   _3 = a_2(D) > 0;
>>   _5 = b_4(D) > 0;
>>   _6 = _3 | _5;
>>   _9 = c_7(D) <= 0;
>>   _10 = ~_6;
>>   _11 = _9 & _10;
>>   if (_11 == 0)
>>
>> With my patch, it will generate
>>
>>   _3 = a_2(D) > 0;
>>   _5 = b_4(D) > 0;
>>   _6 = _3 | _5;
>>   _9 = c_7(D) > 0;
>>   _10 = _6 | _9;
>>   if (_10 != 0)
>
> But that seems like a missed simplification in predicate combining
> which should be fixed more generally.
>
>> 3) The good thing of Andrew P.'s change is that "Inverse the order of
>> the basic block walk" so it can do combine recursively.
>>
>> But I think we need some heuristic to control the number of ifs. Move
>> too much compares from
>> the inner_bb to outer_bb is not good.
>
> True, but that's what fold-const.c does, no?

Based on current fold-const, we can not generate more than "two"
compares in a basic block.

But if ifs can be combined recursively in ifcombine, we might generate
many compares in a basic block.

>> 4) Another good thing of Andrew P.'s change is that it reuses some
>> existing functions. So it looks much simple.
>
> Indeed - that's what I like about it.
>
>>>>
>>>> With that we should be able to kill the fold-const.c transform?
>>>
>>> That would certainly be nice and an excellent follow-up for Zhenqiang.
>>
>> That's my final goal to "kill the fold-const.c transform". I think we
>> may combine the two changes to make a "simple" and "good" patch.
>
> Thanks,
> Richard.
>
>> Thanks!
>> -Zhenqiang

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-10-18 10:11           ` Zhenqiang Chen
@ 2013-10-18 10:36             ` Richard Biener
  2013-10-18 10:41               ` Zhenqiang Chen
  0 siblings, 1 reply; 31+ messages in thread
From: Richard Biener @ 2013-10-18 10:36 UTC (permalink / raw)
  To: Zhenqiang Chen; +Cc: Jeff Law, Andrew Pinski, gcc-patches

On Fri, Oct 18, 2013 at 12:06 PM, Zhenqiang Chen
<zhenqiang.chen@linaro.org> wrote:
> On 18 October 2013 17:53, Richard Biener <richard.guenther@gmail.com> wrote:
>> On Fri, Oct 18, 2013 at 11:21 AM, Zhenqiang Chen
>> <zhenqiang.chen@linaro.org> wrote:
>>> On 18 October 2013 00:58, Jeff Law <law@redhat.com> wrote:
>>>> On 10/17/13 05:03, Richard Biener wrote:
>>>>>>>
>>>>>>> Is it OK for trunk?
>>>>>>
>>>>>>
>>>>>> I had a much simpler change which did basically the same from 4.7 (I
>>>>>> can update it if people think this is a better approach).
>>>>>
>>>>>
>>>>> I like that more (note you can now use is_gimple_condexpr as predicate
>>>>> for force_gimple_operand).
>>>>
>>>> The obvious question is whether or not Andrew's simpler change picks up as
>>>> many transformations as Zhenqiang's change.  If not are the things missed
>>>> important.
>>>>
>>>> Zhenqiang, can you do some testing of your change vs Andrew P.'s change?
>>>
>>> Here is a rough compare:
>>>
>>> 1) Andrew P.'s change can not handle ssa-ifcombine-ccmp-3.c (included
>>> in my patch). Root cause is that it does not skip "LABEL". The guard
>>> to do this opt should be the same the bb_has_overhead_p in my patch.
>>
>> I think we want a "proper" predicate in tree-cfg.c for this, like maybe
>> a subset of tree_forwarder_block_p or whatever it will end up looking
>> like (we need "effectively empty BB" elsewhere, for example in vectorization,
>> add a flag to allow a condition ending the BB and the predicate is done).
>>
>>> 2) Andrew P.'s change always generate TRUTH_AND_EXPR, which is not
>>> efficient for "||". e.g. For ssa-ifcombine-ccmp-6.c, it will generate
>>>
>>>   _3 = a_2(D) > 0;
>>>   _5 = b_4(D) > 0;
>>>   _6 = _3 | _5;
>>>   _9 = c_7(D) <= 0;
>>>   _10 = ~_6;
>>>   _11 = _9 & _10;
>>>   if (_11 == 0)
>>>
>>> With my patch, it will generate
>>>
>>>   _3 = a_2(D) > 0;
>>>   _5 = b_4(D) > 0;
>>>   _6 = _3 | _5;
>>>   _9 = c_7(D) > 0;
>>>   _10 = _6 | _9;
>>>   if (_10 != 0)
>>
>> But that seems like a missed simplification in predicate combining
>> which should be fixed more generally.
>>
>>> 3) The good thing of Andrew P.'s change is that "Inverse the order of
>>> the basic block walk" so it can do combine recursively.
>>>
>>> But I think we need some heuristic to control the number of ifs. Move
>>> too much compares from
>>> the inner_bb to outer_bb is not good.
>>
>> True, but that's what fold-const.c does, no?
>
> Based on current fold-const, we can not generate more than "two"
> compares in a basic block.

I think you are wrong as fold is invoked recursively on a && b && c && d.

> But if ifs can be combined recursively in ifcombine, we might generate
> many compares in a basic block.

Yes, we can end up with that for the other simplifications done in
ifcombine as well.  Eventually there needs to be a cost model for this
(use edge probabilities for example, so that if you have profile data
you never combine cold blocks).

Any takers?

Richard.

>>> 4) Another good thing of Andrew P.'s change is that it reuses some
>>> existing functions. So it looks much simple.
>>
>> Indeed - that's what I like about it.
>>
>>>>>
>>>>> With that we should be able to kill the fold-const.c transform?
>>>>
>>>> That would certainly be nice and an excellent follow-up for Zhenqiang.
>>>
>>> That's my final goal to "kill the fold-const.c transform". I think we
>>> may combine the two changes to make a "simple" and "good" patch.
>>
>> Thanks,
>> Richard.
>>
>>> Thanks!
>>> -Zhenqiang

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-10-18 10:36             ` Richard Biener
@ 2013-10-18 10:41               ` Zhenqiang Chen
  2013-10-18 12:10                 ` Richard Biener
  0 siblings, 1 reply; 31+ messages in thread
From: Zhenqiang Chen @ 2013-10-18 10:41 UTC (permalink / raw)
  To: Richard Biener; +Cc: Jeff Law, Andrew Pinski, gcc-patches

On 18 October 2013 18:15, Richard Biener <richard.guenther@gmail.com> wrote:
> On Fri, Oct 18, 2013 at 12:06 PM, Zhenqiang Chen
> <zhenqiang.chen@linaro.org> wrote:
>> On 18 October 2013 17:53, Richard Biener <richard.guenther@gmail.com> wrote:
>>> On Fri, Oct 18, 2013 at 11:21 AM, Zhenqiang Chen
>>> <zhenqiang.chen@linaro.org> wrote:
>>>> On 18 October 2013 00:58, Jeff Law <law@redhat.com> wrote:
>>>>> On 10/17/13 05:03, Richard Biener wrote:
>>>>>>>>
>>>>>>>> Is it OK for trunk?
>>>>>>>
>>>>>>>
>>>>>>> I had a much simpler change which did basically the same from 4.7 (I
>>>>>>> can update it if people think this is a better approach).
>>>>>>
>>>>>>
>>>>>> I like that more (note you can now use is_gimple_condexpr as predicate
>>>>>> for force_gimple_operand).
>>>>>
>>>>> The obvious question is whether or not Andrew's simpler change picks up as
>>>>> many transformations as Zhenqiang's change.  If not are the things missed
>>>>> important.
>>>>>
>>>>> Zhenqiang, can you do some testing of your change vs Andrew P.'s change?
>>>>
>>>> Here is a rough compare:
>>>>
>>>> 1) Andrew P.'s change can not handle ssa-ifcombine-ccmp-3.c (included
>>>> in my patch). Root cause is that it does not skip "LABEL". The guard
>>>> to do this opt should be the same the bb_has_overhead_p in my patch.
>>>
>>> I think we want a "proper" predicate in tree-cfg.c for this, like maybe
>>> a subset of tree_forwarder_block_p or whatever it will end up looking
>>> like (we need "effectively empty BB" elsewhere, for example in vectorization,
>>> add a flag to allow a condition ending the BB and the predicate is done).
>>>
>>>> 2) Andrew P.'s change always generate TRUTH_AND_EXPR, which is not
>>>> efficient for "||". e.g. For ssa-ifcombine-ccmp-6.c, it will generate
>>>>
>>>>   _3 = a_2(D) > 0;
>>>>   _5 = b_4(D) > 0;
>>>>   _6 = _3 | _5;
>>>>   _9 = c_7(D) <= 0;
>>>>   _10 = ~_6;
>>>>   _11 = _9 & _10;
>>>>   if (_11 == 0)
>>>>
>>>> With my patch, it will generate
>>>>
>>>>   _3 = a_2(D) > 0;
>>>>   _5 = b_4(D) > 0;
>>>>   _6 = _3 | _5;
>>>>   _9 = c_7(D) > 0;
>>>>   _10 = _6 | _9;
>>>>   if (_10 != 0)
>>>
>>> But that seems like a missed simplification in predicate combining
>>> which should be fixed more generally.
>>>
>>>> 3) The good thing of Andrew P.'s change is that "Inverse the order of
>>>> the basic block walk" so it can do combine recursively.
>>>>
>>>> But I think we need some heuristic to control the number of ifs. Move
>>>> too much compares from
>>>> the inner_bb to outer_bb is not good.
>>>
>>> True, but that's what fold-const.c does, no?
>>
>> Based on current fold-const, we can not generate more than "two"
>> compares in a basic block.
>
> I think you are wrong as fold is invoked recursively on a && b && c && d.

Please try to compile it with -fdump-tree-gimple. It will be broken into

 if (a && b)
   if (c && d)

>> But if ifs can be combined recursively in ifcombine, we might generate
>> many compares in a basic block.
>
> Yes, we can end up with that for the other simplifications done in
> ifcombine as well.  Eventually there needs to be a cost model for this
> (use edge probabilities for example, so that if you have profile data
> you never combine cold blocks).
>
> Any takers?

I will take it.

Thanks!
-Zhenqiang

> Richard.
>
>>>> 4) Another good thing of Andrew P.'s change is that it reuses some
>>>> existing functions. So it looks much simple.
>>>
>>> Indeed - that's what I like about it.
>>>
>>>>>>
>>>>>> With that we should be able to kill the fold-const.c transform?
>>>>>
>>>>> That would certainly be nice and an excellent follow-up for Zhenqiang.
>>>>
>>>> That's my final goal to "kill the fold-const.c transform". I think we
>>>> may combine the two changes to make a "simple" and "good" patch.
>>>
>>> Thanks,
>>> Richard.
>>>
>>>> Thanks!
>>>> -Zhenqiang

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-10-18  2:42       ` Andrew Pinski
@ 2013-10-18 10:55         ` Zhenqiang Chen
  0 siblings, 0 replies; 31+ messages in thread
From: Zhenqiang Chen @ 2013-10-18 10:55 UTC (permalink / raw)
  To: Andrew Pinski; +Cc: Richard Biener, gcc-patches

On 18 October 2013 10:13, Andrew Pinski <pinskia@gmail.com> wrote:
> On Thu, Oct 17, 2013 at 6:39 PM, Andrew Pinski <pinskia@gmail.com> wrote:
>> On Thu, Oct 17, 2013 at 4:03 AM, Richard Biener
>> <richard.guenther@gmail.com> wrote:
>>> On Thu, Oct 17, 2013 at 4:14 AM, Andrew Pinski <pinskia@gmail.com> wrote:
>>>> On Wed, Oct 16, 2013 at 2:12 AM, Zhenqiang Chen
>>>> <zhenqiang.chen@linaro.org> wrote:
>>>>> Hi,
>>>>>
>>>>> The patch enhances ifcombine pass to recover some non short circuit
>>>>> branches. Basically, it will do the following transformation:
>>>>>
>>>>> Case 1:
>>>>>   if (cond1)
>>>>>     if (cond2)
>>>>> ==>
>>>>>   if (cond1 && cond2)
>>>>>
>>>>> Case 2:
>>>>>   if (cond1)
>>>>>     goto L1
>>>>>   if (cond2)
>>>>>     goto L1
>>>>> ==>
>>>>>   if (cond1 || cond2)
>>>>>     goto L1
>>>>>
>>>>> Case 3:
>>>>>   if (cond1)
>>>>>     goto L1
>>>>>   else
>>>>>     goto L2
>>>>> L1:
>>>>>   if (cond2)
>>>>>     goto L2
>>>>> ==>
>>>>>   if (invert (cond1) || cond2)
>>>>>     goto L2
>>>>>
>>>>> Case 4:
>>>>>   if (cond1)
>>>>>     goto L1
>>>>>   if (cond2)
>>>>>     goto L2
>>>>> L1:
>>>>> ==>
>>>>>   if (invert (cond1) && cond2)
>>>>>     goto L2
>>>>>
>>>>> Bootstrap on X86-64 and ARM.
>>>>>
>>>>> Two new FAILs in regression tests:
>>>>> gcc.dg/uninit-pred-8_b.c
>>>>> gcc.dg/uninit-pred-9_b.c
>>>>> uninit pass should be enhanced to handle more complex conditions. Will
>>>>> submit a bug to track it and fix it later.
>>>>>
>>>>> Is it OK for trunk?
>>>>
>>>> I had a much simpler change which did basically the same from 4.7 (I
>>>> can update it if people think this is a better approach).
>>>
>>> I like that more (note you can now use is_gimple_condexpr as predicate
>>> for force_gimple_operand).
>>
>>
>> Ok, with both this email and Jakub's email, I decided to port the
>> patch to the trunk but I ran into a bug in reassoc which I submitted
>> as PR 58775 (with a testcase which shows the issue without this
>> patch).
>
> I forgot to say that the tree-ssa-ifcombine.c hunk of the 4.7 patch
> applies correctly.  Once PR 58775 is fixed, I will be testing and
> submitting the full patch including with the testcases from Zhenqiang.

I just post a patch to fix PR 58775. With the patch, your changes can
bootstrap on X86-64.

Thanks!
-Zhenqiang

> Thanks,
> Andrew
>
>>
>>
>>>
>>> With that we should be able to kill the fold-const.c transform?
>>
>>
>> I think so but I have never tested removing it.
>>
>> Thanks,
>> Andrew Pinski
>>
>>
>>>
>>> Thanks,
>>> Richard.
>>>
>>>> Thanks,
>>>> Andrew Pinski
>>>>
>>>>
>>>>     2012-09-29  Andrew Pinski  <apinski@cavium.com>
>>>>
>>>>         * tree-ssa-ifcombine.c: Include rtl.h and tm_p.h.
>>>>         (ifcombine_ifandif): Handle cases where
>>>>         maybe_fold_and_comparisons fails, combining the branches
>>>>         anyways.
>>>>         (tree_ssa_ifcombine): Inverse the order of
>>>>         the basic block walk, increases the number of combinings.
>>>>         * Makefile.in (tree-ssa-ifcombine.o): Update dependencies.
>>>>
>>>>         * testsuite/gcc.dg/tree-ssa/phi-opt-2.c: Expect zero ifs as the compiler
>>>>         produces a & b now.
>>>>         * testsuite/gcc.dg/tree-ssa/phi-opt-9.c: Use a function call
>>>>         to prevent conditional move to be used.
>>>>         * testsuite/gcc.dg/tree-ssa/ssa-dom-thread-3.c: Remove check for
>>>>         "one or more intermediate".
>>>>
>>>>
>>>>>
>>>>> Thanks!
>>>>> -Zhenqiang
>>>>>
>>>>> ChangeLog:
>>>>> 2013-10-16  Zhenqiang Chen  <zhenqiang.chen@linaro.org>
>>>>>
>>>>>         * fold-const.c (simple_operand_p_2): Make it global.
>>>>>         * tree.h (simple_operand_p_2): Declare it.
>>>>>         * tree-ssa-ifcombine.c: Include rtl.h and tm_p.h.
>>>>>         (bb_has_overhead_p, generate_condition_node,
>>>>>         ifcombine_ccmp): New functions.
>>>>>         (ifcombine_fold_ifandif): New function, extracted from
>>>>>         ifcombine_ifandif.
>>>>>         (ifcombine_ifandif): Call ifcombine_ccmp.
>>>>>         (tree_ssa_ifcombine_bb): Skip optimized bb.
>>>>>
>>>>> testsuite/ChangeLog
>>>>> 2013-10-16  Zhenqiang Chen  <zhenqiang.chen@linaro.org>
>>>>>
>>>>>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-1.c: New test case.
>>>>>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-2.c: New test case.
>>>>>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-3.c: New test case.
>>>>>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-4.c: New test case.
>>>>>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-5.c: New test case.
>>>>>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-6.c: New test case.
>>>>>         * gcc.dg/tree-ssa/ssa-dom-thread-3.c: Updated.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-10-18 10:41               ` Zhenqiang Chen
@ 2013-10-18 12:10                 ` Richard Biener
  0 siblings, 0 replies; 31+ messages in thread
From: Richard Biener @ 2013-10-18 12:10 UTC (permalink / raw)
  To: Zhenqiang Chen; +Cc: Jeff Law, Andrew Pinski, gcc-patches

On Fri, Oct 18, 2013 at 12:34 PM, Zhenqiang Chen
<zhenqiang.chen@linaro.org> wrote:
> On 18 October 2013 18:15, Richard Biener <richard.guenther@gmail.com> wrote:
>> On Fri, Oct 18, 2013 at 12:06 PM, Zhenqiang Chen
>> <zhenqiang.chen@linaro.org> wrote:
>>> On 18 October 2013 17:53, Richard Biener <richard.guenther@gmail.com> wrote:
>>>> On Fri, Oct 18, 2013 at 11:21 AM, Zhenqiang Chen
>>>> <zhenqiang.chen@linaro.org> wrote:
>>>>> On 18 October 2013 00:58, Jeff Law <law@redhat.com> wrote:
>>>>>> On 10/17/13 05:03, Richard Biener wrote:
>>>>>>>>>
>>>>>>>>> Is it OK for trunk?
>>>>>>>>
>>>>>>>>
>>>>>>>> I had a much simpler change which did basically the same from 4.7 (I
>>>>>>>> can update it if people think this is a better approach).
>>>>>>>
>>>>>>>
>>>>>>> I like that more (note you can now use is_gimple_condexpr as predicate
>>>>>>> for force_gimple_operand).
>>>>>>
>>>>>> The obvious question is whether or not Andrew's simpler change picks up as
>>>>>> many transformations as Zhenqiang's change.  If not are the things missed
>>>>>> important.
>>>>>>
>>>>>> Zhenqiang, can you do some testing of your change vs Andrew P.'s change?
>>>>>
>>>>> Here is a rough compare:
>>>>>
>>>>> 1) Andrew P.'s change can not handle ssa-ifcombine-ccmp-3.c (included
>>>>> in my patch). Root cause is that it does not skip "LABEL". The guard
>>>>> to do this opt should be the same the bb_has_overhead_p in my patch.
>>>>
>>>> I think we want a "proper" predicate in tree-cfg.c for this, like maybe
>>>> a subset of tree_forwarder_block_p or whatever it will end up looking
>>>> like (we need "effectively empty BB" elsewhere, for example in vectorization,
>>>> add a flag to allow a condition ending the BB and the predicate is done).
>>>>
>>>>> 2) Andrew P.'s change always generate TRUTH_AND_EXPR, which is not
>>>>> efficient for "||". e.g. For ssa-ifcombine-ccmp-6.c, it will generate
>>>>>
>>>>>   _3 = a_2(D) > 0;
>>>>>   _5 = b_4(D) > 0;
>>>>>   _6 = _3 | _5;
>>>>>   _9 = c_7(D) <= 0;
>>>>>   _10 = ~_6;
>>>>>   _11 = _9 & _10;
>>>>>   if (_11 == 0)
>>>>>
>>>>> With my patch, it will generate
>>>>>
>>>>>   _3 = a_2(D) > 0;
>>>>>   _5 = b_4(D) > 0;
>>>>>   _6 = _3 | _5;
>>>>>   _9 = c_7(D) > 0;
>>>>>   _10 = _6 | _9;
>>>>>   if (_10 != 0)
>>>>
>>>> But that seems like a missed simplification in predicate combining
>>>> which should be fixed more generally.
>>>>
>>>>> 3) The good thing of Andrew P.'s change is that "Inverse the order of
>>>>> the basic block walk" so it can do combine recursively.
>>>>>
>>>>> But I think we need some heuristic to control the number of ifs. Move
>>>>> too much compares from
>>>>> the inner_bb to outer_bb is not good.
>>>>
>>>> True, but that's what fold-const.c does, no?
>>>
>>> Based on current fold-const, we can not generate more than "two"
>>> compares in a basic block.
>>
>> I think you are wrong as fold is invoked recursively on a && b && c && d.
>
> Please try to compile it with -fdump-tree-gimple. It will be broken into
>
>  if (a && b)
>    if (c && d)

Indeed - it seems to handle all kinds of series but not the incremental
one that appears after the first folding - (a truth-and b) truth-andif c.

>>> But if ifs can be combined recursively in ifcombine, we might generate
>>> many compares in a basic block.
>>
>> Yes, we can end up with that for the other simplifications done in
>> ifcombine as well.  Eventually there needs to be a cost model for this
>> (use edge probabilities for example, so that if you have profile data
>> you never combine cold blocks).
>>
>> Any takers?
>
> I will take it.
>
> Thanks!
> -Zhenqiang
>
>> Richard.
>>
>>>>> 4) Another good thing of Andrew P.'s change is that it reuses some
>>>>> existing functions. So it looks much simple.
>>>>
>>>> Indeed - that's what I like about it.
>>>>
>>>>>>>
>>>>>>> With that we should be able to kill the fold-const.c transform?
>>>>>>
>>>>>> That would certainly be nice and an excellent follow-up for Zhenqiang.
>>>>>
>>>>> That's my final goal to "kill the fold-const.c transform". I think we
>>>>> may combine the two changes to make a "simple" and "good" patch.
>>>>
>>>> Thanks,
>>>> Richard.
>>>>
>>>>> Thanks!
>>>>> -Zhenqiang

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-10-18 10:04         ` Richard Biener
  2013-10-18 10:11           ` Zhenqiang Chen
@ 2013-10-18 15:45           ` Jeff Law
  1 sibling, 0 replies; 31+ messages in thread
From: Jeff Law @ 2013-10-18 15:45 UTC (permalink / raw)
  To: Richard Biener, Zhenqiang Chen; +Cc: Andrew Pinski, gcc-patches

On 10/18/13 03:53, Richard Biener wrote:
>
> I think we want a "proper" predicate in tree-cfg.c for this, like maybe
> a subset of tree_forwarder_block_p or whatever it will end up looking
> like (we need "effectively empty BB" elsewhere, for example in vectorization,
> add a flag to allow a condition ending the BB and the predicate is done).
Yes.  We definitely have a need for this.  For threading, "effectively 
empty" comes up often.  In that context, we're ignoring the control 
statement at the end (because it's going away ;-)

Jeff

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-10-18  9:53       ` Zhenqiang Chen
  2013-10-18 10:04         ` Richard Biener
@ 2013-10-26 22:26         ` Andrew Pinski
  2013-10-27  9:17           ` Andrew Pinski
  1 sibling, 1 reply; 31+ messages in thread
From: Andrew Pinski @ 2013-10-26 22:26 UTC (permalink / raw)
  To: Zhenqiang Chen; +Cc: Jeff Law, Richard Biener, gcc-patches

On Fri, Oct 18, 2013 at 2:21 AM, Zhenqiang Chen
<zhenqiang.chen@linaro.org> wrote:
> On 18 October 2013 00:58, Jeff Law <law@redhat.com> wrote:
>> On 10/17/13 05:03, Richard Biener wrote:
>>>>>
>>>>> Is it OK for trunk?
>>>>
>>>>
>>>> I had a much simpler change which did basically the same from 4.7 (I
>>>> can update it if people think this is a better approach).
>>>
>>>
>>> I like that more (note you can now use is_gimple_condexpr as predicate
>>> for force_gimple_operand).
>>
>> The obvious question is whether or not Andrew's simpler change picks up as
>> many transformations as Zhenqiang's change.  If not are the things missed
>> important.
>>
>> Zhenqiang, can you do some testing of your change vs Andrew P.'s change?
>
> Here is a rough compare:
>
> 1) Andrew P.'s change can not handle ssa-ifcombine-ccmp-3.c (included
> in my patch). Root cause is that it does not skip "LABEL". The guard
> to do this opt should be the same the bb_has_overhead_p in my patch.

This should be an easy change, I am working on this right now.

>
> 2) Andrew P.'s change always generate TRUTH_AND_EXPR, which is not
> efficient for "||". e.g. For ssa-ifcombine-ccmp-6.c, it will generate
>
>   _3 = a_2(D) > 0;
>   _5 = b_4(D) > 0;
>   _6 = _3 | _5;
>   _9 = c_7(D) <= 0;
>   _10 = ~_6;
>   _11 = _9 & _10;
>   if (_11 == 0)
>
> With my patch, it will generate
>
>   _3 = a_2(D) > 0;
>   _5 = b_4(D) > 0;
>   _6 = _3 | _5;
>   _9 = c_7(D) > 0;
>   _10 = _6 | _9;
>   if (_10 != 0)

As mentioned otherwise, this seems like a missed optimization inside
forwprop.  When I originally wrote this code there used to be two
cases one for & and one for |, but this was removed sometime and I
just made the code evolve with that.

>
> 3) The good thing of Andrew P.'s change is that "Inverse the order of
> the basic block walk" so it can do combine recursively.
>
> But I think we need some heuristic to control the number of ifs. Move
> too much compares from
> the inner_bb to outer_bb is not good.


I think this depends on the target.  For MIPS we don't want an upper
bound as integer comparisons go directly to GPRs.  I wrote this patch
with MIPS in mind as that was the target I was working on.

>
> 4) Another good thing of Andrew P.'s change is that it reuses some
> existing functions. So it looks much simple.


Thanks,
Andrew Pinski

>
>>>
>>> With that we should be able to kill the fold-const.c transform?
>>
>> That would certainly be nice and an excellent follow-up for Zhenqiang.
>
> That's my final goal to "kill the fold-const.c transform". I think we
> may combine the two changes to make a "simple" and "good" patch.
>
> Thanks!
> -Zhenqiang

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-10-16  9:37 [PATCH] Enhance ifcombine to recover non short circuit branches Zhenqiang Chen
  2013-10-16 12:22 ` Marc Glisse
  2013-10-17  2:50 ` Andrew Pinski
@ 2013-10-26 23:49 ` Andrew Pinski
  2 siblings, 0 replies; 31+ messages in thread
From: Andrew Pinski @ 2013-10-26 23:49 UTC (permalink / raw)
  To: Zhenqiang Chen; +Cc: gcc-patches

On Wed, Oct 16, 2013 at 2:12 AM, Zhenqiang Chen
<zhenqiang.chen@linaro.org> wrote:
> Hi,
>
> The patch enhances ifcombine pass to recover some non short circuit
> branches. Basically, it will do the following transformation:
>
> Case 1:
>   if (cond1)
>     if (cond2)
> ==>
>   if (cond1 && cond2)
>
> Case 2:
>   if (cond1)
>     goto L1
>   if (cond2)
>     goto L1
> ==>
>   if (cond1 || cond2)
>     goto L1
>
> Case 3:
>   if (cond1)
>     goto L1
>   else
>     goto L2
> L1:
>   if (cond2)
>     goto L2
> ==>
>   if (invert (cond1) || cond2)
>     goto L2
>
> Case 4:
>   if (cond1)
>     goto L1
>   if (cond2)
>     goto L2
> L1:
> ==>
>   if (invert (cond1) && cond2)
>     goto L2
>
> Bootstrap on X86-64 and ARM.
>
> Two new FAILs in regression tests:
> gcc.dg/uninit-pred-8_b.c
> gcc.dg/uninit-pred-9_b.c
> uninit pass should be enhanced to handle more complex conditions. Will
> submit a bug to track it and fix it later.
>
> Is it OK for trunk?
>
> Thanks!
> -Zhenqiang
>
> ChangeLog:
> 2013-10-16  Zhenqiang Chen  <zhenqiang.chen@linaro.org>
>
>         * fold-const.c (simple_operand_p_2): Make it global.
>         * tree.h (simple_operand_p_2): Declare it.
>         * tree-ssa-ifcombine.c: Include rtl.h and tm_p.h.
>         (bb_has_overhead_p, generate_condition_node,
>         ifcombine_ccmp): New functions.
>         (ifcombine_fold_ifandif): New function, extracted from
>         ifcombine_ifandif.
>         (ifcombine_ifandif): Call ifcombine_ccmp.
>         (tree_ssa_ifcombine_bb): Skip optimized bb.
>
> testsuite/ChangeLog
> 2013-10-16  Zhenqiang Chen  <zhenqiang.chen@linaro.org>
>
>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-1.c: New test case.
>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-2.c: New test case.
>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-3.c: New test case.
>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-4.c: New test case.
>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-5.c: New test case.
>         * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-6.c: New test case.
>         * gcc.dg/tree-ssa/ssa-dom-thread-3.c: Updated.


One quick review of this patch, you don't don't need to test
simple_operand_p_2 as in gimple, they will always be simple.  So it at
least reduces this patch.  Also your bb_has_overhead_p can be simply:
!gsi_one_before_end_p (gsi_start_nondebug_after_labels_bb (inner_cond_bb))

Where gsi_start_nondebug_after_labels_bb is defined as:
/* Return a new iterator pointing to the first non-debug non-label statement in
   basic block BB.  */

static inline gimple_stmt_iterator
gsi_start_nondebug_after_labels_bb (basic_block bb)
{
  gimple_stmt_iterator i = gsi_after_labels (bb);

  if (!gsi_end_p (i) && is_gimple_debug (gsi_stmt (i)))
    gsi_next_nondebug (&i);

  return i;
}

Thanks,
Andrew Pinski

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-10-26 22:26         ` Andrew Pinski
@ 2013-10-27  9:17           ` Andrew Pinski
  2013-10-27 20:28             ` Andrew Pinski
  0 siblings, 1 reply; 31+ messages in thread
From: Andrew Pinski @ 2013-10-27  9:17 UTC (permalink / raw)
  To: Zhenqiang Chen; +Cc: Jeff Law, Richard Biener, gcc-patches

On Sat, Oct 26, 2013 at 2:30 PM, Andrew Pinski <pinskia@gmail.com> wrote:
> On Fri, Oct 18, 2013 at 2:21 AM, Zhenqiang Chen
> <zhenqiang.chen@linaro.org> wrote:
>> On 18 October 2013 00:58, Jeff Law <law@redhat.com> wrote:
>>> On 10/17/13 05:03, Richard Biener wrote:
>>>>>>
>>>>>> Is it OK for trunk?
>>>>>
>>>>>
>>>>> I had a much simpler change which did basically the same from 4.7 (I
>>>>> can update it if people think this is a better approach).
>>>>
>>>>
>>>> I like that more (note you can now use is_gimple_condexpr as predicate
>>>> for force_gimple_operand).
>>>
>>> The obvious question is whether or not Andrew's simpler change picks up as
>>> many transformations as Zhenqiang's change.  If not are the things missed
>>> important.
>>>
>>> Zhenqiang, can you do some testing of your change vs Andrew P.'s change?
>>
>> Here is a rough compare:
>>
>> 1) Andrew P.'s change can not handle ssa-ifcombine-ccmp-3.c (included
>> in my patch). Root cause is that it does not skip "LABEL". The guard
>> to do this opt should be the same the bb_has_overhead_p in my patch.
>
> This should be an easy change, I am working on this right now.
>
>>
>> 2) Andrew P.'s change always generate TRUTH_AND_EXPR, which is not
>> efficient for "||". e.g. For ssa-ifcombine-ccmp-6.c, it will generate
>>
>>   _3 = a_2(D) > 0;
>>   _5 = b_4(D) > 0;
>>   _6 = _3 | _5;
>>   _9 = c_7(D) <= 0;
>>   _10 = ~_6;
>>   _11 = _9 & _10;
>>   if (_11 == 0)
>>
>> With my patch, it will generate
>>
>>   _3 = a_2(D) > 0;
>>   _5 = b_4(D) > 0;
>>   _6 = _3 | _5;
>>   _9 = c_7(D) > 0;
>>   _10 = _6 | _9;
>>   if (_10 != 0)
>
> As mentioned otherwise, this seems like a missed optimization inside
> forwprop.  When I originally wrote this code there used to be two
> cases one for & and one for |, but this was removed sometime and I
> just made the code evolve with that.

Actually I can make a small patch (3 lines) to my current patch which
causes tree-ssa-ifcombine.c to produce the behavior of your patch.

 if (result_inv)
   {
     t = fold_build1 (TRUTH_NOT_EXPR, TREE_TYPE (t), t);
     result_inv = false;
   }

I will submit a new patch after some testing of my current patch which
fixes items 1 and 2.

Thanks,
Andrew Pinski


>
>>
>> 3) The good thing of Andrew P.'s change is that "Inverse the order of
>> the basic block walk" so it can do combine recursively.
>>
>> But I think we need some heuristic to control the number of ifs. Move
>> too much compares from
>> the inner_bb to outer_bb is not good.
>
>
> I think this depends on the target.  For MIPS we don't want an upper
> bound as integer comparisons go directly to GPRs.  I wrote this patch
> with MIPS in mind as that was the target I was working on.
>
>>
>> 4) Another good thing of Andrew P.'s change is that it reuses some
>> existing functions. So it looks much simple.
>
>
> Thanks,
> Andrew Pinski
>
>>
>>>>
>>>> With that we should be able to kill the fold-const.c transform?
>>>
>>> That would certainly be nice and an excellent follow-up for Zhenqiang.
>>
>> That's my final goal to "kill the fold-const.c transform". I think we
>> may combine the two changes to make a "simple" and "good" patch.
>>
>> Thanks!
>> -Zhenqiang

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-10-27  9:17           ` Andrew Pinski
@ 2013-10-27 20:28             ` Andrew Pinski
  2013-10-29  1:33               ` Zhenqiang Chen
                                 ` (2 more replies)
  0 siblings, 3 replies; 31+ messages in thread
From: Andrew Pinski @ 2013-10-27 20:28 UTC (permalink / raw)
  To: Zhenqiang Chen; +Cc: Jeff Law, Richard Biener, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 4450 bytes --]

On Sat, Oct 26, 2013 at 4:49 PM, Andrew Pinski <pinskia@gmail.com> wrote:
> On Sat, Oct 26, 2013 at 2:30 PM, Andrew Pinski <pinskia@gmail.com> wrote:
>> On Fri, Oct 18, 2013 at 2:21 AM, Zhenqiang Chen
>> <zhenqiang.chen@linaro.org> wrote:
>>> On 18 October 2013 00:58, Jeff Law <law@redhat.com> wrote:
>>>> On 10/17/13 05:03, Richard Biener wrote:
>>>>>>>
>>>>>>> Is it OK for trunk?
>>>>>>
>>>>>>
>>>>>> I had a much simpler change which did basically the same from 4.7 (I
>>>>>> can update it if people think this is a better approach).
>>>>>
>>>>>
>>>>> I like that more (note you can now use is_gimple_condexpr as predicate
>>>>> for force_gimple_operand).
>>>>
>>>> The obvious question is whether or not Andrew's simpler change picks up as
>>>> many transformations as Zhenqiang's change.  If not are the things missed
>>>> important.
>>>>
>>>> Zhenqiang, can you do some testing of your change vs Andrew P.'s change?
>>>
>>> Here is a rough compare:
>>>
>>> 1) Andrew P.'s change can not handle ssa-ifcombine-ccmp-3.c (included
>>> in my patch). Root cause is that it does not skip "LABEL". The guard
>>> to do this opt should be the same the bb_has_overhead_p in my patch.
>>
>> This should be an easy change, I am working on this right now.
>>
>>>
>>> 2) Andrew P.'s change always generate TRUTH_AND_EXPR, which is not
>>> efficient for "||". e.g. For ssa-ifcombine-ccmp-6.c, it will generate
>>>
>>>   _3 = a_2(D) > 0;
>>>   _5 = b_4(D) > 0;
>>>   _6 = _3 | _5;
>>>   _9 = c_7(D) <= 0;
>>>   _10 = ~_6;
>>>   _11 = _9 & _10;
>>>   if (_11 == 0)
>>>
>>> With my patch, it will generate
>>>
>>>   _3 = a_2(D) > 0;
>>>   _5 = b_4(D) > 0;
>>>   _6 = _3 | _5;
>>>   _9 = c_7(D) > 0;
>>>   _10 = _6 | _9;
>>>   if (_10 != 0)
>>
>> As mentioned otherwise, this seems like a missed optimization inside
>> forwprop.  When I originally wrote this code there used to be two
>> cases one for & and one for |, but this was removed sometime and I
>> just made the code evolve with that.
>
> Actually I can make a small patch (3 lines) to my current patch which
> causes tree-ssa-ifcombine.c to produce the behavior of your patch.
>
>  if (result_inv)
>    {
>      t = fold_build1 (TRUTH_NOT_EXPR, TREE_TYPE (t), t);
>      result_inv = false;
>    }
>
> I will submit a new patch after some testing of my current patch which
> fixes items 1 and 2.


Here is my latest patch which adds the testcases from Zhenqiang's
patch and fixes item 1 and 2.

OK?  Bootstrapped and tested on x86_64-linux-gnu with no regressions.

Thanks,
Andrew Pinski

ChangeLog:
* tree-ssa-ifcombine.c: Include rtl.h and tm_p.h.
(ifcombine_ifandif): Handle cases where
maybe_fold_and_comparisons fails, combining the branches
anyways.
(tree_ssa_ifcombine): Inverse the order of
the basic block walk, increases the number of combinings.
* gimple.h (gsi_start_nondebug_after_labels_bb): New function.


testsuite/ChangeLog:

* gcc.dg/tree-ssa/ssa-ifcombine-ccmp-1.c: New test case.
* gcc.dg/tree-ssa/ssa-ifcombine-ccmp-2.c: New test case.
* gcc.dg/tree-ssa/ssa-ifcombine-ccmp-3.c: New test case.
* gcc.dg/tree-ssa/ssa-ifcombine-ccmp-4.c: New test case.
* gcc.dg/tree-ssa/ssa-ifcombine-ccmp-5.c: New test case.
* gcc.dg/tree-ssa/ssa-ifcombine-ccmp-6.c: New test case.

* gcc.dg/tree-ssa/phi-opt-9.c: Use a function call to prevent
conditional move to be used.
* gcc.dg/tree-ssa/ssa-dom-thread-3.c: Remove check for "one or more
intermediate".




>
> Thanks,
> Andrew Pinski
>
>
>>
>>>
>>> 3) The good thing of Andrew P.'s change is that "Inverse the order of
>>> the basic block walk" so it can do combine recursively.
>>>
>>> But I think we need some heuristic to control the number of ifs. Move
>>> too much compares from
>>> the inner_bb to outer_bb is not good.
>>
>>
>> I think this depends on the target.  For MIPS we don't want an upper
>> bound as integer comparisons go directly to GPRs.  I wrote this patch
>> with MIPS in mind as that was the target I was working on.
>>
>>>
>>> 4) Another good thing of Andrew P.'s change is that it reuses some
>>> existing functions. So it looks much simple.
>>
>>
>> Thanks,
>> Andrew Pinski
>>
>>>
>>>>>
>>>>> With that we should be able to kill the fold-const.c transform?
>>>>
>>>> That would certainly be nice and an excellent follow-up for Zhenqiang.
>>>
>>> That's my final goal to "kill the fold-const.c transform". I think we
>>> may combine the two changes to make a "simple" and "good" patch.
>>>
>>> Thanks!
>>> -Zhenqiang

[-- Attachment #2: ssaifcombine.diff.txt --]
[-- Type: text/plain, Size: 9431 bytes --]

Index: testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-1.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-1.c	(revision 0)
+++ testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-1.c	(revision 0)
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { ! "m68k*-*-* mmix*-*-* mep*-*-* bfin*-*-* v850*-*-* picochip*-*-* moxie*-*-* cris*-*-* m32c*-*-* fr30*-*-* mcore*-*-* powerpc*-*-* xtensa*-*-*"} } } */
+
+/* { dg-options "-O2 -g -fdump-tree-optimized" } */
+/* { dg-additional-options "-mbranch-cost=2" { target avr-*-* } } */
+
+int t (int a, int b)
+{
+  if (a > 0)
+    if (b > 0)
+      return 0;
+  return 1;
+}
+/* { dg-final { scan-tree-dump "\&" "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: testsuite/gcc.dg/tree-ssa/ssa-dom-thread-3.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/ssa-dom-thread-3.c	(revision 204095)
+++ testsuite/gcc.dg/tree-ssa/ssa-dom-thread-3.c	(working copy)
@@ -42,7 +42,7 @@ expand_one_var (tree var, unsigned char
     abort ();
 }
 /* We should thread the jump, through an intermediate block.  */
-/* { dg-final { scan-tree-dump-times "Threaded" 2 "dom1"} } */
-/* { dg-final { scan-tree-dump-times "Registering jump thread: \\(.*\\) incoming edge;  \\(.*\\) joiner;  \\(.*\\) nocopy;" 1 "dom1"} } */
+/* { dg-final { scan-tree-dump "Threaded" "dom1"} } */
+/* { dg-final { scan-tree-dump-times "Registering jump thread: \\(.*\\) incoming edge;" 1 "dom1"} } */
 /* { dg-final { cleanup-tree-dump "dom1" } } */
 
Index: testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-2.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-2.c	(revision 0)
+++ testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-2.c	(revision 0)
@@ -0,0 +1,17 @@
+/* { dg-do compile { target { ! "m68k*-*-* mmix*-*-* mep*-*-* bfin*-*-* v850*-*-* picochip*-*-* moxie*-*-* cris*-*-* m32c*-*-* fr30*-*-* mcore*-*-* powerpc*-*-* xtensa*-*-*"} } } */
+
+/* { dg-options "-O2 -g -fdump-tree-optimized" } */
+/* { dg-additional-options "-mbranch-cost=2" { target avr-*-* } } */
+
+int t (int a, int b)
+{
+  if (a > 0)
+    goto L1;
+  if (b > 0)
+    goto L1;
+  return 0;
+L1:
+  return 1;
+}
+/* { dg-final { scan-tree-dump "\|" "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-3.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-3.c	(revision 0)
+++ testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-3.c	(revision 0)
@@ -0,0 +1,20 @@
+/* { dg-do compile { target { ! "m68k*-*-* mmix*-*-* mep*-*-* bfin*-*-* v850*-*-* picochip*-*-* moxie*-*-* cris*-*-* m32c*-*-* fr30*-*-* mcore*-*-* powerpc*-*-* xtensa*-*-*"} } } */
+
+/* { dg-options "-O2 -g -fdump-tree-optimized" } */
+/* { dg-additional-options "-mbranch-cost=2" { target avr-*-* } } */
+
+int t (int a, int b)
+{
+  if (a > 0)
+    goto L1;
+  else
+    goto L2;
+L1:
+  if (b > 0)
+    goto L2;
+  return 5;
+L2:
+  return 6;
+}
+/* { dg-final { scan-tree-dump "\|" "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-4.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-4.c	(revision 0)
+++ testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-4.c	(revision 0)
@@ -0,0 +1,18 @@
+/* { dg-do compile { target { ! "m68k*-*-* mmix*-*-* mep*-*-* bfin*-*-* v850*-*-* picochip*-*-* moxie*-*-* cris*-*-* m32c*-*-* fr30*-*-* mcore*-*-* powerpc*-*-* xtensa*-*-*"} } } */
+
+/* { dg-options "-O2 -g -fdump-tree-optimized" } */
+/* { dg-additional-options "-mbranch-cost=2" { target avr-*-* } } */
+
+int t (int a, int b)
+{
+  if (a > 0)
+    goto L1;
+  if (b > 0)
+    goto L2;
+L1:
+  return 0;
+L2:
+  return 1;
+}
+/* { dg-final { scan-tree-dump "\&" "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: testsuite/gcc.dg/tree-ssa/phi-opt-9.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/phi-opt-9.c	(revision 204095)
+++ testsuite/gcc.dg/tree-ssa/phi-opt-9.c	(working copy)
@@ -2,13 +2,14 @@
 /* { dg-options "-O -fdump-tree-optimized" } */
 
 int g(int,int);
+int h(int);
 int f(int t, int c)
 {
   int d = 0;
   int e = 0;
   if (t)
     {
-      d = c+1;
+      d = h(c);
       e = t;
     }
   else d = 0, e = 0;
Index: testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-5.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-5.c	(revision 0)
+++ testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-5.c	(revision 0)
@@ -0,0 +1,13 @@
+/* { dg-do compile { target { ! "m68k*-*-* mmix*-*-* mep*-*-* bfin*-*-* v850*-*-* picochip*-*-* moxie*-*-* cris*-*-* m32c*-*-* fr30*-*-* mcore*-*-* powerpc*-*-* xtensa*-*-*"} } } */
+
+/* { dg-options "-O2 -g -fdump-tree-optimized" } */
+/* { dg-additional-options "-mbranch-cost=2" { target avr-*-* } } */
+
+int t (int a, int b, int c)
+{
+  if (a > 0 && b > 0 && c > 0)
+      return 0;
+  return 1;
+}
+/* { dg-final { scan-tree-dump-times "\&" 2 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-6.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-6.c	(revision 0)
+++ testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-6.c	(revision 0)
@@ -0,0 +1,13 @@
+/* { dg-do compile { target { ! "m68k*-*-* mmix*-*-* mep*-*-* bfin*-*-* v850*-*-* picochip*-*-* moxie*-*-* cris*-*-* m32c*-*-* fr30*-*-* mcore*-*-* powerpc*-*-* xtensa*-*-*"} } } */
+
+/* { dg-options "-O2 -g -fdump-tree-optimized" } */
+/* { dg-additional-options "-mbranch-cost=2" { target avr-*-* } } */
+
+int t (int a, int b, int c)
+{
+  if (a > 0 || b > 0 || c > 0)
+      return 0;
+  return 1;
+}
+/* { dg-final { scan-tree-dump-times "\\|" 2 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: tree-ssa-ifcombine.c
===================================================================
--- tree-ssa-ifcombine.c	(revision 204095)
+++ tree-ssa-ifcombine.c	(working copy)
@@ -22,6 +22,10 @@ along with GCC; see the file COPYING3.
 #include "system.h"
 #include "coretypes.h"
 #include "tm.h"
+/* rtl is needed only because arm back-end requires it for
+   BRANCH_COST.  */
+#include "rtl.h"
+#include "tm_p.h"
 #include "tree.h"
 #include "basic-block.h"
 #include "tree-pretty-print.h"
@@ -32,6 +36,12 @@ along with GCC; see the file COPYING3.
 #include "ssa-iterators.h"
 #include "tree-pass.h"
 
+#ifndef LOGICAL_OP_NON_SHORT_CIRCUIT
+#define LOGICAL_OP_NON_SHORT_CIRCUIT \
+  (BRANCH_COST (optimize_function_for_speed_p (cfun), \
+                false) >= 2)
+#endif
+
 /* This pass combines COND_EXPRs to simplify control flow.  It
    currently recognizes bit tests and comparisons in chains that
    represent logical and or logical or of two COND_EXPRs.
@@ -488,7 +498,35 @@ ifcombine_ifandif (basic_block inner_con
 					    outer_cond_code,
 					    gimple_cond_lhs (outer_cond),
 					    gimple_cond_rhs (outer_cond))))
-	return false;
+	{
+	  tree t1, t2;
+	  gimple_stmt_iterator gsi;
+	  if (!LOGICAL_OP_NON_SHORT_CIRCUIT)
+	    return false;
+	  /* Only do this optimization if the inner bb contains only the conditional. */
+	  if (!gsi_one_before_end_p (gsi_start_nondebug_after_labels_bb (inner_cond_bb)))
+	    return false;
+	  t1 = fold_build2_loc (gimple_location (inner_cond),
+				inner_cond_code,
+				boolean_type_node,
+				gimple_cond_lhs (inner_cond),
+				gimple_cond_rhs (inner_cond));
+	  t2 = fold_build2_loc (gimple_location (outer_cond),
+				outer_cond_code,
+				boolean_type_node,
+				gimple_cond_lhs (outer_cond),
+				gimple_cond_rhs (outer_cond));
+	  t = fold_build2_loc (gimple_location (inner_cond), 
+			       TRUTH_AND_EXPR, boolean_type_node, t1, t2);
+	  if (result_inv)
+	    {
+	      t = fold_build1 (TRUTH_NOT_EXPR, TREE_TYPE (t), t);
+	      result_inv = false;
+	    }
+	  gsi = gsi_for_stmt (inner_cond);
+	  t = force_gimple_operand_gsi_1 (&gsi, t, is_gimple_condexpr, NULL, true,
+					  GSI_SAME_STMT);
+        }
       if (result_inv)
 	t = fold_build1 (TRUTH_NOT_EXPR, TREE_TYPE (t), t);
       t = canonicalize_cond_expr_cond (t);
@@ -631,7 +669,7 @@ tree_ssa_ifcombine (void)
   bbs = single_pred_before_succ_order ();
   calculate_dominance_info (CDI_DOMINATORS);
 
-  for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; ++i)
+  for (i = n_basic_blocks - NUM_FIXED_BLOCKS -1; i >= 0; i--)
     {
       basic_block bb = bbs[i];
       gimple stmt = last_stmt (bb);
Index: gimple.h
===================================================================
--- gimple.h	(revision 204095)
+++ gimple.h	(working copy)
@@ -5533,6 +5533,19 @@ gsi_start_nondebug_bb (basic_block bb)
 
   return i;
 }
+/* Return a new iterator pointing to the first non-debug non-label statement in
+   basic block BB.  */
+
+static inline gimple_stmt_iterator
+gsi_start_nondebug_after_labels_bb (basic_block bb)
+{
+  gimple_stmt_iterator i = gsi_after_labels (bb);
+
+  if (!gsi_end_p (i) && is_gimple_debug (gsi_stmt (i)))
+    gsi_next_nondebug (&i);
+
+  return i;
+}
 
 /* Return a new iterator pointing to the last non-debug statement in
    basic block BB.  */

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-10-27 20:28             ` Andrew Pinski
@ 2013-10-29  1:33               ` Zhenqiang Chen
  2013-10-29  7:24               ` Jeff Law
  2013-10-29 10:39               ` Richard Biener
  2 siblings, 0 replies; 31+ messages in thread
From: Zhenqiang Chen @ 2013-10-29  1:33 UTC (permalink / raw)
  To: Andrew Pinski; +Cc: Jeff Law, Richard Biener, gcc-patches

On 28 October 2013 02:55, Andrew Pinski <pinskia@gmail.com> wrote:
> On Sat, Oct 26, 2013 at 4:49 PM, Andrew Pinski <pinskia@gmail.com> wrote:
>> On Sat, Oct 26, 2013 at 2:30 PM, Andrew Pinski <pinskia@gmail.com> wrote:
>>> On Fri, Oct 18, 2013 at 2:21 AM, Zhenqiang Chen
>>> <zhenqiang.chen@linaro.org> wrote:
>>>> On 18 October 2013 00:58, Jeff Law <law@redhat.com> wrote:
>>>>> On 10/17/13 05:03, Richard Biener wrote:
>>>>>>>>
>>>>>>>> Is it OK for trunk?
>>>>>>>
>>>>>>>
>>>>>>> I had a much simpler change which did basically the same from 4.7 (I
>>>>>>> can update it if people think this is a better approach).
>>>>>>
>>>>>>
>>>>>> I like that more (note you can now use is_gimple_condexpr as predicate
>>>>>> for force_gimple_operand).
>>>>>
>>>>> The obvious question is whether or not Andrew's simpler change picks up as
>>>>> many transformations as Zhenqiang's change.  If not are the things missed
>>>>> important.
>>>>>
>>>>> Zhenqiang, can you do some testing of your change vs Andrew P.'s change?
>>>>
>>>> Here is a rough compare:
>>>>
>>>> 1) Andrew P.'s change can not handle ssa-ifcombine-ccmp-3.c (included
>>>> in my patch). Root cause is that it does not skip "LABEL". The guard
>>>> to do this opt should be the same the bb_has_overhead_p in my patch.
>>>
>>> This should be an easy change, I am working on this right now.
>>>
>>>>
>>>> 2) Andrew P.'s change always generate TRUTH_AND_EXPR, which is not
>>>> efficient for "||". e.g. For ssa-ifcombine-ccmp-6.c, it will generate
>>>>
>>>>   _3 = a_2(D) > 0;
>>>>   _5 = b_4(D) > 0;
>>>>   _6 = _3 | _5;
>>>>   _9 = c_7(D) <= 0;
>>>>   _10 = ~_6;
>>>>   _11 = _9 & _10;
>>>>   if (_11 == 0)
>>>>
>>>> With my patch, it will generate
>>>>
>>>>   _3 = a_2(D) > 0;
>>>>   _5 = b_4(D) > 0;
>>>>   _6 = _3 | _5;
>>>>   _9 = c_7(D) > 0;
>>>>   _10 = _6 | _9;
>>>>   if (_10 != 0)
>>>
>>> As mentioned otherwise, this seems like a missed optimization inside
>>> forwprop.  When I originally wrote this code there used to be two
>>> cases one for & and one for |, but this was removed sometime and I
>>> just made the code evolve with that.
>>
>> Actually I can make a small patch (3 lines) to my current patch which
>> causes tree-ssa-ifcombine.c to produce the behavior of your patch.
>>
>>  if (result_inv)
>>    {
>>      t = fold_build1 (TRUTH_NOT_EXPR, TREE_TYPE (t), t);
>>      result_inv = false;
>>    }
>>
>> I will submit a new patch after some testing of my current patch which
>> fixes items 1 and 2.
>
>
> Here is my latest patch which adds the testcases from Zhenqiang's
> patch and fixes item 1 and 2.

The patch works fine for me. Bootstrap and no make check regression on
ARM Chromebook.

Thanks!
-Zhenqiang

> OK?  Bootstrapped and tested on x86_64-linux-gnu with no regressions.
>
> Thanks,
> Andrew Pinski
>
> ChangeLog:
> * tree-ssa-ifcombine.c: Include rtl.h and tm_p.h.
> (ifcombine_ifandif): Handle cases where
> maybe_fold_and_comparisons fails, combining the branches
> anyways.
> (tree_ssa_ifcombine): Inverse the order of
> the basic block walk, increases the number of combinings.
> * gimple.h (gsi_start_nondebug_after_labels_bb): New function.
>
>
> testsuite/ChangeLog:
>
> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-1.c: New test case.
> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-2.c: New test case.
> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-3.c: New test case.
> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-4.c: New test case.
> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-5.c: New test case.
> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-6.c: New test case.
>
> * gcc.dg/tree-ssa/phi-opt-9.c: Use a function call to prevent
> conditional move to be used.
> * gcc.dg/tree-ssa/ssa-dom-thread-3.c: Remove check for "one or more
> intermediate".
>
>
>
>
>>
>> Thanks,
>> Andrew Pinski
>>
>>
>>>
>>>>
>>>> 3) The good thing of Andrew P.'s change is that "Inverse the order of
>>>> the basic block walk" so it can do combine recursively.
>>>>
>>>> But I think we need some heuristic to control the number of ifs. Move
>>>> too much compares from
>>>> the inner_bb to outer_bb is not good.
>>>
>>>
>>> I think this depends on the target.  For MIPS we don't want an upper
>>> bound as integer comparisons go directly to GPRs.  I wrote this patch
>>> with MIPS in mind as that was the target I was working on.
>>>
>>>>
>>>> 4) Another good thing of Andrew P.'s change is that it reuses some
>>>> existing functions. So it looks much simple.
>>>
>>>
>>> Thanks,
>>> Andrew Pinski
>>>
>>>>
>>>>>>
>>>>>> With that we should be able to kill the fold-const.c transform?
>>>>>
>>>>> That would certainly be nice and an excellent follow-up for Zhenqiang.
>>>>
>>>> That's my final goal to "kill the fold-const.c transform". I think we
>>>> may combine the two changes to make a "simple" and "good" patch.
>>>>
>>>> Thanks!
>>>> -Zhenqiang

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-10-27 20:28             ` Andrew Pinski
  2013-10-29  1:33               ` Zhenqiang Chen
@ 2013-10-29  7:24               ` Jeff Law
  2013-10-29 15:30                 ` Andrew Pinski
  2013-10-29 10:39               ` Richard Biener
  2 siblings, 1 reply; 31+ messages in thread
From: Jeff Law @ 2013-10-29  7:24 UTC (permalink / raw)
  To: Andrew Pinski, Zhenqiang Chen; +Cc: Richard Biener, gcc-patches

On 10/27/13 12:55, Andrew Pinski wrote:
> Here is my latest patch which adds the testcases from Zhenqiang's
> patch and fixes item 1 and 2.
>
> OK?  Bootstrapped and tested on x86_64-linux-gnu with no regressions.
>
> Thanks,
> Andrew Pinski
>
> ChangeLog:
> * tree-ssa-ifcombine.c: Include rtl.h and tm_p.h.
> (ifcombine_ifandif): Handle cases where
> maybe_fold_and_comparisons fails, combining the branches
> anyways.
> (tree_ssa_ifcombine): Inverse the order of
> the basic block walk, increases the number of combinings.
> * gimple.h (gsi_start_nondebug_after_labels_bb): New function.
>
>
> testsuite/ChangeLog:
>
> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-1.c: New test case.
> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-2.c: New test case.
> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-3.c: New test case.
> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-4.c: New test case.
> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-5.c: New test case.
> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-6.c: New test case.
>
> * gcc.dg/tree-ssa/phi-opt-9.c: Use a function call to prevent
> conditional move to be used.
> * gcc.dg/tree-ssa/ssa-dom-thread-3.c: Remove check for "one or more
> intermediate".
I don't be able to look at this in any depth tonight.  However, I would 
ask that you pass along the dump file for ssa-dom-thread-3.c so that I 
can evaluate the correctness of your change better.

Thanks,
jeff

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-10-27 20:28             ` Andrew Pinski
  2013-10-29  1:33               ` Zhenqiang Chen
  2013-10-29  7:24               ` Jeff Law
@ 2013-10-29 10:39               ` Richard Biener
  2013-10-30  7:32                 ` Andrew Pinski
  2 siblings, 1 reply; 31+ messages in thread
From: Richard Biener @ 2013-10-29 10:39 UTC (permalink / raw)
  To: Andrew Pinski; +Cc: Zhenqiang Chen, Jeff Law, gcc-patches

On Sun, Oct 27, 2013 at 7:55 PM, Andrew Pinski <pinskia@gmail.com> wrote:
> On Sat, Oct 26, 2013 at 4:49 PM, Andrew Pinski <pinskia@gmail.com> wrote:
>> On Sat, Oct 26, 2013 at 2:30 PM, Andrew Pinski <pinskia@gmail.com> wrote:
>>> On Fri, Oct 18, 2013 at 2:21 AM, Zhenqiang Chen
>>> <zhenqiang.chen@linaro.org> wrote:
>>>> On 18 October 2013 00:58, Jeff Law <law@redhat.com> wrote:
>>>>> On 10/17/13 05:03, Richard Biener wrote:
>>>>>>>>
>>>>>>>> Is it OK for trunk?
>>>>>>>
>>>>>>>
>>>>>>> I had a much simpler change which did basically the same from 4.7 (I
>>>>>>> can update it if people think this is a better approach).
>>>>>>
>>>>>>
>>>>>> I like that more (note you can now use is_gimple_condexpr as predicate
>>>>>> for force_gimple_operand).
>>>>>
>>>>> The obvious question is whether or not Andrew's simpler change picks up as
>>>>> many transformations as Zhenqiang's change.  If not are the things missed
>>>>> important.
>>>>>
>>>>> Zhenqiang, can you do some testing of your change vs Andrew P.'s change?
>>>>
>>>> Here is a rough compare:
>>>>
>>>> 1) Andrew P.'s change can not handle ssa-ifcombine-ccmp-3.c (included
>>>> in my patch). Root cause is that it does not skip "LABEL". The guard
>>>> to do this opt should be the same the bb_has_overhead_p in my patch.
>>>
>>> This should be an easy change, I am working on this right now.
>>>
>>>>
>>>> 2) Andrew P.'s change always generate TRUTH_AND_EXPR, which is not
>>>> efficient for "||". e.g. For ssa-ifcombine-ccmp-6.c, it will generate
>>>>
>>>>   _3 = a_2(D) > 0;
>>>>   _5 = b_4(D) > 0;
>>>>   _6 = _3 | _5;
>>>>   _9 = c_7(D) <= 0;
>>>>   _10 = ~_6;
>>>>   _11 = _9 & _10;
>>>>   if (_11 == 0)
>>>>
>>>> With my patch, it will generate
>>>>
>>>>   _3 = a_2(D) > 0;
>>>>   _5 = b_4(D) > 0;
>>>>   _6 = _3 | _5;
>>>>   _9 = c_7(D) > 0;
>>>>   _10 = _6 | _9;
>>>>   if (_10 != 0)
>>>
>>> As mentioned otherwise, this seems like a missed optimization inside
>>> forwprop.  When I originally wrote this code there used to be two
>>> cases one for & and one for |, but this was removed sometime and I
>>> just made the code evolve with that.
>>
>> Actually I can make a small patch (3 lines) to my current patch which
>> causes tree-ssa-ifcombine.c to produce the behavior of your patch.
>>
>>  if (result_inv)
>>    {
>>      t = fold_build1 (TRUTH_NOT_EXPR, TREE_TYPE (t), t);
>>      result_inv = false;
>>    }
>>
>> I will submit a new patch after some testing of my current patch which
>> fixes items 1 and 2.
>
>
> Here is my latest patch which adds the testcases from Zhenqiang's
> patch and fixes item 1 and 2.
>
> OK?  Bootstrapped and tested on x86_64-linux-gnu with no regressions.

   return i;
 }
+/* Return a new iterator pointing to the first non-debug non-label statement in
+   basic block BB.  */
+
+static inline gimple_stmt_iterator
+gsi_start_nondebug_after_labels_bb (basic_block bb)


vertical space before the comment missing.

@@ -631,7 +669,7 @@ tree_ssa_ifcombine (void)
   bbs = single_pred_before_succ_order ();
   calculate_dominance_info (CDI_DOMINATORS);

-  for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; ++i)
+  for (i = n_basic_blocks - NUM_FIXED_BLOCKS -1; i >= 0; i--)
     {
       basic_block bb = bbs[i];
       gimple stmt = last_stmt (bb);

The original code matches what phiopt does which even has a comment:

  /* Search every basic block for COND_EXPR we may be able to optimize.

     We walk the blocks in order that guarantees that a block with
     a single predecessor is processed before the predecessor.
     This ensures that we collapse inner ifs before visiting the
     outer ones, and also that we do not try to visit a removed
     block.  */
  bb_order = single_pred_before_succ_order ();
  n = n_basic_blocks - NUM_FIXED_BLOCKS;

  for (i = 0; i < n; i++)

so if the reverse order is better here (and in phiopt?) then it at least
needs a comment explaining why.

Otherwise the patch looks good, but please iterate with Jeff over
the dom testcase issue.

Thanks,
Richard.

> Thanks,
> Andrew Pinski
>
> ChangeLog:
> * tree-ssa-ifcombine.c: Include rtl.h and tm_p.h.
> (ifcombine_ifandif): Handle cases where
> maybe_fold_and_comparisons fails, combining the branches
> anyways.
> (tree_ssa_ifcombine): Inverse the order of
> the basic block walk, increases the number of combinings.
> * gimple.h (gsi_start_nondebug_after_labels_bb): New function.
>
>
> testsuite/ChangeLog:
>
> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-1.c: New test case.
> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-2.c: New test case.
> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-3.c: New test case.
> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-4.c: New test case.
> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-5.c: New test case.
> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-6.c: New test case.
>
> * gcc.dg/tree-ssa/phi-opt-9.c: Use a function call to prevent
> conditional move to be used.
> * gcc.dg/tree-ssa/ssa-dom-thread-3.c: Remove check for "one or more
> intermediate".
>
>
>
>
>>
>> Thanks,
>> Andrew Pinski
>>
>>
>>>
>>>>
>>>> 3) The good thing of Andrew P.'s change is that "Inverse the order of
>>>> the basic block walk" so it can do combine recursively.
>>>>
>>>> But I think we need some heuristic to control the number of ifs. Move
>>>> too much compares from
>>>> the inner_bb to outer_bb is not good.
>>>
>>>
>>> I think this depends on the target.  For MIPS we don't want an upper
>>> bound as integer comparisons go directly to GPRs.  I wrote this patch
>>> with MIPS in mind as that was the target I was working on.
>>>
>>>>
>>>> 4) Another good thing of Andrew P.'s change is that it reuses some
>>>> existing functions. So it looks much simple.
>>>
>>>
>>> Thanks,
>>> Andrew Pinski
>>>
>>>>
>>>>>>
>>>>>> With that we should be able to kill the fold-const.c transform?
>>>>>
>>>>> That would certainly be nice and an excellent follow-up for Zhenqiang.
>>>>
>>>> That's my final goal to "kill the fold-const.c transform". I think we
>>>> may combine the two changes to make a "simple" and "good" patch.
>>>>
>>>> Thanks!
>>>> -Zhenqiang

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-10-29  7:24               ` Jeff Law
@ 2013-10-29 15:30                 ` Andrew Pinski
  2013-10-29 18:42                   ` Jeff Law
  0 siblings, 1 reply; 31+ messages in thread
From: Andrew Pinski @ 2013-10-29 15:30 UTC (permalink / raw)
  To: Jeff Law; +Cc: Zhenqiang Chen, Richard Biener, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1869 bytes --]

On Mon, Oct 28, 2013 at 10:51 PM, Jeff Law <law@redhat.com> wrote:
> On 10/27/13 12:55, Andrew Pinski wrote:
>>
>> Here is my latest patch which adds the testcases from Zhenqiang's
>> patch and fixes item 1 and 2.
>>
>> OK?  Bootstrapped and tested on x86_64-linux-gnu with no regressions.
>>
>> Thanks,
>> Andrew Pinski
>>
>> ChangeLog:
>> * tree-ssa-ifcombine.c: Include rtl.h and tm_p.h.
>> (ifcombine_ifandif): Handle cases where
>> maybe_fold_and_comparisons fails, combining the branches
>> anyways.
>> (tree_ssa_ifcombine): Inverse the order of
>> the basic block walk, increases the number of combinings.
>> * gimple.h (gsi_start_nondebug_after_labels_bb): New function.
>>
>>
>> testsuite/ChangeLog:
>>
>> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-1.c: New test case.
>> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-2.c: New test case.
>> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-3.c: New test case.
>> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-4.c: New test case.
>> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-5.c: New test case.
>> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-6.c: New test case.
>>
>> * gcc.dg/tree-ssa/phi-opt-9.c: Use a function call to prevent
>> conditional move to be used.
>> * gcc.dg/tree-ssa/ssa-dom-thread-3.c: Remove check for "one or more
>> intermediate".
>
> I don't be able to look at this in any depth tonight.  However, I would ask
> that you pass along the dump file for ssa-dom-thread-3.c so that I can
> evaluate the correctness of your change better.

We no longer jump thread one case as we have:

  <bb 2>:
  var_3 = var_1(D)->ssa_name.var;
  _4 = var_1(D)->base.code;
  if (_4 == 1)
    goto <bb 3>;
  else
    goto <bb 5>;

...

 <bb 5>:
  _6 = var_3->base.code;
  _13 = _4 != 1;
  _14 = _6 != 0;
  _12 = _13 & _14;
  if (_12 != 0)
    goto <bb 8>;
  else
    goto <bb 6>;

Attached is the full dump too.

Thanks,
Andrew Pinski



>
> Thanks,
> jeff
>

[-- Attachment #2: ssa-dom-thread-3.c.077t.dom1 --]
[-- Type: application/octet-stream, Size: 6684 bytes --]


;; Function expand_one_var (expand_one_var, funcdef_no=0, decl_uid=1753, symbol_order=0)

;; 1 loops found
;;
;; Loop 0
;;  header 0, latch 1
;;  depth 0, outer -1
;;  nodes: 0 1 2 3 4 5 6 7 8
;; 2 succs { 3 5 }
;; 3 succs { 4 5 }
;; 4 succs { }
;; 5 succs { 8 6 }
;; 6 succs { 7 8 }
;; 7 succs { }
;; 8 succs { 1 }


Optimizing block #0



Optimizing block #2

Optimizing statement var_3 = var_1(D)->ssa_name.var;
LKUP STMT var_3 = var_1(D)->ssa_name.var
          var_3 = var_1(D)->ssa_name.var;
2>>> STMT var_3 = var_1(D)->ssa_name.var
          var_3 = var_1(D)->ssa_name.var;
Optimizing statement _4 = var_1(D)->base.code;
LKUP STMT _4 = var_1(D)->base.code
          _4 = var_1(D)->base.code;
2>>> STMT _4 = var_1(D)->base.code
          _4 = var_1(D)->base.code;
Optimizing statement if (_4 == 1)
LKUP STMT _4 eq_expr 1
          if (_4 == 1)


Optimizing block #3

0>>> COPY _4 = 1
1>>> COND 1 = _4 le_expr 1
1>>> COND 1 = _4 ge_expr 1
1>>> COND 1 = _4 eq_expr 1
1>>> COND 0 = _4 ne_expr 1
Optimizing statement _5 = var_3->base.code;
LKUP STMT _5 = var_3->base.code
          _5 = var_3->base.code;
2>>> STMT _5 = var_3->base.code
          _5 = var_3->base.code;
Optimizing statement if (_5 == 0)
LKUP STMT _5 eq_expr 0
          if (_5 == 0)


Optimizing block #4

0>>> COPY _5 = 0
1>>> COND 1 = _5 le_expr 0
1>>> COND 1 = _5 ge_expr 0
1>>> COND 1 = _5 eq_expr 0
1>>> COND 0 = _5 ne_expr 0
Optimizing statement abort ();
<<<< COND 0 = _5 ne_expr 0
<<<< COND 1 = _5 eq_expr 0
<<<< COND 1 = _5 ge_expr 0
<<<< COND 1 = _5 le_expr 0
<<<< COPY _5 = 0
1>>> COND 1 = _5 ne_expr 0
1>>> COND 0 = _5 eq_expr 0
LKUP STMT _6 = var_3->base.code
          _6 = var_3->base.code;
FIND: _5
LKUP STMT _14 = _5 ne_expr 0
          _14 = _5 != 0;
FIND: 1
  Registering jump thread: (3, 5) incoming edge;  (5, 6) normal;
<<<< COND 0 = _5 eq_expr 0
<<<< COND 1 = _5 ne_expr 0
<<<< STMT _5 = var_3->base.code
          _5 = var_3->base.code;
<<<< COND 0 = _4 ne_expr 1
<<<< COND 1 = _4 eq_expr 1
<<<< COND 1 = _4 ge_expr 1
<<<< COND 1 = _4 le_expr 1
<<<< COPY _4 = 1


Optimizing block #5

Optimizing statement _6 = var_3->base.code;
LKUP STMT _6 = var_3->base.code
          _6 = var_3->base.code;
2>>> STMT _6 = var_3->base.code
          _6 = var_3->base.code;
Optimizing statement _13 = _4 != 1;
LKUP STMT _13 = _4 ne_expr 1
          _13 = _4 != 1;
2>>> STMT _13 = _4 ne_expr 1
          _13 = _4 != 1;
Optimizing statement _14 = _6 != 0;
LKUP STMT _14 = _6 ne_expr 0
          _14 = _6 != 0;
2>>> STMT _14 = _6 ne_expr 0
          _14 = _6 != 0;
Optimizing statement _12 = _13 & _14;
LKUP STMT _12 = _13 bit_and_expr _14
          _12 = _13 & _14;
2>>> STMT _12 = _13 bit_and_expr _14
          _12 = _13 & _14;
Optimizing statement if (_12 != 0)
LKUP STMT _12 ne_expr 0
          if (_12 != 0)


Optimizing block #6

0>>> COPY _12 = 0
Optimizing statement _7 = (int) _6;
LKUP STMT _7 = nop_expr _6
          _7 = (int) _6;
2>>> STMT _7 = nop_expr _6
          _7 = (int) _6;
Optimizing statement _8 = tree_contains_struct[_7][0];
LKUP STMT _8 = tree_contains_struct[_7][0]
          _8 = tree_contains_struct[_7][0];
2>>> STMT _8 = tree_contains_struct[_7][0]
          _8 = tree_contains_struct[_7][0];
Optimizing statement if (_8 != 1)
LKUP STMT _8 ne_expr 1
          if (_8 != 1)


Optimizing block #7

1>>> COND 1 = _8 ne_expr 1
1>>> COND 0 = _8 eq_expr 1
Optimizing statement abort ();
<<<< COND 0 = _8 eq_expr 1
<<<< COND 1 = _8 ne_expr 1
<<<< STMT _8 = tree_contains_struct[_7][0]
          _8 = tree_contains_struct[_7][0];
<<<< STMT _7 = nop_expr _6
          _7 = (int) _6;
<<<< COPY _12 = 0


Optimizing block #8

Optimizing statement return;
<<<< STMT _12 = _13 bit_and_expr _14
          _12 = _13 & _14;
<<<< STMT _14 = _6 ne_expr 0
          _14 = _6 != 0;
<<<< STMT _13 = _4 ne_expr 1
          _13 = _4 != 1;
<<<< STMT _6 = var_3->base.code
          _6 = var_3->base.code;
1>>> COND 1 = _4 ne_expr 1
1>>> COND 0 = _4 eq_expr 1
LKUP STMT _6 = var_3->base.code
          _6 = var_3->base.code;
LKUP STMT _13 = _4 ne_expr 1
          _13 = _4 != 1;
FIND: 1
LKUP STMT _14 = _6 ne_expr 0
          _14 = _6 != 0;
LKUP STMT _14 ne_expr 0
          if (_14 != 0)
<<<< COND 0 = _4 eq_expr 1
<<<< COND 1 = _4 ne_expr 1
<<<< STMT _4 = var_1(D)->base.code
          _4 = var_1(D)->base.code;
<<<< STMT var_3 = var_1(D)->ssa_name.var
          var_3 = var_1(D)->ssa_name.var;
  Threaded jump 3 --> 5 to 9

Updating SSA:
creating PHI node in block #6 for _6
Registering new PHI nodes in block #2
Registering new PHI nodes in block #5
Updating SSA information for statement _6 = var_3->base.code;
Updating SSA information for statement _13 = _4 != 1;
Updating SSA information for statement _14 = _6 != 0;
Updating SSA information for statement _12 = _13 & _14;
Updating SSA information for statement if (_12 != 0)
Registering new PHI nodes in block #3
Registering new PHI nodes in block #9
Updating SSA information for statement _10 = var_3->base.code;
Updating SSA information for statement _11 = _4 != 1;
Updating SSA information for statement _9 = _6 != 0;
Updating SSA information for statement _15 = _13 & _14;
Registering new PHI nodes in block #4
Registering new PHI nodes in block #6
Updating SSA information for statement _7 = (int) _6;
Registering new PHI nodes in block #7
Registering new PHI nodes in block #8
Updating SSA information for statement return;

SSA replacement table
N_i -> { O_1 ... O_j } means that N_i replaces O_1, ..., O_j

_9 -> { _14 }
_10 -> { _6 }
_11 -> { _13 }
_15 -> { _12 }
_16 -> { _6 }
Incremental SSA update started at block: 2
Number of blocks in CFG: 10
Number of blocks to update: 4 ( 40%)
Affected blocks: 5 6 8 9


expand_one_var (union tree_node * var, unsigned char toplevel, unsigned char really_expand)
{
  short unsigned int _4;
  short unsigned int _5;
  short unsigned int _6;
  int _7;
  unsigned char _8;
  _Bool _9;
  short unsigned int _10;
  _Bool _11;
  _Bool _12;
  _Bool _13;
  _Bool _14;
  _Bool _15;
  short unsigned int _16;

  <bb 2>:
  var_3 = var_1(D)->ssa_name.var;
  _4 = var_1(D)->base.code;
  if (_4 == 1)
    goto <bb 3>;
  else
    goto <bb 5>;

  <bb 3>:
  _5 = var_3->base.code;
  if (_5 == 0)
    goto <bb 4>;
  else
    goto <bb 9>;

  <bb 4>:
  abort ();

  <bb 5>:
  _6 = var_3->base.code;
  _13 = _4 != 1;
  _14 = _6 != 0;
  _12 = _13 & _14;
  if (_12 != 0)
    goto <bb 8>;
  else
    goto <bb 6>;

  <bb 6>:
  # _16 = PHI <_6(5), _10(9)>
  _7 = (int) _16;
  _8 = tree_contains_struct[_7][0];
  if (_8 != 1)
    goto <bb 7>;
  else
    goto <bb 8>;

  <bb 7>:
  abort ();

  <bb 8>:
  return;

  <bb 9>:
  _10 = var_3->base.code;
  _11 = _4 != 1;
  _9 = _10 != 0;
  _15 = _11 & _9;
  goto <bb 6>;

}



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-10-29 15:30                 ` Andrew Pinski
@ 2013-10-29 18:42                   ` Jeff Law
  0 siblings, 0 replies; 31+ messages in thread
From: Jeff Law @ 2013-10-29 18:42 UTC (permalink / raw)
  To: Andrew Pinski; +Cc: Zhenqiang Chen, Richard Biener, gcc-patches

On 10/29/13 09:14, Andrew Pinski wrote:
> On Mon, Oct 28, 2013 at 10:51 PM, Jeff Law <law@redhat.com> wrote:
>> On 10/27/13 12:55, Andrew Pinski wrote:
>>>
>>> Here is my latest patch which adds the testcases from Zhenqiang's
>>> patch and fixes item 1 and 2.
>>>
>>> OK?  Bootstrapped and tested on x86_64-linux-gnu with no regressions.
>>>
>>> Thanks,
>>> Andrew Pinski
>>>
>>> ChangeLog:
>>> * tree-ssa-ifcombine.c: Include rtl.h and tm_p.h.
>>> (ifcombine_ifandif): Handle cases where
>>> maybe_fold_and_comparisons fails, combining the branches
>>> anyways.
>>> (tree_ssa_ifcombine): Inverse the order of
>>> the basic block walk, increases the number of combinings.
>>> * gimple.h (gsi_start_nondebug_after_labels_bb): New function.
>>>
>>>
>>> testsuite/ChangeLog:
>>>
>>> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-1.c: New test case.
>>> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-2.c: New test case.
>>> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-3.c: New test case.
>>> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-4.c: New test case.
>>> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-5.c: New test case.
>>> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-6.c: New test case.
>>>
>>> * gcc.dg/tree-ssa/phi-opt-9.c: Use a function call to prevent
>>> conditional move to be used.
>>> * gcc.dg/tree-ssa/ssa-dom-thread-3.c: Remove check for "one or more
>>> intermediate".
>>
>> I don't be able to look at this in any depth tonight.  However, I would ask
>> that you pass along the dump file for ssa-dom-thread-3.c so that I can
>> evaluate the correctness of your change better.
>
> We no longer jump thread one case as we have:
>
>    <bb 2>:
>    var_3 = var_1(D)->ssa_name.var;
>    _4 = var_1(D)->base.code;
>    if (_4 == 1)
>      goto <bb 3>;
>    else
>      goto <bb 5>;
>
> ...
>
>   <bb 5>:
>    _6 = var_3->base.code;
>    _13 = _4 != 1;
>    _14 = _6 != 0;
>    _12 = _13 & _14;
>    if (_12 != 0)
>      goto <bb 8>;
>    else
>      goto <bb 6>;
>
> Attached is the full dump too.
I mostly wanted to verify whether or not the test was worth running. 
It isn't after this change.  It no longer tests for the threader's 
ability to clone an intermediate block to isolate a path to a successor 
of the intermediate that is threadable.


Please just remove ssa-dom-thread-3.c as part of this checkin.

Thanks,

Jeff

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-10-29 10:39               ` Richard Biener
@ 2013-10-30  7:32                 ` Andrew Pinski
  2013-11-08 17:43                   ` Steven Bosscher
  0 siblings, 1 reply; 31+ messages in thread
From: Andrew Pinski @ 2013-10-30  7:32 UTC (permalink / raw)
  To: Richard Biener; +Cc: Zhenqiang Chen, Jeff Law, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 7358 bytes --]

On Tue, Oct 29, 2013 at 3:28 AM, Richard Biener
<richard.guenther@gmail.com> wrote:
> On Sun, Oct 27, 2013 at 7:55 PM, Andrew Pinski <pinskia@gmail.com> wrote:
>> On Sat, Oct 26, 2013 at 4:49 PM, Andrew Pinski <pinskia@gmail.com> wrote:
>>> On Sat, Oct 26, 2013 at 2:30 PM, Andrew Pinski <pinskia@gmail.com> wrote:
>>>> On Fri, Oct 18, 2013 at 2:21 AM, Zhenqiang Chen
>>>> <zhenqiang.chen@linaro.org> wrote:
>>>>> On 18 October 2013 00:58, Jeff Law <law@redhat.com> wrote:
>>>>>> On 10/17/13 05:03, Richard Biener wrote:
>>>>>>>>>
>>>>>>>>> Is it OK for trunk?
>>>>>>>>
>>>>>>>>
>>>>>>>> I had a much simpler change which did basically the same from 4.7 (I
>>>>>>>> can update it if people think this is a better approach).
>>>>>>>
>>>>>>>
>>>>>>> I like that more (note you can now use is_gimple_condexpr as predicate
>>>>>>> for force_gimple_operand).
>>>>>>
>>>>>> The obvious question is whether or not Andrew's simpler change picks up as
>>>>>> many transformations as Zhenqiang's change.  If not are the things missed
>>>>>> important.
>>>>>>
>>>>>> Zhenqiang, can you do some testing of your change vs Andrew P.'s change?
>>>>>
>>>>> Here is a rough compare:
>>>>>
>>>>> 1) Andrew P.'s change can not handle ssa-ifcombine-ccmp-3.c (included
>>>>> in my patch). Root cause is that it does not skip "LABEL". The guard
>>>>> to do this opt should be the same the bb_has_overhead_p in my patch.
>>>>
>>>> This should be an easy change, I am working on this right now.
>>>>
>>>>>
>>>>> 2) Andrew P.'s change always generate TRUTH_AND_EXPR, which is not
>>>>> efficient for "||". e.g. For ssa-ifcombine-ccmp-6.c, it will generate
>>>>>
>>>>>   _3 = a_2(D) > 0;
>>>>>   _5 = b_4(D) > 0;
>>>>>   _6 = _3 | _5;
>>>>>   _9 = c_7(D) <= 0;
>>>>>   _10 = ~_6;
>>>>>   _11 = _9 & _10;
>>>>>   if (_11 == 0)
>>>>>
>>>>> With my patch, it will generate
>>>>>
>>>>>   _3 = a_2(D) > 0;
>>>>>   _5 = b_4(D) > 0;
>>>>>   _6 = _3 | _5;
>>>>>   _9 = c_7(D) > 0;
>>>>>   _10 = _6 | _9;
>>>>>   if (_10 != 0)
>>>>
>>>> As mentioned otherwise, this seems like a missed optimization inside
>>>> forwprop.  When I originally wrote this code there used to be two
>>>> cases one for & and one for |, but this was removed sometime and I
>>>> just made the code evolve with that.
>>>
>>> Actually I can make a small patch (3 lines) to my current patch which
>>> causes tree-ssa-ifcombine.c to produce the behavior of your patch.
>>>
>>>  if (result_inv)
>>>    {
>>>      t = fold_build1 (TRUTH_NOT_EXPR, TREE_TYPE (t), t);
>>>      result_inv = false;
>>>    }
>>>
>>> I will submit a new patch after some testing of my current patch which
>>> fixes items 1 and 2.
>>
>>
>> Here is my latest patch which adds the testcases from Zhenqiang's
>> patch and fixes item 1 and 2.
>>
>> OK?  Bootstrapped and tested on x86_64-linux-gnu with no regressions.
>
>    return i;
>  }
> +/* Return a new iterator pointing to the first non-debug non-label statement in
> +   basic block BB.  */
> +
> +static inline gimple_stmt_iterator
> +gsi_start_nondebug_after_labels_bb (basic_block bb)
>
>
> vertical space before the comment missing.
>
> @@ -631,7 +669,7 @@ tree_ssa_ifcombine (void)
>    bbs = single_pred_before_succ_order ();
>    calculate_dominance_info (CDI_DOMINATORS);
>
> -  for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; ++i)
> +  for (i = n_basic_blocks - NUM_FIXED_BLOCKS -1; i >= 0; i--)
>      {
>        basic_block bb = bbs[i];
>        gimple stmt = last_stmt (bb);
>
> The original code matches what phiopt does which even has a comment:
>
>   /* Search every basic block for COND_EXPR we may be able to optimize.
>
>      We walk the blocks in order that guarantees that a block with
>      a single predecessor is processed before the predecessor.
>      This ensures that we collapse inner ifs before visiting the
>      outer ones, and also that we do not try to visit a removed
>      block.  */
>   bb_order = single_pred_before_succ_order ();
>   n = n_basic_blocks - NUM_FIXED_BLOCKS;
>
>   for (i = 0; i < n; i++)
>
> so if the reverse order is better here (and in phiopt?) then it at least
> needs a comment explaining why.
>
> Otherwise the patch looks good, but please iterate with Jeff over
> the dom testcase issue.


Here is what I applied in the end; Jeff told me just to remove the
testcase.  I added the comment trying to explain why it was the
opposite order of PHI-opt.

Thanks,
Andrew Pinski

ChangeLog:
* tree-ssa-ifcombine.c: Include rtl.h and tm_p.h.
(ifcombine_ifandif): Handle cases where
maybe_fold_and_comparisons fails, combining the branches
anyways.
(tree_ssa_ifcombine): Inverse the order of
the basic block walk, increases the number of combinings.
* gimple.h (gsi_start_nondebug_after_labels_bb): New function.

testsuite/ChangeLog:

* gcc.dg/tree-ssa/ssa-ifcombine-ccmp-1.c: New test case.
* gcc.dg/tree-ssa/ssa-ifcombine-ccmp-2.c: New test case.
* gcc.dg/tree-ssa/ssa-ifcombine-ccmp-3.c: New test case.
* gcc.dg/tree-ssa/ssa-ifcombine-ccmp-4.c: New test case.
* gcc.dg/tree-ssa/ssa-ifcombine-ccmp-5.c: New test case.
* gcc.dg/tree-ssa/ssa-ifcombine-ccmp-6.c: New test case.

* gcc.dg/tree-ssa/phi-opt-9.c: Use a function call to prevent
conditional move to be used.
* gcc.dg/tree-ssa/ssa-dom-thread-3.c: Remove.


>
> Thanks,
> Richard.
>
>> Thanks,
>> Andrew Pinski
>>
>> ChangeLog:
>> * tree-ssa-ifcombine.c: Include rtl.h and tm_p.h.
>> (ifcombine_ifandif): Handle cases where
>> maybe_fold_and_comparisons fails, combining the branches
>> anyways.
>> (tree_ssa_ifcombine): Inverse the order of
>> the basic block walk, increases the number of combinings.
>> * gimple.h (gsi_start_nondebug_after_labels_bb): New function.
>>
>>
>> testsuite/ChangeLog:
>>
>> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-1.c: New test case.
>> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-2.c: New test case.
>> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-3.c: New test case.
>> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-4.c: New test case.
>> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-5.c: New test case.
>> * gcc.dg/tree-ssa/ssa-ifcombine-ccmp-6.c: New test case.
>>
>> * gcc.dg/tree-ssa/phi-opt-9.c: Use a function call to prevent
>> conditional move to be used.
>> * gcc.dg/tree-ssa/ssa-dom-thread-3.c: Remove check for "one or more
>> intermediate".
>>
>>
>>
>>
>>>
>>> Thanks,
>>> Andrew Pinski
>>>
>>>
>>>>
>>>>>
>>>>> 3) The good thing of Andrew P.'s change is that "Inverse the order of
>>>>> the basic block walk" so it can do combine recursively.
>>>>>
>>>>> But I think we need some heuristic to control the number of ifs. Move
>>>>> too much compares from
>>>>> the inner_bb to outer_bb is not good.
>>>>
>>>>
>>>> I think this depends on the target.  For MIPS we don't want an upper
>>>> bound as integer comparisons go directly to GPRs.  I wrote this patch
>>>> with MIPS in mind as that was the target I was working on.
>>>>
>>>>>
>>>>> 4) Another good thing of Andrew P.'s change is that it reuses some
>>>>> existing functions. So it looks much simple.
>>>>
>>>>
>>>> Thanks,
>>>> Andrew Pinski
>>>>
>>>>>
>>>>>>>
>>>>>>> With that we should be able to kill the fold-const.c transform?
>>>>>>
>>>>>> That would certainly be nice and an excellent follow-up for Zhenqiang.
>>>>>
>>>>> That's my final goal to "kill the fold-const.c transform". I think we
>>>>> may combine the two changes to make a "simple" and "good" patch.
>>>>>
>>>>> Thanks!
>>>>> -Zhenqiang

[-- Attachment #2: ssa_ifcombine.patch.txt --]
[-- Type: text/plain, Size: 10670 bytes --]

Index: testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-1.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-1.c	(revision 0)
+++ testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-1.c	(revision 0)
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { ! "m68k*-*-* mmix*-*-* mep*-*-* bfin*-*-* v850*-*-* picochip*-*-* moxie*-*-* cris*-*-* m32c*-*-* fr30*-*-* mcore*-*-* powerpc*-*-* xtensa*-*-*"} } } */
+
+/* { dg-options "-O2 -g -fdump-tree-optimized" } */
+/* { dg-additional-options "-mbranch-cost=2" { target avr-*-* } } */
+
+int t (int a, int b)
+{
+  if (a > 0)
+    if (b > 0)
+      return 0;
+  return 1;
+}
+/* { dg-final { scan-tree-dump "\&" "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: testsuite/gcc.dg/tree-ssa/ssa-dom-thread-3.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/ssa-dom-thread-3.c	(revision 204133)
+++ testsuite/gcc.dg/tree-ssa/ssa-dom-thread-3.c	(working copy)
@@ -1,48 +0,0 @@
-/* { dg-do compile } */ 
-/* { dg-options "-O2 -fdump-tree-dom1-details -fno-short-enums" } */
-
-extern void abort (void) __attribute__ ((__noreturn__));
-union tree_node;
-typedef union tree_node *tree;
-enum tree_code
-{
-  VAR_DECL,
-  SSA_NAME,
-  MAX_TREE_CODES
-};
-extern unsigned char tree_contains_struct[MAX_TREE_CODES][64];
-struct tree_base
-{
-  enum tree_code code:16;
-};
-enum tree_node_structure_enum
-{
-  TS_DECL_COMMON
-};
-struct tree_ssa_name
-{
-  tree var;
-};
-union tree_node
-{
-  struct tree_base base;
-  struct tree_ssa_name ssa_name;
-};
-long
-expand_one_var (tree var, unsigned char toplevel, unsigned char really_expand)
-{
-  tree origvar = var;
-  var = var->ssa_name.var;
-  if (((enum tree_code) (origvar)->base.code) == SSA_NAME
-      && !((var->base.code != VAR_DECL)))
-    abort ();
-  if ((var->base.code) != VAR_DECL && ((origvar)->base.code) != SSA_NAME)
-    ;
-  else if (tree_contains_struct[(var->base.code)][(TS_DECL_COMMON)] != 1)
-    abort ();
-}
-/* We should thread the jump, through an intermediate block.  */
-/* { dg-final { scan-tree-dump-times "Threaded" 2 "dom1"} } */
-/* { dg-final { scan-tree-dump-times "Registering jump thread: \\(.*\\) incoming edge;  \\(.*\\) joiner;  \\(.*\\) nocopy;" 1 "dom1"} } */
-/* { dg-final { cleanup-tree-dump "dom1" } } */
-
Index: testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-2.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-2.c	(revision 0)
+++ testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-2.c	(revision 0)
@@ -0,0 +1,17 @@
+/* { dg-do compile { target { ! "m68k*-*-* mmix*-*-* mep*-*-* bfin*-*-* v850*-*-* picochip*-*-* moxie*-*-* cris*-*-* m32c*-*-* fr30*-*-* mcore*-*-* powerpc*-*-* xtensa*-*-*"} } } */
+
+/* { dg-options "-O2 -g -fdump-tree-optimized" } */
+/* { dg-additional-options "-mbranch-cost=2" { target avr-*-* } } */
+
+int t (int a, int b)
+{
+  if (a > 0)
+    goto L1;
+  if (b > 0)
+    goto L1;
+  return 0;
+L1:
+  return 1;
+}
+/* { dg-final { scan-tree-dump "\|" "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-3.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-3.c	(revision 0)
+++ testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-3.c	(revision 0)
@@ -0,0 +1,20 @@
+/* { dg-do compile { target { ! "m68k*-*-* mmix*-*-* mep*-*-* bfin*-*-* v850*-*-* picochip*-*-* moxie*-*-* cris*-*-* m32c*-*-* fr30*-*-* mcore*-*-* powerpc*-*-* xtensa*-*-*"} } } */
+
+/* { dg-options "-O2 -g -fdump-tree-optimized" } */
+/* { dg-additional-options "-mbranch-cost=2" { target avr-*-* } } */
+
+int t (int a, int b)
+{
+  if (a > 0)
+    goto L1;
+  else
+    goto L2;
+L1:
+  if (b > 0)
+    goto L2;
+  return 5;
+L2:
+  return 6;
+}
+/* { dg-final { scan-tree-dump "\|" "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-4.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-4.c	(revision 0)
+++ testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-4.c	(revision 0)
@@ -0,0 +1,18 @@
+/* { dg-do compile { target { ! "m68k*-*-* mmix*-*-* mep*-*-* bfin*-*-* v850*-*-* picochip*-*-* moxie*-*-* cris*-*-* m32c*-*-* fr30*-*-* mcore*-*-* powerpc*-*-* xtensa*-*-*"} } } */
+
+/* { dg-options "-O2 -g -fdump-tree-optimized" } */
+/* { dg-additional-options "-mbranch-cost=2" { target avr-*-* } } */
+
+int t (int a, int b)
+{
+  if (a > 0)
+    goto L1;
+  if (b > 0)
+    goto L2;
+L1:
+  return 0;
+L2:
+  return 1;
+}
+/* { dg-final { scan-tree-dump "\&" "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: testsuite/gcc.dg/tree-ssa/phi-opt-9.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/phi-opt-9.c	(revision 204133)
+++ testsuite/gcc.dg/tree-ssa/phi-opt-9.c	(working copy)
@@ -2,13 +2,14 @@
 /* { dg-options "-O -fdump-tree-optimized" } */
 
 int g(int,int);
+int h(int);
 int f(int t, int c)
 {
   int d = 0;
   int e = 0;
   if (t)
     {
-      d = c+1;
+      d = h(c);
       e = t;
     }
   else d = 0, e = 0;
Index: testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-5.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-5.c	(revision 0)
+++ testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-5.c	(revision 0)
@@ -0,0 +1,13 @@
+/* { dg-do compile { target { ! "m68k*-*-* mmix*-*-* mep*-*-* bfin*-*-* v850*-*-* picochip*-*-* moxie*-*-* cris*-*-* m32c*-*-* fr30*-*-* mcore*-*-* powerpc*-*-* xtensa*-*-*"} } } */
+
+/* { dg-options "-O2 -g -fdump-tree-optimized" } */
+/* { dg-additional-options "-mbranch-cost=2" { target avr-*-* } } */
+
+int t (int a, int b, int c)
+{
+  if (a > 0 && b > 0 && c > 0)
+      return 0;
+  return 1;
+}
+/* { dg-final { scan-tree-dump-times "\&" 2 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-6.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-6.c	(revision 0)
+++ testsuite/gcc.dg/tree-ssa/ssa-ifcombine-ccmp-6.c	(revision 0)
@@ -0,0 +1,13 @@
+/* { dg-do compile { target { ! "m68k*-*-* mmix*-*-* mep*-*-* bfin*-*-* v850*-*-* picochip*-*-* moxie*-*-* cris*-*-* m32c*-*-* fr30*-*-* mcore*-*-* powerpc*-*-* xtensa*-*-*"} } } */
+
+/* { dg-options "-O2 -g -fdump-tree-optimized" } */
+/* { dg-additional-options "-mbranch-cost=2" { target avr-*-* } } */
+
+int t (int a, int b, int c)
+{
+  if (a > 0 || b > 0 || c > 0)
+      return 0;
+  return 1;
+}
+/* { dg-final { scan-tree-dump-times "\\|" 2 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: tree-ssa-ifcombine.c
===================================================================
--- tree-ssa-ifcombine.c	(revision 204133)
+++ tree-ssa-ifcombine.c	(working copy)
@@ -22,6 +22,10 @@ along with GCC; see the file COPYING3.
 #include "system.h"
 #include "coretypes.h"
 #include "tm.h"
+/* rtl is needed only because arm back-end requires it for
+   BRANCH_COST.  */
+#include "rtl.h"
+#include "tm_p.h"
 #include "tree.h"
 #include "basic-block.h"
 #include "tree-pretty-print.h"
@@ -32,6 +36,12 @@ along with GCC; see the file COPYING3.
 #include "ssa-iterators.h"
 #include "tree-pass.h"
 
+#ifndef LOGICAL_OP_NON_SHORT_CIRCUIT
+#define LOGICAL_OP_NON_SHORT_CIRCUIT \
+  (BRANCH_COST (optimize_function_for_speed_p (cfun), \
+                false) >= 2)
+#endif
+
 /* This pass combines COND_EXPRs to simplify control flow.  It
    currently recognizes bit tests and comparisons in chains that
    represent logical and or logical or of two COND_EXPRs.
@@ -488,7 +498,35 @@ ifcombine_ifandif (basic_block inner_con
 					    outer_cond_code,
 					    gimple_cond_lhs (outer_cond),
 					    gimple_cond_rhs (outer_cond))))
-	return false;
+	{
+	  tree t1, t2;
+	  gimple_stmt_iterator gsi;
+	  if (!LOGICAL_OP_NON_SHORT_CIRCUIT)
+	    return false;
+	  /* Only do this optimization if the inner bb contains only the conditional. */
+	  if (!gsi_one_before_end_p (gsi_start_nondebug_after_labels_bb (inner_cond_bb)))
+	    return false;
+	  t1 = fold_build2_loc (gimple_location (inner_cond),
+				inner_cond_code,
+				boolean_type_node,
+				gimple_cond_lhs (inner_cond),
+				gimple_cond_rhs (inner_cond));
+	  t2 = fold_build2_loc (gimple_location (outer_cond),
+				outer_cond_code,
+				boolean_type_node,
+				gimple_cond_lhs (outer_cond),
+				gimple_cond_rhs (outer_cond));
+	  t = fold_build2_loc (gimple_location (inner_cond), 
+			       TRUTH_AND_EXPR, boolean_type_node, t1, t2);
+	  if (result_inv)
+	    {
+	      t = fold_build1 (TRUTH_NOT_EXPR, TREE_TYPE (t), t);
+	      result_inv = false;
+	    }
+	  gsi = gsi_for_stmt (inner_cond);
+	  t = force_gimple_operand_gsi_1 (&gsi, t, is_gimple_condexpr, NULL, true,
+					  GSI_SAME_STMT);
+        }
       if (result_inv)
 	t = fold_build1 (TRUTH_NOT_EXPR, TREE_TYPE (t), t);
       t = canonicalize_cond_expr_cond (t);
@@ -631,7 +669,15 @@ tree_ssa_ifcombine (void)
   bbs = single_pred_before_succ_order ();
   calculate_dominance_info (CDI_DOMINATORS);
 
-  for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; ++i)
+  /* Search every basic block for COND_EXPR we may be able to optimize.
+
+     We walk the blocks in order that guarantees that a block with
+     a single predecessor is processed after the predecessor.
+     This ensures that we collapse outter ifs before visiting the
+     inner ones, and also that we do not try to visit a removed
+     block.  This is opposite of PHI-OPT, because we cascade the
+     combining rather than cascading PHIs. */
+  for (i = n_basic_blocks - NUM_FIXED_BLOCKS - 1; i >= 0; i--)
     {
       basic_block bb = bbs[i];
       gimple stmt = last_stmt (bb);
Index: gimple.h
===================================================================
--- gimple.h	(revision 204133)
+++ gimple.h	(working copy)
@@ -5534,6 +5534,20 @@ gsi_start_nondebug_bb (basic_block bb)
   return i;
 }
 
+/* Return a new iterator pointing to the first non-debug non-label statement in
+   basic block BB.  */
+
+static inline gimple_stmt_iterator
+gsi_start_nondebug_after_labels_bb (basic_block bb)
+{
+  gimple_stmt_iterator i = gsi_after_labels (bb);
+
+  if (!gsi_end_p (i) && is_gimple_debug (gsi_stmt (i)))
+    gsi_next_nondebug (&i);
+
+  return i;
+}
+
 /* Return a new iterator pointing to the last non-debug statement in
    basic block BB.  */
 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-10-30  7:32                 ` Andrew Pinski
@ 2013-11-08 17:43                   ` Steven Bosscher
  2013-11-08 18:09                     ` Steven Bosscher
  2013-11-08 18:10                     ` pinskia
  0 siblings, 2 replies; 31+ messages in thread
From: Steven Bosscher @ 2013-11-08 17:43 UTC (permalink / raw)
  To: Andrew Pinski; +Cc: Richard Biener, Zhenqiang Chen, Jeff Law, gcc-patches

On Wed, Oct 30, 2013 at 5:03 AM, Andrew Pinski wrote:
> Here is what I applied in the end; Jeff told me just to remove the
> testcase.  I added the comment trying to explain why it was the
> opposite order of PHI-opt.
>
> Thanks,
> Andrew Pinski
>
> ChangeLog:
> * tree-ssa-ifcombine.c: Include rtl.h and tm_p.h.

Eh, why???

The file has this comment:

25    /* rtl is needed only because arm back-end requires it for
26       BRANCH_COST.  */
27    #include "rtl.h"
28    #include "tm_p.h"

Can you please clarify why this is not something to be fixed in the
ARM back end?

You're taking the easy way out here, but it's a step in the wrong
direction from modularity point of view.

Ciao!
Steven

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-11-08 17:43                   ` Steven Bosscher
@ 2013-11-08 18:09                     ` Steven Bosscher
  2013-11-11 11:54                       ` Richard Biener
  2013-11-08 18:10                     ` pinskia
  1 sibling, 1 reply; 31+ messages in thread
From: Steven Bosscher @ 2013-11-08 18:09 UTC (permalink / raw)
  To: Andrew Pinski; +Cc: Richard Biener, Zhenqiang Chen, Jeff Law, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 780 bytes --]

On Fri, Nov 8, 2013 at 6:20 PM, Steven Bosscher wrote:
> On Wed, Oct 30, 2013 at 5:03 AM, Andrew Pinski wrote:
>> Here is what I applied in the end; Jeff told me just to remove the
>> testcase.  I added the comment trying to explain why it was the
>> opposite order of PHI-opt.
>>
>> Thanks,
>> Andrew Pinski
>>
>> ChangeLog:
>> * tree-ssa-ifcombine.c: Include rtl.h and tm_p.h.
>
> Eh, why???
>
> The file has this comment:
>
> 25    /* rtl is needed only because arm back-end requires it for
> 26       BRANCH_COST.  */
> 27    #include "rtl.h"
> 28    #include "tm_p.h"
>
> Can you please clarify why this is not something to be fixed in the
> ARM back end?


Could be fixed like attached. Can you please have a look and
foster-parent it if you like it?

Thanks,

Ciao!
Steven

[-- Attachment #2: arm_dont_expose_yourself.diff.txt --]
[-- Type: text/plain, Size: 3316 bytes --]


	* config/arm/arm-protos.h (arm_branch_cost,
	arm_logical_op_non_short_circuit): New prototypes.
	* config/arm/arm.c (arm_branch_cost): Implement it.
	(arm_logical_op_non_short_circuit): Likewise.
	* config/arm/arm.h (BRANCH_COST): Don't expose ARM back-end
	internals to the RTL middle end.
	(LOGICAL_OP_NON_SHORT_CIRCUIT): Likewise.
	* tree-ssa-ifcombine.c: Don't include rtl.h here.
	* tree-ssa-reassoc.c: Don't include rtl.h here either.

Index: config/arm/arm-protos.h
===================================================================
--- config/arm/arm-protos.h	(revision 204565)
+++ config/arm/arm-protos.h	(working copy)
@@ -288,4 +288,7 @@ extern bool arm_autoinc_modes_ok_p (enum machine_m
 
 extern void arm_emit_eabi_attribute (const char *, int, int);
 
+extern bool arm_branch_cost (bool, bool);
+extern bool arm_logical_op_non_short_circuit (void);
+
 #endif /* ! GCC_ARM_PROTOS_H */
Index: config/arm/arm.c
===================================================================
--- config/arm/arm.c	(revision 204565)
+++ config/arm/arm.c	(working copy)
@@ -30351,4 +30351,19 @@ arm_asan_shadow_offset (void)
   return (unsigned HOST_WIDE_INT) 1 << 29;
 }
 
+/* Implement BRANCH_COST as a function, to hide tune_params from the
+   rest of the compiler.  */
+bool
+arm_branch_cost (bool speed_p, bool predictable_p)
+{
+  return current_tune->branch_cost (speed_p, predictable_p);
+}
+
+/* Likewise for LOGICAL_OP_NON_SHORT_CIRCUIT.  */
+bool
+arm_logical_op_non_short_circuit (void)
+{
+  return (current_tune->logical_op_non_short_circuit[TARGET_ARM]);
+}
+
 #include "gt-arm.h"
Index: config/arm/arm.h
===================================================================
--- config/arm/arm.h	(revision 204565)
+++ config/arm/arm.h	(working copy)
@@ -2042,14 +2042,13 @@ enum arm_auto_incmodes
 /* Try to generate sequences that don't involve branches, we can then use
    conditional instructions.  */
 #define BRANCH_COST(speed_p, predictable_p) \
-  (current_tune->branch_cost (speed_p, predictable_p))
+  arm_branch_cost (speed_p, predictable_p)
 
 /* False if short circuit operation is preferred.  */
 #define LOGICAL_OP_NON_SHORT_CIRCUIT				\
   ((optimize_size)						\
    ? (TARGET_THUMB ? false : true)				\
-   : (current_tune->logical_op_non_short_circuit[TARGET_ARM]))
-
+   : arm_logical_op_non_short_circuit ()
 \f
 /* Position Independent Code.  */
 /* We decide which register to use based on the compilation options and
Index: tree-ssa-ifcombine.c
===================================================================
--- tree-ssa-ifcombine.c	(revision 204565)
+++ tree-ssa-ifcombine.c	(working copy)
@@ -22,9 +22,6 @@ along with GCC; see the file COPYING3.  If not see
 #include "system.h"
 #include "coretypes.h"
 #include "tm.h"
-/* rtl is needed only because arm back-end requires it for
-   BRANCH_COST.  */
-#include "rtl.h"
 #include "tm_p.h"
 #include "tree.h"
 #include "basic-block.h"
Index: tree-ssa-reassoc.c
===================================================================
--- tree-ssa-reassoc.c	(revision 204565)
+++ tree-ssa-reassoc.c	(working copy)
@@ -23,7 +23,6 @@ along with GCC; see the file COPYING3.  If not see
 #include "coretypes.h"
 #include "hash-table.h"
 #include "tm.h"
-#include "rtl.h"
 #include "tm_p.h"
 #include "tree.h"
 #include "basic-block.h"

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-11-08 17:43                   ` Steven Bosscher
  2013-11-08 18:09                     ` Steven Bosscher
@ 2013-11-08 18:10                     ` pinskia
  2013-11-08 22:15                       ` Jeff Law
  1 sibling, 1 reply; 31+ messages in thread
From: pinskia @ 2013-11-08 18:10 UTC (permalink / raw)
  To: Steven Bosscher; +Cc: Richard Biener, Zhenqiang Chen, Jeff Law, gcc-patches



> On Nov 8, 2013, at 9:20 AM, Steven Bosscher <stevenb.gcc@gmail.com> wrote:
> 
>> On Wed, Oct 30, 2013 at 5:03 AM, Andrew Pinski wrote:
>> Here is what I applied in the end; Jeff told me just to remove the
>> testcase.  I added the comment trying to explain why it was the
>> opposite order of PHI-opt.
>> 
>> Thanks,
>> Andrew Pinski
>> 
>> ChangeLog:
>> * tree-ssa-ifcombine.c: Include rtl.h and tm_p.h.
> 
> Eh, why???
> 
> The file has this comment:
> 
> 25    /* rtl is needed only because arm back-end requires it for
> 26       BRANCH_COST.  */
> 27    #include "rtl.h"
> 28    #include "tm_p.h"
> 
> Can you please clarify why this is not something to be fixed in the
> ARM back end?

  Really BRANCH_COST should be a target hook rather than a macro which should fix this issue.  fold-const.c has the same include for the same reason.  I thought I had saw a patch which changes it into a hook. Next week if I get time, I will do that.  I knew it was the wrong direction which is why I added a comment.

Thanks,
Andrew

> 
> You're taking the easy way out here, but it's a step in the wrong
> direction from modularity point of view.
> 
> Ciao!
> Steven

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-11-08 22:15                       ` Jeff Law
@ 2013-11-08 22:15                         ` Steven Bosscher
  0 siblings, 0 replies; 31+ messages in thread
From: Steven Bosscher @ 2013-11-08 22:15 UTC (permalink / raw)
  To: Jeff Law; +Cc: Andrew Pinski, Richard Biener, Zhenqiang Chen, gcc-patches

On Fri, Nov 8, 2013 at 10:46 PM, Jeff Law wrote:
> Also note that other patches have gone in recently which include those files
> for exactly the same reason.

I assume you're referring to tree-ssa-reassoc.c? It's the only new one
I could find since I cleaned rtl.h out from almost all tree-* files
last year.


>  I don't think Andrew did anything wrong here.

Yup. The problem came in with an ARM patch: http://gcc.gnu.org/r174578


> Clearly those can/should/will go away when BRANCH_COST turns into a target
> hook.  I'd like to do it myself, but I'm buried right now.

Hook would be better but even as a macro this can be solved, see the
patch I attached up thread.

There should be a ban, enforced somehow, on new #includes of rtl.h and
expr.h in GIMPLE passes...

Ciao!
Steven

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-11-08 18:10                     ` pinskia
@ 2013-11-08 22:15                       ` Jeff Law
  2013-11-08 22:15                         ` Steven Bosscher
  0 siblings, 1 reply; 31+ messages in thread
From: Jeff Law @ 2013-11-08 22:15 UTC (permalink / raw)
  To: pinskia, Steven Bosscher; +Cc: Richard Biener, Zhenqiang Chen, gcc-patches

On 11/08/13 10:45, pinskia@gmail.com wrote:
>
>
>> On Nov 8, 2013, at 9:20 AM, Steven Bosscher <stevenb.gcc@gmail.com>
>> wrote:
>>
>>> On Wed, Oct 30, 2013 at 5:03 AM, Andrew Pinski wrote: Here is
>>> what I applied in the end; Jeff told me just to remove the
>>> testcase.  I added the comment trying to explain why it was the
>>> opposite order of PHI-opt.
>>>
>>> Thanks, Andrew Pinski
>>>
>>> ChangeLog: * tree-ssa-ifcombine.c: Include rtl.h and tm_p.h.
>>
>> Eh, why???
>>
>> The file has this comment:
>>
>> 25    /* rtl is needed only because arm back-end requires it for 26
>> BRANCH_COST.  */ 27    #include "rtl.h" 28    #include "tm_p.h"
>>
>> Can you please clarify why this is not something to be fixed in
>> the ARM back end?
>
> Really BRANCH_COST should be a target hook rather than a macro which
> should fix this issue.  fold-const.c has the same include for the
> same reason.  I thought I had saw a patch which changes it into a
> hook. Next week if I get time, I will do that.  I knew it was the
> wrong direction which is why I added a comment.
Also note that other patches have gone in recently which include those 
files for exactly the same reason.  I don't think Andrew did anything 
wrong here.

Clearly those can/should/will go away when BRANCH_COST turns into a 
target hook.  I'd like to do it myself, but I'm buried right now.

jeff

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Enhance ifcombine to recover non short circuit branches
  2013-11-08 18:09                     ` Steven Bosscher
@ 2013-11-11 11:54                       ` Richard Biener
  0 siblings, 0 replies; 31+ messages in thread
From: Richard Biener @ 2013-11-11 11:54 UTC (permalink / raw)
  To: Steven Bosscher
  Cc: Andrew Pinski, Zhenqiang Chen, Jeff Law, gcc-patches, Richard Earnshaw

On Fri, Nov 8, 2013 at 6:41 PM, Steven Bosscher <stevenb.gcc@gmail.com> wrote:
> On Fri, Nov 8, 2013 at 6:20 PM, Steven Bosscher wrote:
>> On Wed, Oct 30, 2013 at 5:03 AM, Andrew Pinski wrote:
>>> Here is what I applied in the end; Jeff told me just to remove the
>>> testcase.  I added the comment trying to explain why it was the
>>> opposite order of PHI-opt.
>>>
>>> Thanks,
>>> Andrew Pinski
>>>
>>> ChangeLog:
>>> * tree-ssa-ifcombine.c: Include rtl.h and tm_p.h.
>>
>> Eh, why???
>>
>> The file has this comment:
>>
>> 25    /* rtl is needed only because arm back-end requires it for
>> 26       BRANCH_COST.  */
>> 27    #include "rtl.h"
>> 28    #include "tm_p.h"
>>
>> Can you please clarify why this is not something to be fixed in the
>> ARM back end?
>
>
> Could be fixed like attached. Can you please have a look and
> foster-parent it if you like it?

Looks good to me - Richard, can you pick it up?

Thanks,
Richard.

> Thanks,
>
> Ciao!
> Steven

^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2013-11-11 11:43 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-10-16  9:37 [PATCH] Enhance ifcombine to recover non short circuit branches Zhenqiang Chen
2013-10-16 12:22 ` Marc Glisse
2013-10-17  2:50 ` Andrew Pinski
2013-10-17 11:14   ` Richard Biener
2013-10-17 17:09     ` Jeff Law
2013-10-18  9:53       ` Zhenqiang Chen
2013-10-18 10:04         ` Richard Biener
2013-10-18 10:11           ` Zhenqiang Chen
2013-10-18 10:36             ` Richard Biener
2013-10-18 10:41               ` Zhenqiang Chen
2013-10-18 12:10                 ` Richard Biener
2013-10-18 15:45           ` Jeff Law
2013-10-26 22:26         ` Andrew Pinski
2013-10-27  9:17           ` Andrew Pinski
2013-10-27 20:28             ` Andrew Pinski
2013-10-29  1:33               ` Zhenqiang Chen
2013-10-29  7:24               ` Jeff Law
2013-10-29 15:30                 ` Andrew Pinski
2013-10-29 18:42                   ` Jeff Law
2013-10-29 10:39               ` Richard Biener
2013-10-30  7:32                 ` Andrew Pinski
2013-11-08 17:43                   ` Steven Bosscher
2013-11-08 18:09                     ` Steven Bosscher
2013-11-11 11:54                       ` Richard Biener
2013-11-08 18:10                     ` pinskia
2013-11-08 22:15                       ` Jeff Law
2013-11-08 22:15                         ` Steven Bosscher
2013-10-18  2:37     ` Andrew Pinski
2013-10-18  2:42       ` Andrew Pinski
2013-10-18 10:55         ` Zhenqiang Chen
2013-10-26 23:49 ` Andrew Pinski

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).