Hi All,

This optimizes the following sequence

  ((a < b) & c) | ((a >= b) & d)

into

  (a < b ? c : d) & 1

for scalars.  For vectors we can omit the & 1.

This changes the code generation from

zoo2:
	cmp	w0, w1
	cset	w0, lt
	cset	w1, ge
	and	w0, w0, w2
	and	w1, w1, w3
	orr	w0, w0, w1
	ret

into

	cmp	w0, w1
	csel	w0, w2, w3, lt
	and	w0, w0, 1
	ret

and significantly reduces the number of selects we have to do in the
vector code.

Bootstrapped and regtested on aarch64-none-linux-gnu and
x86_64-pc-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* fold-const.cc (inverse_conditions_p): Traverse if SSA_NAME.
	* match.pd: Add new rule.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/if-compare_1.c: New test.
	* gcc.target/aarch64/if-compare_2.c: New test.

--- inline copy of patch --
diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
index 39a5a52958d87497f301826e706886b290771a2d..f180599b90150acd3ed895a64280aa3255061256 100644
--- a/gcc/fold-const.cc
+++ b/gcc/fold-const.cc
@@ -2833,15 +2833,38 @@ compcode_to_comparison (enum comparison_code code)
 bool
 inverse_conditions_p (const_tree cond1, const_tree cond2)
 {
-  return (COMPARISON_CLASS_P (cond1)
-	  && COMPARISON_CLASS_P (cond2)
-	  && (invert_tree_comparison
-	       (TREE_CODE (cond1),
-	        HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE (cond2))
-	  && operand_equal_p (TREE_OPERAND (cond1, 0),
-			      TREE_OPERAND (cond2, 0), 0)
-	  && operand_equal_p (TREE_OPERAND (cond1, 1),
-			      TREE_OPERAND (cond2, 1), 0));
+  if (COMPARISON_CLASS_P (cond1)
+      && COMPARISON_CLASS_P (cond2)
+      && (invert_tree_comparison
+	   (TREE_CODE (cond1),
+	    HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE (cond2))
+      && operand_equal_p (TREE_OPERAND (cond1, 0),
+			  TREE_OPERAND (cond2, 0), 0)
+      && operand_equal_p (TREE_OPERAND (cond1, 1),
+			  TREE_OPERAND (cond2, 1), 0))
+    return true;
+
+  if (TREE_CODE (cond1) == SSA_NAME
+      && TREE_CODE (cond2) == SSA_NAME)
+    {
+      gimple *gcond1 = SSA_NAME_DEF_STMT (cond1);
+      gimple *gcond2 = SSA_NAME_DEF_STMT (cond2);
+      if (!is_gimple_assign (gcond1) || !is_gimple_assign (gcond2))
+	return false;
+
+      tree_code code1 = gimple_assign_rhs_code (gcond1);
+      tree_code code2 = gimple_assign_rhs_code (gcond2);
+      return TREE_CODE_CLASS (code1) == tcc_comparison
+	     && TREE_CODE_CLASS (code2) == tcc_comparison
+	     && invert_tree_comparison (code1,
+		  HONOR_NANS (gimple_arg (gcond1, 0))) == code2
+	     && operand_equal_p (gimple_arg (gcond1, 0),
+				 gimple_arg (gcond2, 0), 0)
+	     && operand_equal_p (gimple_arg (gcond1, 1),
+				 gimple_arg (gcond2, 1), 0);
+    }
+
+  return false;
 }
 
 /* Return a tree for the comparison which is the combination of
diff --git a/gcc/match.pd b/gcc/match.pd
index 6d691d302b339c0e4556b40af158b5208c12d08f..bad49dd348add751d9ec1e3023e34d9ac123194f 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -1160,6 +1160,32 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 	    (convert (bit_and (negate (convert:utype { pmop[0]; }))
 			      (convert:utype @1)))))))
 
+/* Fold (((a < b) & c) | ((a >= b) & d)) into (a < b ? c : d) & 1.  */
+(simplify
+ (bit_ior
+  (bit_and:c (convert? @0) @2)
+  (bit_and:c (convert? @1) @3))
+ (if (inverse_conditions_p (@0, @1)
+      /* The scalar version has to be canonicalized after vectorization
+	 because it makes unconditional loads conditional ones, which
+	 means we lose vectorization because the loads may trap.  */
+      && canonicalize_math_after_vectorization_p ())
+  (bit_and (cond @0 @2 @3) { build_each_one_cst (type); })))
+(simplify
+ (bit_ior
+  (bit_and:c (convert? (vec_cond:s @0 @4 integer_zerop)) @2)
+  (bit_and:c (convert? (vec_cond:s @1 @4 integer_zerop)) @3))
+ (if (inverse_conditions_p (@0, @1)
+      && integer_onep (@4))
+  (bit_and (vec_cond @0 @2 @3) @4)))
+/* Fold (((a < b) & c) | ((a >= b) & d)) into a < b ? c : d.  */
+(simplify
+ (bit_ior
+  (bit_and:c (convert? (vec_cond:s @0 integer_minus_onep integer_zerop)) @2)
+  (bit_and:c (convert? (vec_cond:s @1 integer_minus_onep integer_zerop)) @3))
+ (if (inverse_conditions_p (@0, @1))
+  (vec_cond @0 @2 @3)))
+
 /* X % Y is smaller than Y.  */
 (for cmp (lt ge)
  (simplify
diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_1.c b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..05a1292fa90c70b14a7985122f43711f55d047ea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+/*
+**zoo2:
+**	cmp	w0, w1
+**	csel	w0, w2, w3, lt
+**	and	w0, w0, 1
+**	ret
+*/
+int zoo2 (int a, int b, int c, int d)
+{
+   return ((a < b) & c) | ((a >= b) & d);
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_2.c b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..34bc65f5db10eae81b8dee3316dfb7d12bf471c8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+/*
+**foo:
+**	cmgt	v0.4s, v1.4s, v0.4s
+**	bsl	v0.16b, v2.16b, v3.16b
+**	ret
+*/
+v4si foo (v4si a, v4si b, v4si c, v4si d) {
+    return ((a < b) & c) | ((a >= b) & d);
+}
+
+
+/**
+**bar:
+**...
+**	cmge	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+**	bsl	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+**	and	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+**...
+*/
+void bar (int * restrict a, int * restrict b, int * restrict c,
+	  int * restrict d, int * restrict res, int n)
+{
+  for (int i = 0; i < (n & -4); i++)
+    res[i] = ((a[i] < b[i]) & c[i]) | ((a[i] >= b[i]) & d[i]);
+}
+

--
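
P.S. Not part of the patch, just an aside for review: the identities the new
match.pd rules rely on can be sanity-checked with a small standalone GNU C
harness like the one below (the function names are mine and purely
illustrative).  It exhaustively checks the scalar rewrite over a small range,
and checks that the vector form needs no trailing & 1 because vector
comparisons produce all-ones/all-zeros lanes.

/* Standalone sanity check for the rewrites; not part of the patch.  */
#include <assert.h>
#include <stdio.h>

typedef int v4si __attribute__ ((vector_size (16)));

/* Scalar: a < b and a >= b evaluate to 1/0 and are mutually exclusive, so
   exactly one bit_and term survives and it only contributes bit 0 of c/d,
   hence the final & 1.  */
static int scalar_before (int a, int b, int c, int d)
{
  return ((a < b) & c) | ((a >= b) & d);
}

static int scalar_after (int a, int b, int c, int d)
{
  return (a < b ? c : d) & 1;
}

/* Vector: the comparison yields an all-ones or all-zeros mask per lane, so
   the bit_and/bit_ior already select whole lanes of c or d and no trailing
   mask is needed.  */
static v4si vector_before (v4si a, v4si b, v4si c, v4si d)
{
  return ((a < b) & c) | ((a >= b) & d);
}

static v4si vector_after (v4si a, v4si b, v4si c, v4si d)
{
  v4si m = a < b;               /* -1 or 0 in each lane.  */
  return (c & m) | (d & ~m);    /* Lane-wise select, no & 1.  */
}

int main (void)
{
  for (int a = -4; a <= 4; a++)
    for (int b = -4; b <= 4; b++)
      for (int c = -4; c <= 4; c++)
	for (int d = -4; d <= 4; d++)
	  {
	    assert (scalar_before (a, b, c, d) == scalar_after (a, b, c, d));
	    v4si va = { a, a, a, a }, vb = { b, b, b, b };
	    v4si vc = { c, c, c, c }, vd = { d, d, d, d };
	    v4si r1 = vector_before (va, vb, vc, vd);
	    v4si r2 = vector_after (va, vb, vc, vd);
	    for (int i = 0; i < 4; i++)
	      assert (r1[i] == r2[i]);
	  }
  puts ("scalar and vector identities hold");
  return 0;
}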