Hi All,

This optimizes the following sequence

  ((a < b) & c) | ((a >= b) & d)

into

  (a < b ? c : d) & 1

for scalars.  For vectors we can omit the & 1.

This changes the code generation from

zoo2:
	cmp	w0, w1
	cset	w0, lt
	cset	w1, ge
	and	w0, w0, w2
	and	w1, w1, w3
	orr	w0, w0, w1
	ret

into

	cmp	w0, w1
	csel	w0, w2, w3, lt
	and	w0, w0, 1
	ret

and significantly reduces the number of selects we have to do in the
vector code.

Bootstrapped and regtested on aarch64-none-linux-gnu and
x86_64-pc-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* fold-const.cc (inverse_conditions_p): Traverse if SSA_NAME.
	* match.pd: Add new rule.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/if-compare_1.c: New test.
	* gcc.target/aarch64/if-compare_2.c: New test.

--- inline copy of patch --
diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
index 39a5a52958d87497f301826e706886b290771a2d..f180599b90150acd3ed895a64280aa3255061256 100644
--- a/gcc/fold-const.cc
+++ b/gcc/fold-const.cc
@@ -2833,15 +2833,38 @@ compcode_to_comparison (enum comparison_code code)
 bool
 inverse_conditions_p (const_tree cond1, const_tree cond2)
 {
-  return (COMPARISON_CLASS_P (cond1)
-	  && COMPARISON_CLASS_P (cond2)
-	  && (invert_tree_comparison
-	       (TREE_CODE (cond1),
-	        HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE (cond2))
-	  && operand_equal_p (TREE_OPERAND (cond1, 0),
-			      TREE_OPERAND (cond2, 0), 0)
-	  && operand_equal_p (TREE_OPERAND (cond1, 1),
-			      TREE_OPERAND (cond2, 1), 0));
+  if (COMPARISON_CLASS_P (cond1)
+      && COMPARISON_CLASS_P (cond2)
+      && (invert_tree_comparison
+	   (TREE_CODE (cond1),
+	    HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE (cond2))
+      && operand_equal_p (TREE_OPERAND (cond1, 0),
+			  TREE_OPERAND (cond2, 0), 0)
+      && operand_equal_p (TREE_OPERAND (cond1, 1),
+			  TREE_OPERAND (cond2, 1), 0))
+    return true;
+
+  if (TREE_CODE (cond1) == SSA_NAME
+      && TREE_CODE (cond2) == SSA_NAME)
+    {
+      gimple *gcond1 = SSA_NAME_DEF_STMT (cond1);
+      gimple *gcond2 = SSA_NAME_DEF_STMT (cond2);
+      if (!is_gimple_assign (gcond1) || !is_gimple_assign (gcond2))
+	return false;
+
+      tree_code code1 = gimple_assign_rhs_code (gcond1);
+      tree_code code2 = gimple_assign_rhs_code (gcond2);
+      return TREE_CODE_CLASS (code1) == tcc_comparison
+	     && TREE_CODE_CLASS (code2) == tcc_comparison
+	     && invert_tree_comparison (code1,
+		  HONOR_NANS (gimple_arg (gcond1, 0))) == code2
+	     && operand_equal_p (gimple_arg (gcond1, 0),
+				 gimple_arg (gcond2, 0), 0)
+	     && operand_equal_p (gimple_arg (gcond1, 1),
+				 gimple_arg (gcond2, 1), 0);
+    }
+
+  return false;
 }
 
 /* Return a tree for the comparison which is the combination of
diff --git a/gcc/match.pd b/gcc/match.pd
index 6d691d302b339c0e4556b40af158b5208c12d08f..bad49dd348add751d9ec1e3023e34d9ac123194f 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -1160,6 +1160,32 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 	    (convert (bit_and (negate (convert:utype { pmop[0]; }))
 			      (convert:utype @1)))))))
 
+/* Fold (((a < b) & c) | ((a >= b) & d)) into (a < b ? c : d) & 1.  */
+(simplify
+ (bit_ior
+  (bit_and:c (convert? @0) @2)
+  (bit_and:c (convert? @1) @3))
+ (if (inverse_conditions_p (@0, @1)
+      /* The scalar version has to be canonicalized after vectorization
+	 because it makes unconditional loads conditional ones, which
+	 means we lose vectorization because the loads may trap.  */
+      && canonicalize_math_after_vectorization_p ())
+  (bit_and (cond @0 @2 @3) { build_each_one_cst (type); })))
+(simplify
+ (bit_ior
+  (bit_and:c (convert? (vec_cond:s @0 @4 integer_zerop)) @2)
+  (bit_and:c (convert? (vec_cond:s @1 @4 integer_zerop)) @3))
+ (if (inverse_conditions_p (@0, @1)
+      && integer_onep (@4))
+  (bit_and (vec_cond @0 @2 @3) @4)))
+/* Fold (((a < b) & c) | ((a >= b) & d)) into a < b ? c : d.  */
+(simplify
+ (bit_ior
+  (bit_and:c (convert? (vec_cond:s @0 integer_minus_onep integer_zerop)) @2)
+  (bit_and:c (convert? (vec_cond:s @1 integer_minus_onep integer_zerop)) @3))
+ (if (inverse_conditions_p (@0, @1))
+  (vec_cond @0 @2 @3)))
+
 /* X % Y is smaller than Y.  */
 (for cmp (lt ge)
  (simplify
diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_1.c b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..05a1292fa90c70b14a7985122f43711f55d047ea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+/*
+**zoo2:
+**	cmp	w0, w1
+**	csel	w0, w2, w3, lt
+**	and	w0, w0, 1
+**	ret
+*/
+int zoo2 (int a, int b, int c, int d)
+{
+   return ((a < b) & c) | ((a >= b) & d);
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_2.c b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..34bc65f5db10eae81b8dee3316dfb7d12bf471c8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+/*
+**foo:
+**	cmgt	v0.4s, v1.4s, v0.4s
+**	bsl	v0.16b, v2.16b, v3.16b
+**	ret
+*/
+v4si foo (v4si a, v4si b, v4si c, v4si d) {
+    return ((a < b) & c) | ((a >= b) & d);
+}
+
+
+/**
+**bar:
+**...
+**	cmge	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+**	bsl	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+**	and	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+**...
+*/
+void bar (int * restrict a, int * restrict b, int * restrict c,
+	  int * restrict d, int * restrict res, int n)
+{
+  for (int i = 0; i < (n & -4); i++)
+    res[i] = ((a[i] < b[i]) & c[i]) | ((a[i] >= b[i]) & d[i]);
+}
+

--
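
P.S. Not part of the patch, just an aside for review: the identities the new
match.pd rules rely on can be sanity-checked with a small standalone GNU C
harness like the one below (the function names are mine and purely
illustrative).  It exhaustively checks the scalar rewrite over a small range,
and checks that the vector form needs no trailing & 1 because vector
comparisons produce all-ones/all-zeros lanes.

/* Standalone sanity check for the rewrites; not part of the patch.  */
#include <assert.h>
#include <stdio.h>

typedef int v4si __attribute__ ((vector_size (16)));

/* Scalar: a < b and a >= b evaluate to 1/0 and are mutually exclusive, so
   exactly one bit_and term survives and it only contributes bit 0 of c/d,
   hence the final & 1.  */
static int scalar_before (int a, int b, int c, int d)
{
  return ((a < b) & c) | ((a >= b) & d);
}

static int scalar_after (int a, int b, int c, int d)
{
  return (a < b ? c : d) & 1;
}

/* Vector: the comparison yields an all-ones or all-zeros mask per lane, so
   the bit_and/bit_ior already select whole lanes of c or d and no trailing
   mask is needed.  */
static v4si vector_before (v4si a, v4si b, v4si c, v4si d)
{
  return ((a < b) & c) | ((a >= b) & d);
}

static v4si vector_after (v4si a, v4si b, v4si c, v4si d)
{
  v4si m = a < b;               /* -1 or 0 in each lane.  */
  return (c & m) | (d & ~m);    /* Lane-wise select, no & 1.  */
}

int main (void)
{
  for (int a = -4; a <= 4; a++)
    for (int b = -4; b <= 4; b++)
      for (int c = -4; c <= 4; c++)
	for (int d = -4; d <= 4; d++)
	  {
	    assert (scalar_before (a, b, c, d) == scalar_after (a, b, c, d));
	    v4si va = { a, a, a, a }, vb = { b, b, b, b };
	    v4si vc = { c, c, c, c }, vd = { d, d, d, d };
	    v4si r1 = vector_before (va, vb, vc, vd);
	    v4si r2 = vector_after (va, vb, vc, vd);
	    for (int i = 0; i < 4; i++)
	      assert (r1[i] == r2[i]);
	  }
  puts ("scalar and vector identities hold");
  return 0;
}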