Hello, > where logical_inverted is somewhat contradicting using zero_one_valued > instead of truth_valued_p (I think the former might not work for vector > booleans?). > > In the end I'd prefer zero_one_valued_p but avoiding inverse_conditions_p > would be nice. > > Richard. It's not pretty but I've made it work and added more tests. Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * match.pd: Add new rule. gcc/testsuite/ChangeLog: * gcc.target/aarch64/if-compare_1.c: New test. * gcc.target/aarch64/if-compare_2.c: New test. --- inline copy of patch --- diff --git a/gcc/match.pd b/gcc/match.pd index b61ed70e69b881a49177f10f20c1f92712bb8665..39da61bf117a6eb2924fc8a6473fb37ddadd60e9 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -1903,6 +1903,101 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) (if (INTEGRAL_TYPE_P (type)) (bit_and @0 @1))) +(for cmp (tcc_comparison) + icmp (inverted_tcc_comparison) + /* Fold (((a < b) & c) | ((a >= b) & d)) into (a < b ? c : d) & 1. */ + (simplify + (bit_ior + (bit_and:c (convert? zero_one_valued_p@0) @2) + (bit_and:c (convert? zero_one_valued_p@1) @3)) + (with { + enum tree_code c1 + = (TREE_CODE (@0) == SSA_NAME + ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@0)) : TREE_CODE (@0)); + + enum tree_code c2 + = (TREE_CODE (@1) == SSA_NAME + ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@1)) : TREE_CODE (@1)); + } + (if (INTEGRAL_TYPE_P (type) + && c1 == cmp + && c2 == icmp + /* The scalar version has to be canonicalized after vectorization + because it makes unconditional loads conditional ones, which + means we lose vectorization because the loads may trap. */ + && canonicalize_math_after_vectorization_p ()) + (bit_and (cond @0 @2 @3) { build_one_cst (type); })))) + + /* Fold ((-(a < b) & c) | (-(a >= b) & d)) into a < b ? c : d. */ + (simplify + (bit_ior + (cond zero_one_valued_p@0 @2 zerop) + (cond zero_one_valued_p@1 @3 zerop)) + (with { + enum tree_code c1 + = (TREE_CODE (@0) == SSA_NAME + ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@0)) : TREE_CODE (@0)); + + enum tree_code c2 + = (TREE_CODE (@1) == SSA_NAME + ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@1)) : TREE_CODE (@1)); + } + (if (INTEGRAL_TYPE_P (type) + && c1 == cmp + && c2 == icmp + /* The scalar version has to be canonicalized after vectorization + because it makes unconditional loads conditional ones, which + means we lose vectorization because the loads may trap. */ + && canonicalize_math_after_vectorization_p ()) + (cond @0 @2 @3)))) + + /* Vector Fold (((a < b) & c) | ((a >= b) & d)) into a < b ? c : d. + and ((~(a < b) & c) | (~(a >= b) & d)) into a < b ? c : d. */ + (simplify + (bit_ior + (bit_and:c (vec_cond:s @0 @4 @5) @2) + (bit_and:c (vec_cond:s @1 @4 @5) @3)) + (with { + enum tree_code c1 + = (TREE_CODE (@0) == SSA_NAME + ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@0)) : TREE_CODE (@0)); + + enum tree_code c2 + = (TREE_CODE (@1) == SSA_NAME + ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@1)) : TREE_CODE (@1)); + } + (if (c1 == cmp && c2 == icmp) + (if (integer_zerop (@5)) + (switch + (if (integer_onep (@4)) + (bit_and (vec_cond @0 @2 @3) @4)) + (if (integer_minus_onep (@4)) + (vec_cond @0 @2 @3))) + (if (integer_zerop (@4)) + (switch + (if (integer_onep (@5)) + (bit_and (vec_cond @0 @3 @2) @5)) + (if (integer_minus_onep (@5)) + (vec_cond @0 @3 @2)))))))) + + /* Scalar Vectorized Fold ((-(a < b) & c) | (-(a >= b) & d)) + into a < b ? d : c. */ + (simplify + (bit_ior + (vec_cond:s @0 @2 integer_zerop) + (vec_cond:s @1 @3 integer_zerop)) + (with { + enum tree_code c1 + = (TREE_CODE (@0) == SSA_NAME + ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@0)) : TREE_CODE (@0)); + + enum tree_code c2 + = (TREE_CODE (@1) == SSA_NAME + ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@1)) : TREE_CODE (@1)); + } + (if (c1 == cmp && c2 == icmp) + (vec_cond @0 @2 @3))))) + /* Transform X & -Y into X * Y when Y is { 0 or 1 }. */ (simplify (bit_and:c (convert? (negate zero_one_valued_p@0)) @1) diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_1.c b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c new file mode 100644 index 0000000000000000000000000000000000000000..53bbd779a30e1a30e0ce0e4e5eaf589bfaf570fe --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c @@ -0,0 +1,47 @@ +/* { dg-do run } */ +/* { dg-additional-options "-O -save-temps" } */ +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */ + +extern void abort (); + +/* +**zoo1: +** cmp w0, w1 +** csel w0, w2, w3, lt +** and w0, w0, 1 +** ret +*/ +__attribute((noipa, noinline)) +int zoo1 (int a, int b, int c, int d) +{ + return ((a < b) & c) | ((a >= b) & d); +} + +/* +**zoo2: +** cmp w0, w1 +** csel w0, w2, w3, lt +** ret +*/ +__attribute((noipa, noinline)) +int zoo2 (int a, int b, int c, int d) +{ + return (-(a < b) & c) | (-(a >= b) & d); +} + +int main () +{ + if (zoo1 (-3, 3, 5, 8) != 1) + abort (); + + if (zoo1 (3, -3, 5, 8) != 0) + abort (); + + if (zoo2 (-3, 3, 5, 8) != 5) + abort (); + + if (zoo2 (3, -3, 5, 8) != 8) + abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_2.c b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c new file mode 100644 index 0000000000000000000000000000000000000000..14988abac45989578b198f28c7c0ea203959c08b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c @@ -0,0 +1,96 @@ +/* { dg-do run } */ +/* { dg-additional-options "-O3 -std=c99 -save-temps" } */ +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */ + +#pragma GCC target "+nosve" + +#include + +typedef int v4si __attribute__ ((vector_size (16))); + +/* +**foo1: +** cmgt v0.4s, v1.4s, v0.4s +** bsl v0.16b, v2.16b, v3.16b +** ret +*/ +v4si foo1 (v4si a, v4si b, v4si c, v4si d) { + return ((a < b) & c) | ((a >= b) & d); +} + +/* +**foo2: +** cmgt v0.4s, v1.4s, v0.4s +** bsl v0.16b, v3.16b, v2.16b +** ret +*/ +v4si foo2 (v4si a, v4si b, v4si c, v4si d) { + return (~(a < b) & c) | (~(a >= b) & d); +} + + +/** +**bar1: +**... +** cmge v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s +** bsl v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b +** and v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b +**... +*/ +void bar1 (int * restrict a, int * restrict b, int * restrict c, + int * restrict d, int * restrict res, int n) +{ + for (int i = 0; i < (n & -4); i++) + res[i] = ((a[i] < b[i]) & c[i]) | ((a[i] >= b[i]) & d[i]); +} + +/** +**bar2: +**... +** cmge v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s +** bsl v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b +**... +*/ +void bar2 (int * restrict a, int * restrict b, int * restrict c, + int * restrict d, int * restrict res, int n) +{ + for (int i = 0; i < (n & -4); i++) + res[i] = (-(a[i] < b[i]) & c[i]) | (-(a[i] >= b[i]) & d[i]); +} + +extern void abort (); + +int main () +{ + + v4si a = { -3, -3, -3, -3 }; + v4si b = { 3, 3, 3, 3 }; + v4si c = { 5, 5, 5, 5 }; + v4si d = { 8, 8, 8, 8 }; + + v4si res1 = foo1 (a, b, c, d); + if (memcmp (&res1, &c, 16UL) != 0) + abort (); + + v4si res2 = foo2 (a, b, c, d); + if (memcmp (&res2, &d, 16UL) != 0) + abort (); + + int ar[4] = { -3, -3, -3, -3 }; + int br[4] = { 3, 3, 3, 3 }; + int cr[4] = { 5, 5, 5, 5 }; + int dr[4] = { 8, 8, 8, 8 }; + + int exp1[4] = { 1, 1, 1, 1 }; + int res3[4]; + bar1 ((int*)&ar, (int*)&br, (int*)&cr, (int*)&dr, (int*)&res3, 4); + if (memcmp (&res3, &exp1, 16UL) != 0) + abort (); + + int res4[4]; + bar2 ((int*)&ar, (int*)&br, (int*)&cr, (int*)&dr, (int*)&res4, 4); + if (memcmp (&res4, &cr, 16UL) != 0) + abort (); + + return 0; +}