* [PATCH] Analyze niter for until-wrap condition [PR101145] @ 2021-07-01 2:05 Jiufu Guo 2021-07-01 7:22 ` Bin.Cheng 2021-07-01 12:35 ` Richard Biener 0 siblings, 2 replies; 8+ messages in thread From: Jiufu Guo @ 2021-07-01 2:05 UTC (permalink / raw) To: gcc-patches; +Cc: guojiufu, wschmidt, segher, dje.gcc, rguenther, jlaw, amker For code like: unsigned foo(unsigned val, unsigned start) { unsigned cnt = 0; for (unsigned i = start; i > val; ++i) cnt++; return cnt; } The number of iterations should be about UINT_MAX - start. There is function adjust_cond_for_loop_until_wrap which handles similar work for const bases. Like adjust_cond_for_loop_until_wrap, this patch enhance function number_of_iterations_cond/number_of_iterations_lt to analyze number of iterations for this kind of loop. Bootstrap and regtest pass on powerpc64le, is this ok for trunk? gcc/ChangeLog: PR tree-optimization/101145 * tree-ssa-loop-niter.c (number_of_iterations_until_wrap): New function. (number_of_iterations_lt): Invoke above function. (adjust_cond_for_loop_until_wrap): Merge to number_of_iterations_until_wrap. (number_of_iterations_cond): Update invokes for adjust_cond_for_loop_until_wrap and number_of_iterations_lt. gcc/testsuite/ChangeLog: PR tree-optimization/101145 * gcc.dg/vect/pr101145.c: New test. * gcc.dg/vect/pr101145.inc: New test. * gcc.dg/vect/pr101145_1.c: New test. * gcc.dg/vect/pr101145_2.c: New test. * gcc.dg/vect/pr101145_3.c: New test. --- gcc/testsuite/gcc.dg/vect/pr101145.c | 187 +++++++++++++++++++++++++ gcc/testsuite/gcc.dg/vect/pr101145.inc | 63 +++++++++ gcc/testsuite/gcc.dg/vect/pr101145_1.c | 15 ++ gcc/testsuite/gcc.dg/vect/pr101145_2.c | 15 ++ gcc/testsuite/gcc.dg/vect/pr101145_3.c | 15 ++ gcc/tree-ssa-loop-niter.c | 150 +++++++++++--------- 6 files changed, 380 insertions(+), 65 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145.c create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145.inc create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145_1.c create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145_2.c create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145_3.c diff --git a/gcc/testsuite/gcc.dg/vect/pr101145.c b/gcc/testsuite/gcc.dg/vect/pr101145.c new file mode 100644 index 00000000000..74031b031cf --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr101145.c @@ -0,0 +1,187 @@ +/* { dg-require-effective-target vect_int } */ +/* { dg-options "-O3 -fdump-tree-vect-details" } */ +#include <limits.h> + +unsigned __attribute__ ((noinline)) +foo (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned n) +{ + while (n < ++l) + *a++ = *b++ + 1; + return l; +} + +unsigned __attribute__ ((noinline)) +foo_1 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned) +{ + while (UINT_MAX - 64 < ++l) + *a++ = *b++ + 1; + return l; +} + +unsigned __attribute__ ((noinline)) +foo_2 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned n) +{ + l = UINT_MAX - 32; + while (n < ++l) + *a++ = *b++ + 1; + return l; +} + +unsigned __attribute__ ((noinline)) +foo_3 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned n) +{ + while (n <= ++l) + *a++ = *b++ + 1; + return l; +} + +unsigned __attribute__ ((noinline)) +foo_4 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned n) +{ // infininate + while (0 <= ++l) + *a++ = *b++ + 1; + return l; +} + +unsigned __attribute__ ((noinline)) +foo_5 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned n) +{ + //no loop + l = UINT_MAX; + while (n < ++l) + *a++ = *b++ + 1; + return l; +} + +unsigned __attribute__ ((noinline)) +bar (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned n) +{ + while (--l < n) + *a++ = *b++ + 1; + return l; +} + +unsigned __attribute__ ((noinline)) +bar_1 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned) +{ + while (--l < 64) + *a++ = *b++ + 1; + return l; +} + +unsigned __attribute__ ((noinline)) +bar_2 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned n) +{ + l = 32; + while (--l < n) + *a++ = *b++ + 1; + return l; +} + + +int a[3200], b[3200]; +int fail; + +int +main () +{ + unsigned l, n; + unsigned res; + /* l > n*/ + n = UINT_MAX - 64; + l = n + 32; + res = foo (a, b, l, n); + if (res != 0) + fail++; + + l = n; + res = foo (a, b, l, n); + if (res != 0) + fail++; + + l = n - 1; + res = foo (a, b, l, n); + if (res != l + 1) + fail++; + + l = n - 32; + res = foo (a, b, l, n); + if (res != l + 1) + fail++; + + l = UINT_MAX; + res = foo (a, b, l, n); + if (res != 0) + fail++; + + l = n + 32; + res = foo_1 (a, b, l, n); + if (res != 0) + fail++; + + l = n + 32; + res = foo_2 (a, b, l, n); + if (res != 0) + fail++; + + l = n; + res = foo_3 (a, b, l, n); + if (res != 0) + fail++; + + l = n - 1; + res = foo_3 (a, b, l, n); + if (res != 0) + fail++; + + l = n - 2; + res = foo_3 (a, b, l, n); + if (res != l + 1) + fail++; + + res = foo_5 (a, b, l, n); + if (res != 0) + fail++; + + n = 64; + l = n - 32; + res = bar (a, b, l, n); + res++; + if (res != 0) + fail++; + + l = n; + res = bar (a, b, l, n); + res++; + if (res != 0) + fail++; + + l = n + 1; + res = bar (a, b, l, n); + res++; + if (res != l) + fail++; + + l = 0; + res = bar (a, b, l, n); + res++; + if (res != l) + fail++; + + l = 32; + res = bar_1 (a, b, l, n); + res++; + if (res != 0) + fail++; + + res = bar_1 (a, b, l, n); + res++; + if (res != 0) + fail++; + + if (fail) + __builtin_abort (); + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 7 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/pr101145.inc b/gcc/testsuite/gcc.dg/vect/pr101145.inc new file mode 100644 index 00000000000..6eed3fa8aca --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr101145.inc @@ -0,0 +1,63 @@ +TYPE __attribute__ ((noinline)) +foo_sign (int *__restrict__ a, int *__restrict__ b, TYPE l, TYPE n) +{ + for (l = L_BASE; n < l; l += C) + *a++ = *b++ + 1; + return l; +} + +TYPE __attribute__ ((noinline)) +bar_sign (int *__restrict__ a, int *__restrict__ b, TYPE l, TYPE n) +{ + for (l = L_BASE_DOWN; l < n; l -= C) + *a++ = *b++ + 1; + return l; +} + +int __attribute__ ((noinline)) neq (int a, int b) { return a != b; } + +int a[1000], b[1000]; +int fail; + +int +main () +{ + TYPE res; + TYPE l; + TYPE n; + n = N_BASE; + l = n - C; + res = foo_sign (a, b, l, n); + if (res != l) + fail++; + + l = n; + res = foo_sign (a, b, l, n); + if (res != l) + fail++; + + l = n + C; + res = foo_sign (a, b, l, n); + if (neq ((res - MIN) / C, 0)) + fail++; + + n = N_BASE_DOWN; + l = n - C; + res = bar_sign (a, b, l, n); + if (neq ((MAX - res) / C, 0)) + fail++; + + l = n; + res = bar_sign (a, b, l, n); + if (res != l) + fail++; + + l = n + C; + res = bar_sign (a, b, l, n); + if (res != l) + fail++; + + if (fail) + __builtin_abort (); + return 0; +} diff --git a/gcc/testsuite/gcc.dg/vect/pr101145_1.c b/gcc/testsuite/gcc.dg/vect/pr101145_1.c new file mode 100644 index 00000000000..94f6b99b893 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr101145_1.c @@ -0,0 +1,15 @@ +/* { dg-require-effective-target vect_int } */ +/* { dg-options "-O3 -fdump-tree-vect-details" } */ +#define TYPE signed char +#define MIN -128 +#define MAX 127 +#define N_BASE (MAX - 32) +#define N_BASE_DOWN (MIN + 32) + +#define C 3 +#define L_BASE l +#define L_BASE_DOWN l + +#include "pr101145.inc" + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/pr101145_2.c b/gcc/testsuite/gcc.dg/vect/pr101145_2.c new file mode 100644 index 00000000000..d3cfc9e01e1 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr101145_2.c @@ -0,0 +1,15 @@ +/* { dg-require-effective-target vect_int } */ +/* { dg-options "-O3 -fdump-tree-vect-details" } */ +#define TYPE unsigned char +#define MIN 0 +#define MAX 255 +#define N_BASE (MAX - 32 + 1) +#define N_BASE_DOWN (MIN + 32) + +#define C 2 +#define L_BASE l +#define L_BASE_DOWN l + +#include "pr101145.inc" + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/pr101145_3.c b/gcc/testsuite/gcc.dg/vect/pr101145_3.c new file mode 100644 index 00000000000..e2cf9b1f7e6 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr101145_3.c @@ -0,0 +1,15 @@ +/* { dg-require-effective-target vect_int } */ +/* { dg-options "-O3 -fdump-tree-vect-details" } */ +#define TYPE int * +#define MIN ((TYPE)0) +#define MAX ((TYPE)((long long)-1)) +#define N_BASE ((TYPE) (-32)) +#define N_BASE_DOWN (MIN + 32) + +#define C 1 +#define L_BASE l +#define L_BASE_DOWN l + +#include "pr101145.inc" + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */ diff --git a/gcc/tree-ssa-loop-niter.c b/gcc/tree-ssa-loop-niter.c index b5add827018..06db6a36ef8 100644 --- a/gcc/tree-ssa-loop-niter.c +++ b/gcc/tree-ssa-loop-niter.c @@ -1473,6 +1473,86 @@ assert_loop_rolls_lt (tree type, affine_iv *iv0, affine_iv *iv1, } } +/* Determines number of iterations of loop whose ending condition + is IV0 < IV1 which likes: {base, -C} < n, or n < {base, C}. + The number of iterations is stored to NITER. */ + +static bool +number_of_iterations_until_wrap (class loop *, tree type, affine_iv *iv0, + affine_iv *iv1, class tree_niter_desc *niter) +{ + tree niter_type = unsigned_type_for (type); + tree max, min; + + if (POINTER_TYPE_P (type)) + { + max = fold_convert (type, TYPE_MAX_VALUE (niter_type)); + min = fold_convert (type, TYPE_MIN_VALUE (niter_type)); + } + else + { + max = TYPE_MAX_VALUE (type); + min = TYPE_MIN_VALUE (type); + } + + tree high = max, low = min, one = build_int_cst (niter_type, 1); + tree step; + + /* n < {base, C}. */ + if (integer_zerop (iv0->step) && TREE_CODE (iv1->step) == INTEGER_CST + && !tree_int_cst_sign_bit (iv1->step)) + { + step = iv1->step; + niter->niter = fold_build2 (MINUS_EXPR, niter_type, max, iv1->base); + if (TREE_CODE (iv1->base) == INTEGER_CST) + low = fold_build2 (MINUS_EXPR, type, iv1->base, one); + else if (TREE_CODE (iv0->base) == INTEGER_CST) + low = iv0->base; + } + /* {base, -C} < n. */ + else if (TREE_CODE (iv0->step) == INTEGER_CST + && tree_int_cst_sign_bit (iv0->step) && integer_zerop (iv1->step)) + { + step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv0->step), iv0->step); + niter->niter = fold_build2 (MINUS_EXPR, niter_type, iv0->base, min); + if (TREE_CODE (iv0->base) == INTEGER_CST) + high = fold_build2 (PLUS_EXPR, type, iv0->base, one); + else if (TREE_CODE (iv1->base) == INTEGER_CST) + high = iv1->base; + } + else + return false; + + /* (delta + step - 1) / step */ + step = fold_convert (niter_type, step); + niter->niter = fold_convert (niter_type, niter->niter); + niter->niter = fold_build2 (PLUS_EXPR, niter_type, niter->niter, step); + niter->niter = fold_build2 (FLOOR_DIV_EXPR, niter_type, niter->niter, step); + + tree m = fold_build2 (MINUS_EXPR, niter_type, high, low); + m = fold_convert (niter_type, m); + mpz_t mstep, tmp, mmax; + mpz_init (mstep); + mpz_init (tmp); + mpz_init (mmax); + wi::to_mpz (wi::to_wide (step), mstep, UNSIGNED); + wi::to_mpz (wi::to_wide (m), mmax, UNSIGNED); + mpz_add (tmp, mmax, mstep); + mpz_sub_ui (tmp, tmp, 1); + mpz_fdiv_q (tmp, tmp, mstep); + niter->max = widest_int::from (wi::from_mpz (niter_type, tmp, false), + TYPE_SIGN (niter_type)); + mpz_clear (mstep); + mpz_clear (tmp); + + niter->may_be_zero + = fold_build2 (LE_EXPR, boolean_type_node, iv1->base, iv0->base); + + niter->control.no_overflow = false; + + return true; +} + /* Determines number of iterations of loop whose ending condition is IV0 < IV1. TYPE is the type of the iv. The number of iterations is stored to NITER. BNDS bounds the difference @@ -1501,6 +1581,11 @@ number_of_iterations_lt (class loop *loop, tree type, affine_iv *iv0, niter->bound = iv0->base; } + /* {base, -C} < n, or n < {base, C} */ + if (tree_int_cst_sign_bit (iv0->step) + || (!integer_zerop (iv1->step) && !tree_int_cst_sign_bit (iv1->step))) + return number_of_iterations_until_wrap (loop, type, iv0, iv1, niter); + delta = fold_build2 (MINUS_EXPR, niter_type, fold_convert (niter_type, iv1->base), fold_convert (niter_type, iv0->base)); @@ -1665,62 +1750,6 @@ dump_affine_iv (FILE *file, affine_iv *iv) } } -/* Given exit condition IV0 CODE IV1 in TYPE, this function adjusts - the condition for loop-until-wrap cases. For example: - (unsigned){8, -1}_loop < 10 => {0, 1} != 9 - 10 < (unsigned){0, max - 7}_loop => {0, 1} != 8 - Return true if condition is successfully adjusted. */ - -static bool -adjust_cond_for_loop_until_wrap (tree type, affine_iv *iv0, tree_code *code, - affine_iv *iv1) -{ - /* Only support simple cases for the moment. */ - if (TREE_CODE (iv0->base) != INTEGER_CST - || TREE_CODE (iv1->base) != INTEGER_CST) - return false; - - tree niter_type = unsigned_type_for (type), high, low; - /* Case: i-- < 10. */ - if (integer_zerop (iv1->step)) - { - /* TODO: Should handle case in which abs(step) != 1. */ - if (!integer_minus_onep (iv0->step)) - return false; - /* Give up on infinite loop. */ - if (*code == LE_EXPR - && tree_int_cst_equal (iv1->base, TYPE_MAX_VALUE (type))) - return false; - high = fold_build2 (PLUS_EXPR, niter_type, - fold_convert (niter_type, iv0->base), - build_int_cst (niter_type, 1)); - low = fold_convert (niter_type, TYPE_MIN_VALUE (type)); - } - else if (integer_zerop (iv0->step)) - { - /* TODO: Should handle case in which abs(step) != 1. */ - if (!integer_onep (iv1->step)) - return false; - /* Give up on infinite loop. */ - if (*code == LE_EXPR - && tree_int_cst_equal (iv0->base, TYPE_MIN_VALUE (type))) - return false; - high = fold_convert (niter_type, TYPE_MAX_VALUE (type)); - low = fold_build2 (MINUS_EXPR, niter_type, - fold_convert (niter_type, iv1->base), - build_int_cst (niter_type, 1)); - } - else - gcc_unreachable (); - - iv0->base = low; - iv0->step = fold_convert (niter_type, integer_one_node); - iv1->base = high; - iv1->step = build_int_cst (niter_type, 0); - *code = NE_EXPR; - return true; -} - /* Determine the number of iterations according to condition (for staying inside loop) which compares two induction variables using comparison operator CODE. The induction variable on left side of the comparison @@ -1855,15 +1884,6 @@ number_of_iterations_cond (class loop *loop, return true; } - /* Handle special case loops: while (i-- < 10) and while (10 < i++) by - adjusting iv0, iv1 and code. */ - if (code != NE_EXPR - && (tree_int_cst_sign_bit (iv0->step) - || (!integer_zerop (iv1->step) - && !tree_int_cst_sign_bit (iv1->step))) - && !adjust_cond_for_loop_until_wrap (type, iv0, &code, iv1)) - return false; - /* OK, now we know we have a senseful loop. Handle several cases, depending on what comparison operator is used. */ bound_difference (loop, iv1->base, iv0->base, &bnds); -- 2.17.1 ^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] Analyze niter for until-wrap condition [PR101145] 2021-07-01 2:05 [PATCH] Analyze niter for until-wrap condition [PR101145] Jiufu Guo @ 2021-07-01 7:22 ` Bin.Cheng 2021-07-01 11:32 ` guojiufu 2021-07-01 12:35 ` Richard Biener 1 sibling, 1 reply; 8+ messages in thread From: Bin.Cheng @ 2021-07-01 7:22 UTC (permalink / raw) To: Jiufu Guo Cc: gcc-patches List, Richard Biener, amker, Segher Boessenkool, jlaw, Bill Schmidt, dje.gcc On Thu, Jul 1, 2021 at 10:06 AM Jiufu Guo via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > For code like: > unsigned foo(unsigned val, unsigned start) > { > unsigned cnt = 0; > for (unsigned i = start; i > val; ++i) > cnt++; > return cnt; > } > > The number of iterations should be about UINT_MAX - start. > > There is function adjust_cond_for_loop_until_wrap which > handles similar work for const bases. > Like adjust_cond_for_loop_until_wrap, this patch enhance > function number_of_iterations_cond/number_of_iterations_lt > to analyze number of iterations for this kind of loop. > > Bootstrap and regtest pass on powerpc64le, is this ok for trunk? > > gcc/ChangeLog: > > PR tree-optimization/101145 > * tree-ssa-loop-niter.c > (number_of_iterations_until_wrap): New function. > (number_of_iterations_lt): Invoke above function. > (adjust_cond_for_loop_until_wrap): > Merge to number_of_iterations_until_wrap. > (number_of_iterations_cond): Update invokes for > adjust_cond_for_loop_until_wrap and number_of_iterations_lt. > > gcc/testsuite/ChangeLog: > > PR tree-optimization/101145 > * gcc.dg/vect/pr101145.c: New test. > * gcc.dg/vect/pr101145.inc: New test. > * gcc.dg/vect/pr101145_1.c: New test. > * gcc.dg/vect/pr101145_2.c: New test. > * gcc.dg/vect/pr101145_3.c: New test. > --- > gcc/testsuite/gcc.dg/vect/pr101145.c | 187 +++++++++++++++++++++++++ > gcc/testsuite/gcc.dg/vect/pr101145.inc | 63 +++++++++ > gcc/testsuite/gcc.dg/vect/pr101145_1.c | 15 ++ > gcc/testsuite/gcc.dg/vect/pr101145_2.c | 15 ++ > gcc/testsuite/gcc.dg/vect/pr101145_3.c | 15 ++ > gcc/tree-ssa-loop-niter.c | 150 +++++++++++--------- > 6 files changed, 380 insertions(+), 65 deletions(-) > create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145.c > create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145.inc > create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145_1.c > create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145_2.c > create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145_3.c > > diff --git a/gcc/tree-ssa-loop-niter.c b/gcc/tree-ssa-loop-niter.c > index b5add827018..06db6a36ef8 100644 > --- a/gcc/tree-ssa-loop-niter.c > +++ b/gcc/tree-ssa-loop-niter.c > @@ -1473,6 +1473,86 @@ assert_loop_rolls_lt (tree type, affine_iv *iv0, affine_iv *iv1, > } > } > > +/* Determines number of iterations of loop whose ending condition > + is IV0 < IV1 which likes: {base, -C} < n, or n < {base, C}. > + The number of iterations is stored to NITER. */ > + > +static bool > +number_of_iterations_until_wrap (class loop *, tree type, affine_iv *iv0, > + affine_iv *iv1, class tree_niter_desc *niter) > +{ > + tree niter_type = unsigned_type_for (type); > + tree max, min; > + > + if (POINTER_TYPE_P (type)) > + { > + max = fold_convert (type, TYPE_MAX_VALUE (niter_type)); > + min = fold_convert (type, TYPE_MIN_VALUE (niter_type)); > + } > + else > + { > + max = TYPE_MAX_VALUE (type); > + min = TYPE_MIN_VALUE (type); > + } > + > + tree high = max, low = min, one = build_int_cst (niter_type, 1); > + tree step; > + > + /* n < {base, C}. */ > + if (integer_zerop (iv0->step) && TREE_CODE (iv1->step) == INTEGER_CST > + && !tree_int_cst_sign_bit (iv1->step)) > + { > + step = iv1->step; > + niter->niter = fold_build2 (MINUS_EXPR, niter_type, max, iv1->base); max/iv1->base could be of pointer type, not sure if this is canonical though. > + if (TREE_CODE (iv1->base) == INTEGER_CST) > + low = fold_build2 (MINUS_EXPR, type, iv1->base, one); > + else if (TREE_CODE (iv0->base) == INTEGER_CST) > + low = iv0->base; > + } > + /* {base, -C} < n. */ > + else if (TREE_CODE (iv0->step) == INTEGER_CST > + && tree_int_cst_sign_bit (iv0->step) && integer_zerop (iv1->step)) > + { > + step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv0->step), iv0->step); > + niter->niter = fold_build2 (MINUS_EXPR, niter_type, iv0->base, min); > + if (TREE_CODE (iv0->base) == INTEGER_CST) > + high = fold_build2 (PLUS_EXPR, type, iv0->base, one); > + else if (TREE_CODE (iv1->base) == INTEGER_CST) > + high = iv1->base; > + } > + else > + return false; > + > + /* (delta + step - 1) / step */ > + step = fold_convert (niter_type, step); > + niter->niter = fold_convert (niter_type, niter->niter); > + niter->niter = fold_build2 (PLUS_EXPR, niter_type, niter->niter, step); > + niter->niter = fold_build2 (FLOOR_DIV_EXPR, niter_type, niter->niter, step); > + > + tree m = fold_build2 (MINUS_EXPR, niter_type, high, low); > + m = fold_convert (niter_type, m); > + mpz_t mstep, tmp, mmax; > + mpz_init (mstep); > + mpz_init (tmp); > + mpz_init (mmax); > + wi::to_mpz (wi::to_wide (step), mstep, UNSIGNED); > + wi::to_mpz (wi::to_wide (m), mmax, UNSIGNED); > + mpz_add (tmp, mmax, mstep); > + mpz_sub_ui (tmp, tmp, 1); > + mpz_fdiv_q (tmp, tmp, mstep); > + niter->max = widest_int::from (wi::from_mpz (niter_type, tmp, false), > + TYPE_SIGN (niter_type)); This computation is similar to function number_of_iterations_lt, could we factor it out into an independent function? > + mpz_clear (mstep); > + mpz_clear (tmp); > + > + niter->may_be_zero > + = fold_build2 (LE_EXPR, boolean_type_node, iv1->base, iv0->base); If iv0->base and iv1->base are constant and iv1->base <= iv0->base, the number of iteration is actually zero, but here we rely on may_be_zero (== true), which loses information. Could we specially handle this case and do a fast return? Could you test this on some more targets(x86, aarch64) please? Otherwise LGTM. Thanks, bin > + > + niter->control.no_overflow = false; > + > + return true; > +} > + > /* Determines number of iterations of loop whose ending condition > is IV0 < IV1. TYPE is the type of the iv. The number of > iterations is stored to NITER. BNDS bounds the difference > @@ -1501,6 +1581,11 @@ number_of_iterations_lt (class loop *loop, tree type, affine_iv *iv0, > niter->bound = iv0->base; > } > > + /* {base, -C} < n, or n < {base, C} */ > + if (tree_int_cst_sign_bit (iv0->step) > + || (!integer_zerop (iv1->step) && !tree_int_cst_sign_bit (iv1->step))) > + return number_of_iterations_until_wrap (loop, type, iv0, iv1, niter); > + > delta = fold_build2 (MINUS_EXPR, niter_type, > fold_convert (niter_type, iv1->base), > fold_convert (niter_type, iv0->base)); > @@ -1665,62 +1750,6 @@ dump_affine_iv (FILE *file, affine_iv *iv) > } > } > > -/* Given exit condition IV0 CODE IV1 in TYPE, this function adjusts > - the condition for loop-until-wrap cases. For example: > - (unsigned){8, -1}_loop < 10 => {0, 1} != 9 > - 10 < (unsigned){0, max - 7}_loop => {0, 1} != 8 > - Return true if condition is successfully adjusted. */ > - > -static bool > -adjust_cond_for_loop_until_wrap (tree type, affine_iv *iv0, tree_code *code, > - affine_iv *iv1) > -{ > - /* Only support simple cases for the moment. */ > - if (TREE_CODE (iv0->base) != INTEGER_CST > - || TREE_CODE (iv1->base) != INTEGER_CST) > - return false; > - > - tree niter_type = unsigned_type_for (type), high, low; > - /* Case: i-- < 10. */ > - if (integer_zerop (iv1->step)) > - { > - /* TODO: Should handle case in which abs(step) != 1. */ > - if (!integer_minus_onep (iv0->step)) > - return false; > - /* Give up on infinite loop. */ > - if (*code == LE_EXPR > - && tree_int_cst_equal (iv1->base, TYPE_MAX_VALUE (type))) > - return false; > - high = fold_build2 (PLUS_EXPR, niter_type, > - fold_convert (niter_type, iv0->base), > - build_int_cst (niter_type, 1)); > - low = fold_convert (niter_type, TYPE_MIN_VALUE (type)); > - } > - else if (integer_zerop (iv0->step)) > - { > - /* TODO: Should handle case in which abs(step) != 1. */ > - if (!integer_onep (iv1->step)) > - return false; > - /* Give up on infinite loop. */ > - if (*code == LE_EXPR > - && tree_int_cst_equal (iv0->base, TYPE_MIN_VALUE (type))) > - return false; > - high = fold_convert (niter_type, TYPE_MAX_VALUE (type)); > - low = fold_build2 (MINUS_EXPR, niter_type, > - fold_convert (niter_type, iv1->base), > - build_int_cst (niter_type, 1)); > - } > - else > - gcc_unreachable (); > - > - iv0->base = low; > - iv0->step = fold_convert (niter_type, integer_one_node); > - iv1->base = high; > - iv1->step = build_int_cst (niter_type, 0); > - *code = NE_EXPR; > - return true; > -} > - > /* Determine the number of iterations according to condition (for staying > inside loop) which compares two induction variables using comparison > operator CODE. The induction variable on left side of the comparison > @@ -1855,15 +1884,6 @@ number_of_iterations_cond (class loop *loop, > return true; > } > > - /* Handle special case loops: while (i-- < 10) and while (10 < i++) by > - adjusting iv0, iv1 and code. */ > - if (code != NE_EXPR > - && (tree_int_cst_sign_bit (iv0->step) > - || (!integer_zerop (iv1->step) > - && !tree_int_cst_sign_bit (iv1->step))) > - && !adjust_cond_for_loop_until_wrap (type, iv0, &code, iv1)) > - return false; > - > /* OK, now we know we have a senseful loop. Handle several cases, depending > on what comparison operator is used. */ > bound_difference (loop, iv1->base, iv0->base, &bnds); > -- > 2.17.1 > ^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] Analyze niter for until-wrap condition [PR101145] 2021-07-01 7:22 ` Bin.Cheng @ 2021-07-01 11:32 ` guojiufu 0 siblings, 0 replies; 8+ messages in thread From: guojiufu @ 2021-07-01 11:32 UTC (permalink / raw) To: Bin.Cheng Cc: gcc-patches List, Richard Biener, amker, Segher Boessenkool, jlaw, Bill Schmidt, dje.gcc On 2021-07-01 15:22, Bin.Cheng wrote: > On Thu, Jul 1, 2021 at 10:06 AM Jiufu Guo via Gcc-patches > <gcc-patches@gcc.gnu.org> wrote: >> >> For code like: >> unsigned foo(unsigned val, unsigned start) >> { >> unsigned cnt = 0; >> for (unsigned i = start; i > val; ++i) >> cnt++; >> return cnt; >> } >> >> The number of iterations should be about UINT_MAX - start. >> >> There is function adjust_cond_for_loop_until_wrap which >> handles similar work for const bases. >> Like adjust_cond_for_loop_until_wrap, this patch enhance >> function number_of_iterations_cond/number_of_iterations_lt >> to analyze number of iterations for this kind of loop. >> >> Bootstrap and regtest pass on powerpc64le, is this ok for trunk? >> >> gcc/ChangeLog: >> >> PR tree-optimization/101145 >> * tree-ssa-loop-niter.c >> (number_of_iterations_until_wrap): New function. >> (number_of_iterations_lt): Invoke above function. >> (adjust_cond_for_loop_until_wrap): >> Merge to number_of_iterations_until_wrap. >> (number_of_iterations_cond): Update invokes for >> adjust_cond_for_loop_until_wrap and number_of_iterations_lt. >> >> gcc/testsuite/ChangeLog: >> >> PR tree-optimization/101145 >> * gcc.dg/vect/pr101145.c: New test. >> * gcc.dg/vect/pr101145.inc: New test. >> * gcc.dg/vect/pr101145_1.c: New test. >> * gcc.dg/vect/pr101145_2.c: New test. >> * gcc.dg/vect/pr101145_3.c: New test. >> --- >> gcc/testsuite/gcc.dg/vect/pr101145.c | 187 >> +++++++++++++++++++++++++ >> gcc/testsuite/gcc.dg/vect/pr101145.inc | 63 +++++++++ >> gcc/testsuite/gcc.dg/vect/pr101145_1.c | 15 ++ >> gcc/testsuite/gcc.dg/vect/pr101145_2.c | 15 ++ >> gcc/testsuite/gcc.dg/vect/pr101145_3.c | 15 ++ >> gcc/tree-ssa-loop-niter.c | 150 +++++++++++--------- >> 6 files changed, 380 insertions(+), 65 deletions(-) >> create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145.c >> create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145.inc >> create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145_1.c >> create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145_2.c >> create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145_3.c >> > >> diff --git a/gcc/tree-ssa-loop-niter.c b/gcc/tree-ssa-loop-niter.c >> index b5add827018..06db6a36ef8 100644 >> --- a/gcc/tree-ssa-loop-niter.c >> +++ b/gcc/tree-ssa-loop-niter.c >> @@ -1473,6 +1473,86 @@ assert_loop_rolls_lt (tree type, affine_iv >> *iv0, affine_iv *iv1, >> } >> } >> >> +/* Determines number of iterations of loop whose ending condition >> + is IV0 < IV1 which likes: {base, -C} < n, or n < {base, C}. >> + The number of iterations is stored to NITER. */ >> + >> +static bool >> +number_of_iterations_until_wrap (class loop *, tree type, affine_iv >> *iv0, >> + affine_iv *iv1, class tree_niter_desc >> *niter) >> +{ >> + tree niter_type = unsigned_type_for (type); >> + tree max, min; >> + >> + if (POINTER_TYPE_P (type)) >> + { >> + max = fold_convert (type, TYPE_MAX_VALUE (niter_type)); >> + min = fold_convert (type, TYPE_MIN_VALUE (niter_type)); >> + } >> + else >> + { >> + max = TYPE_MAX_VALUE (type); >> + min = TYPE_MIN_VALUE (type); >> + } >> + >> + tree high = max, low = min, one = build_int_cst (niter_type, 1); >> + tree step; >> + >> + /* n < {base, C}. */ >> + if (integer_zerop (iv0->step) && TREE_CODE (iv1->step) == >> INTEGER_CST >> + && !tree_int_cst_sign_bit (iv1->step)) >> + { >> + step = iv1->step; >> + niter->niter = fold_build2 (MINUS_EXPR, niter_type, max, >> iv1->base); > max/iv1->base could be of pointer type, not sure if this is canonical > though. Thanks. Pointer needs careful attention. I added case pr101145_3.c for pointer, as test, the iteration number is 7: 0xffffffffffffffe4 - 0xffffffffffffffff, where pointer type is pointer to int: "int *". It works as expected. I notice in number_of_iterations_lt, there are code likes: delta = fold_build2 (MINUS_EXPR, niter_type, fold_convert (niter_type, iv1->base), fold_convert (niter_type, iv0->base)); This would also be ok. > >> + if (TREE_CODE (iv1->base) == INTEGER_CST) >> + low = fold_build2 (MINUS_EXPR, type, iv1->base, one); >> + else if (TREE_CODE (iv0->base) == INTEGER_CST) >> + low = iv0->base; >> + } >> + /* {base, -C} < n. */ >> + else if (TREE_CODE (iv0->step) == INTEGER_CST >> + && tree_int_cst_sign_bit (iv0->step) && integer_zerop >> (iv1->step)) >> + { >> + step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv0->step), >> iv0->step); >> + niter->niter = fold_build2 (MINUS_EXPR, niter_type, iv0->base, >> min); >> + if (TREE_CODE (iv0->base) == INTEGER_CST) >> + high = fold_build2 (PLUS_EXPR, type, iv0->base, one); >> + else if (TREE_CODE (iv1->base) == INTEGER_CST) >> + high = iv1->base; >> + } >> + else >> + return false; >> + >> + /* (delta + step - 1) / step */ >> + step = fold_convert (niter_type, step); >> + niter->niter = fold_convert (niter_type, niter->niter); >> + niter->niter = fold_build2 (PLUS_EXPR, niter_type, niter->niter, >> step); >> + niter->niter = fold_build2 (FLOOR_DIV_EXPR, niter_type, >> niter->niter, step); >> + >> + tree m = fold_build2 (MINUS_EXPR, niter_type, high, low); >> + m = fold_convert (niter_type, m); >> + mpz_t mstep, tmp, mmax; >> + mpz_init (mstep); >> + mpz_init (tmp); >> + mpz_init (mmax); >> + wi::to_mpz (wi::to_wide (step), mstep, UNSIGNED); >> + wi::to_mpz (wi::to_wide (m), mmax, UNSIGNED); >> + mpz_add (tmp, mmax, mstep); >> + mpz_sub_ui (tmp, tmp, 1); >> + mpz_fdiv_q (tmp, tmp, mstep); >> + niter->max = widest_int::from (wi::from_mpz (niter_type, tmp, >> false), >> + TYPE_SIGN (niter_type)); > This computation is similar to function number_of_iterations_lt, could > we factor it out into an independent function? Thanks! Will update as you said. > >> + mpz_clear (mstep); >> + mpz_clear (tmp); >> + >> + niter->may_be_zero >> + = fold_build2 (LE_EXPR, boolean_type_node, iv1->base, iv0->base); > If iv0->base and iv1->base are constant and iv1->base <= iv0->base, > the number of iteration is actually zero, but here we rely on > may_be_zero (== true), which loses information. Could we specially > handle this case and do a fast return? As tests, if iv1->base <= iv0->base in user source code, this function will not be called, even number_of_iterations_exit_assumptions will not be called. But it is still may be possible to run into this function if some previous optimizations generate constant bases. Then a fast return would be useful, I will update the patch. > > Could you test this on some more targets(x86, aarch64) please? > Otherwise LGTM. I will have some tests on other targets. Hope there is no difference between targets for this patch :) BR, Jiufu Guo > > Thanks, > bin >> + >> + niter->control.no_overflow = false; >> + >> + return true; >> +} >> + >> /* Determines number of iterations of loop whose ending condition >> is IV0 < IV1. TYPE is the type of the iv. The number of >> iterations is stored to NITER. BNDS bounds the difference >> @@ -1501,6 +1581,11 @@ number_of_iterations_lt (class loop *loop, tree >> type, affine_iv *iv0, >> niter->bound = iv0->base; >> } >> >> + /* {base, -C} < n, or n < {base, C} */ >> + if (tree_int_cst_sign_bit (iv0->step) >> + || (!integer_zerop (iv1->step) && !tree_int_cst_sign_bit >> (iv1->step))) >> + return number_of_iterations_until_wrap (loop, type, iv0, iv1, >> niter); >> + >> delta = fold_build2 (MINUS_EXPR, niter_type, >> fold_convert (niter_type, iv1->base), >> fold_convert (niter_type, iv0->base)); >> @@ -1665,62 +1750,6 @@ dump_affine_iv (FILE *file, affine_iv *iv) >> } >> } >> >> -/* Given exit condition IV0 CODE IV1 in TYPE, this function adjusts >> - the condition for loop-until-wrap cases. For example: >> - (unsigned){8, -1}_loop < 10 => {0, 1} != 9 >> - 10 < (unsigned){0, max - 7}_loop => {0, 1} != 8 >> - Return true if condition is successfully adjusted. */ >> - >> -static bool >> -adjust_cond_for_loop_until_wrap (tree type, affine_iv *iv0, tree_code >> *code, >> - affine_iv *iv1) >> -{ >> - /* Only support simple cases for the moment. */ >> - if (TREE_CODE (iv0->base) != INTEGER_CST >> - || TREE_CODE (iv1->base) != INTEGER_CST) >> - return false; >> - >> - tree niter_type = unsigned_type_for (type), high, low; >> - /* Case: i-- < 10. */ >> - if (integer_zerop (iv1->step)) >> - { >> - /* TODO: Should handle case in which abs(step) != 1. */ >> - if (!integer_minus_onep (iv0->step)) >> - return false; >> - /* Give up on infinite loop. */ >> - if (*code == LE_EXPR >> - && tree_int_cst_equal (iv1->base, TYPE_MAX_VALUE (type))) >> - return false; >> - high = fold_build2 (PLUS_EXPR, niter_type, >> - fold_convert (niter_type, iv0->base), >> - build_int_cst (niter_type, 1)); >> - low = fold_convert (niter_type, TYPE_MIN_VALUE (type)); >> - } >> - else if (integer_zerop (iv0->step)) >> - { >> - /* TODO: Should handle case in which abs(step) != 1. */ >> - if (!integer_onep (iv1->step)) >> - return false; >> - /* Give up on infinite loop. */ >> - if (*code == LE_EXPR >> - && tree_int_cst_equal (iv0->base, TYPE_MIN_VALUE (type))) >> - return false; >> - high = fold_convert (niter_type, TYPE_MAX_VALUE (type)); >> - low = fold_build2 (MINUS_EXPR, niter_type, >> - fold_convert (niter_type, iv1->base), >> - build_int_cst (niter_type, 1)); >> - } >> - else >> - gcc_unreachable (); >> - >> - iv0->base = low; >> - iv0->step = fold_convert (niter_type, integer_one_node); >> - iv1->base = high; >> - iv1->step = build_int_cst (niter_type, 0); >> - *code = NE_EXPR; >> - return true; >> -} >> - >> /* Determine the number of iterations according to condition (for >> staying >> inside loop) which compares two induction variables using >> comparison >> operator CODE. The induction variable on left side of the >> comparison >> @@ -1855,15 +1884,6 @@ number_of_iterations_cond (class loop *loop, >> return true; >> } >> >> - /* Handle special case loops: while (i-- < 10) and while (10 < i++) >> by >> - adjusting iv0, iv1 and code. */ >> - if (code != NE_EXPR >> - && (tree_int_cst_sign_bit (iv0->step) >> - || (!integer_zerop (iv1->step) >> - && !tree_int_cst_sign_bit (iv1->step))) >> - && !adjust_cond_for_loop_until_wrap (type, iv0, &code, iv1)) >> - return false; >> - >> /* OK, now we know we have a senseful loop. Handle several cases, >> depending >> on what comparison operator is used. */ >> bound_difference (loop, iv1->base, iv0->base, &bnds); >> -- >> 2.17.1 >> ^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] Analyze niter for until-wrap condition [PR101145] 2021-07-01 2:05 [PATCH] Analyze niter for until-wrap condition [PR101145] Jiufu Guo 2021-07-01 7:22 ` Bin.Cheng @ 2021-07-01 12:35 ` Richard Biener 2021-07-01 14:12 ` guojiufu 2021-07-02 7:55 ` guojiufu 1 sibling, 2 replies; 8+ messages in thread From: Richard Biener @ 2021-07-01 12:35 UTC (permalink / raw) To: Jiufu Guo; +Cc: gcc-patches, wschmidt, segher, dje.gcc, jlaw, amker On Thu, 1 Jul 2021, Jiufu Guo wrote: > For code like: > unsigned foo(unsigned val, unsigned start) > { > unsigned cnt = 0; > for (unsigned i = start; i > val; ++i) > cnt++; > return cnt; > } > > The number of iterations should be about UINT_MAX - start. For unsigned foo(unsigned val, unsigned start) { unsigned cnt = 0; for (unsigned i = start; i >= val; ++i) cnt++; return cnt; } and val == 0 the loop never terminates. I don't see anywhere in the patch that you disregard GE_EXPR and I remember the code handles GE as well as GT? From a quick look this is also not covered by a testcase you add - not exactly sure how it would materialize in a miscompilation. > There is function adjust_cond_for_loop_until_wrap which > handles similar work for const bases. > Like adjust_cond_for_loop_until_wrap, this patch enhance > function number_of_iterations_cond/number_of_iterations_lt > to analyze number of iterations for this kind of loop. > > Bootstrap and regtest pass on powerpc64le, is this ok for trunk? > > gcc/ChangeLog: > > PR tree-optimization/101145 > * tree-ssa-loop-niter.c > (number_of_iterations_until_wrap): New function. > (number_of_iterations_lt): Invoke above function. > (adjust_cond_for_loop_until_wrap): > Merge to number_of_iterations_until_wrap. > (number_of_iterations_cond): Update invokes for > adjust_cond_for_loop_until_wrap and number_of_iterations_lt. > > gcc/testsuite/ChangeLog: > > PR tree-optimization/101145 > * gcc.dg/vect/pr101145.c: New test. > * gcc.dg/vect/pr101145.inc: New test. > * gcc.dg/vect/pr101145_1.c: New test. > * gcc.dg/vect/pr101145_2.c: New test. > * gcc.dg/vect/pr101145_3.c: New test. > --- > gcc/testsuite/gcc.dg/vect/pr101145.c | 187 +++++++++++++++++++++++++ > gcc/testsuite/gcc.dg/vect/pr101145.inc | 63 +++++++++ > gcc/testsuite/gcc.dg/vect/pr101145_1.c | 15 ++ > gcc/testsuite/gcc.dg/vect/pr101145_2.c | 15 ++ > gcc/testsuite/gcc.dg/vect/pr101145_3.c | 15 ++ > gcc/tree-ssa-loop-niter.c | 150 +++++++++++--------- > 6 files changed, 380 insertions(+), 65 deletions(-) > create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145.c > create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145.inc > create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145_1.c > create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145_2.c > create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145_3.c > > diff --git a/gcc/testsuite/gcc.dg/vect/pr101145.c b/gcc/testsuite/gcc.dg/vect/pr101145.c > new file mode 100644 > index 00000000000..74031b031cf > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/pr101145.c > @@ -0,0 +1,187 @@ > +/* { dg-require-effective-target vect_int } */ > +/* { dg-options "-O3 -fdump-tree-vect-details" } */ > +#include <limits.h> > + > +unsigned __attribute__ ((noinline)) > +foo (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned n) > +{ > + while (n < ++l) > + *a++ = *b++ + 1; > + return l; > +} > + > +unsigned __attribute__ ((noinline)) > +foo_1 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned) > +{ > + while (UINT_MAX - 64 < ++l) > + *a++ = *b++ + 1; > + return l; > +} > + > +unsigned __attribute__ ((noinline)) > +foo_2 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned n) > +{ > + l = UINT_MAX - 32; > + while (n < ++l) > + *a++ = *b++ + 1; > + return l; > +} > + > +unsigned __attribute__ ((noinline)) > +foo_3 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned n) > +{ > + while (n <= ++l) > + *a++ = *b++ + 1; > + return l; > +} > + > +unsigned __attribute__ ((noinline)) > +foo_4 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned n) > +{ // infininate > + while (0 <= ++l) > + *a++ = *b++ + 1; > + return l; > +} > + > +unsigned __attribute__ ((noinline)) > +foo_5 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned n) > +{ > + //no loop > + l = UINT_MAX; > + while (n < ++l) > + *a++ = *b++ + 1; > + return l; > +} > + > +unsigned __attribute__ ((noinline)) > +bar (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned n) > +{ > + while (--l < n) > + *a++ = *b++ + 1; > + return l; > +} > + > +unsigned __attribute__ ((noinline)) > +bar_1 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned) > +{ > + while (--l < 64) > + *a++ = *b++ + 1; > + return l; > +} > + > +unsigned __attribute__ ((noinline)) > +bar_2 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned n) > +{ > + l = 32; > + while (--l < n) > + *a++ = *b++ + 1; > + return l; > +} > + > + > +int a[3200], b[3200]; > +int fail; > + > +int > +main () > +{ > + unsigned l, n; > + unsigned res; > + /* l > n*/ > + n = UINT_MAX - 64; > + l = n + 32; > + res = foo (a, b, l, n); > + if (res != 0) > + fail++; > + > + l = n; > + res = foo (a, b, l, n); > + if (res != 0) > + fail++; > + > + l = n - 1; > + res = foo (a, b, l, n); > + if (res != l + 1) > + fail++; > + > + l = n - 32; > + res = foo (a, b, l, n); > + if (res != l + 1) > + fail++; > + > + l = UINT_MAX; > + res = foo (a, b, l, n); > + if (res != 0) > + fail++; > + > + l = n + 32; > + res = foo_1 (a, b, l, n); > + if (res != 0) > + fail++; > + > + l = n + 32; > + res = foo_2 (a, b, l, n); > + if (res != 0) > + fail++; > + > + l = n; > + res = foo_3 (a, b, l, n); > + if (res != 0) > + fail++; > + > + l = n - 1; > + res = foo_3 (a, b, l, n); > + if (res != 0) > + fail++; > + > + l = n - 2; > + res = foo_3 (a, b, l, n); > + if (res != l + 1) > + fail++; > + > + res = foo_5 (a, b, l, n); > + if (res != 0) > + fail++; > + > + n = 64; > + l = n - 32; > + res = bar (a, b, l, n); > + res++; > + if (res != 0) > + fail++; > + > + l = n; > + res = bar (a, b, l, n); > + res++; > + if (res != 0) > + fail++; > + > + l = n + 1; > + res = bar (a, b, l, n); > + res++; > + if (res != l) > + fail++; > + > + l = 0; > + res = bar (a, b, l, n); > + res++; > + if (res != l) > + fail++; > + > + l = 32; > + res = bar_1 (a, b, l, n); > + res++; > + if (res != 0) > + fail++; > + > + res = bar_1 (a, b, l, n); > + res++; > + if (res != 0) > + fail++; > + > + if (fail) > + __builtin_abort (); > + return 0; > +} > + > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 7 "vect" } } */ > diff --git a/gcc/testsuite/gcc.dg/vect/pr101145.inc b/gcc/testsuite/gcc.dg/vect/pr101145.inc > new file mode 100644 > index 00000000000..6eed3fa8aca > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/pr101145.inc > @@ -0,0 +1,63 @@ > +TYPE __attribute__ ((noinline)) > +foo_sign (int *__restrict__ a, int *__restrict__ b, TYPE l, TYPE n) > +{ > + for (l = L_BASE; n < l; l += C) > + *a++ = *b++ + 1; > + return l; > +} > + > +TYPE __attribute__ ((noinline)) > +bar_sign (int *__restrict__ a, int *__restrict__ b, TYPE l, TYPE n) > +{ > + for (l = L_BASE_DOWN; l < n; l -= C) > + *a++ = *b++ + 1; > + return l; > +} > + > +int __attribute__ ((noinline)) neq (int a, int b) { return a != b; } > + > +int a[1000], b[1000]; > +int fail; > + > +int > +main () > +{ > + TYPE res; > + TYPE l; > + TYPE n; > + n = N_BASE; > + l = n - C; > + res = foo_sign (a, b, l, n); > + if (res != l) > + fail++; > + > + l = n; > + res = foo_sign (a, b, l, n); > + if (res != l) > + fail++; > + > + l = n + C; > + res = foo_sign (a, b, l, n); > + if (neq ((res - MIN) / C, 0)) > + fail++; > + > + n = N_BASE_DOWN; > + l = n - C; > + res = bar_sign (a, b, l, n); > + if (neq ((MAX - res) / C, 0)) > + fail++; > + > + l = n; > + res = bar_sign (a, b, l, n); > + if (res != l) > + fail++; > + > + l = n + C; > + res = bar_sign (a, b, l, n); > + if (res != l) > + fail++; > + > + if (fail) > + __builtin_abort (); > + return 0; > +} > diff --git a/gcc/testsuite/gcc.dg/vect/pr101145_1.c b/gcc/testsuite/gcc.dg/vect/pr101145_1.c > new file mode 100644 > index 00000000000..94f6b99b893 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/pr101145_1.c > @@ -0,0 +1,15 @@ > +/* { dg-require-effective-target vect_int } */ > +/* { dg-options "-O3 -fdump-tree-vect-details" } */ > +#define TYPE signed char > +#define MIN -128 > +#define MAX 127 > +#define N_BASE (MAX - 32) > +#define N_BASE_DOWN (MIN + 32) > + > +#define C 3 > +#define L_BASE l > +#define L_BASE_DOWN l > + > +#include "pr101145.inc" > + > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */ > diff --git a/gcc/testsuite/gcc.dg/vect/pr101145_2.c b/gcc/testsuite/gcc.dg/vect/pr101145_2.c > new file mode 100644 > index 00000000000..d3cfc9e01e1 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/pr101145_2.c > @@ -0,0 +1,15 @@ > +/* { dg-require-effective-target vect_int } */ > +/* { dg-options "-O3 -fdump-tree-vect-details" } */ > +#define TYPE unsigned char > +#define MIN 0 > +#define MAX 255 > +#define N_BASE (MAX - 32 + 1) > +#define N_BASE_DOWN (MIN + 32) > + > +#define C 2 > +#define L_BASE l > +#define L_BASE_DOWN l > + > +#include "pr101145.inc" > + > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */ > diff --git a/gcc/testsuite/gcc.dg/vect/pr101145_3.c b/gcc/testsuite/gcc.dg/vect/pr101145_3.c > new file mode 100644 > index 00000000000..e2cf9b1f7e6 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/pr101145_3.c > @@ -0,0 +1,15 @@ > +/* { dg-require-effective-target vect_int } */ > +/* { dg-options "-O3 -fdump-tree-vect-details" } */ > +#define TYPE int * > +#define MIN ((TYPE)0) > +#define MAX ((TYPE)((long long)-1)) > +#define N_BASE ((TYPE) (-32)) > +#define N_BASE_DOWN (MIN + 32) > + > +#define C 1 > +#define L_BASE l > +#define L_BASE_DOWN l > + > +#include "pr101145.inc" > + > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */ > diff --git a/gcc/tree-ssa-loop-niter.c b/gcc/tree-ssa-loop-niter.c > index b5add827018..06db6a36ef8 100644 > --- a/gcc/tree-ssa-loop-niter.c > +++ b/gcc/tree-ssa-loop-niter.c > @@ -1473,6 +1473,86 @@ assert_loop_rolls_lt (tree type, affine_iv *iv0, affine_iv *iv1, > } > } > > +/* Determines number of iterations of loop whose ending condition > + is IV0 < IV1 which likes: {base, -C} < n, or n < {base, C}. > + The number of iterations is stored to NITER. */ > + > +static bool > +number_of_iterations_until_wrap (class loop *, tree type, affine_iv *iv0, > + affine_iv *iv1, class tree_niter_desc *niter) > +{ > + tree niter_type = unsigned_type_for (type); > + tree max, min; > + > + if (POINTER_TYPE_P (type)) > + { > + max = fold_convert (type, TYPE_MAX_VALUE (niter_type)); > + min = fold_convert (type, TYPE_MIN_VALUE (niter_type)); > + } > + else > + { > + max = TYPE_MAX_VALUE (type); > + min = TYPE_MIN_VALUE (type); > + } It would be cheaper to use wide_int here - wi::min/max_value (TYPE_PRECISION (...), TYPE_SIGN (...)) and in the simplification code below. > + tree high = max, low = min, one = build_int_cst (niter_type, 1); > + tree step; > + > + /* n < {base, C}. */ > + if (integer_zerop (iv0->step) && TREE_CODE (iv1->step) == INTEGER_CST > + && !tree_int_cst_sign_bit (iv1->step)) > + { > + step = iv1->step; > + niter->niter = fold_build2 (MINUS_EXPR, niter_type, max, iv1->base); > + if (TREE_CODE (iv1->base) == INTEGER_CST) > + low = fold_build2 (MINUS_EXPR, type, iv1->base, one); > + else if (TREE_CODE (iv0->base) == INTEGER_CST) > + low = iv0->base; > + } > + /* {base, -C} < n. */ > + else if (TREE_CODE (iv0->step) == INTEGER_CST > + && tree_int_cst_sign_bit (iv0->step) && integer_zerop (iv1->step)) > + { > + step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv0->step), iv0->step); > + niter->niter = fold_build2 (MINUS_EXPR, niter_type, iv0->base, min); > + if (TREE_CODE (iv0->base) == INTEGER_CST) > + high = fold_build2 (PLUS_EXPR, type, iv0->base, one); > + else if (TREE_CODE (iv1->base) == INTEGER_CST) > + high = iv1->base; > + } > + else > + return false; > + > + /* (delta + step - 1) / step */ > + step = fold_convert (niter_type, step); > + niter->niter = fold_convert (niter_type, niter->niter); > + niter->niter = fold_build2 (PLUS_EXPR, niter_type, niter->niter, step); > + niter->niter = fold_build2 (FLOOR_DIV_EXPR, niter_type, niter->niter, step); > + > + tree m = fold_build2 (MINUS_EXPR, niter_type, high, low); > + m = fold_convert (niter_type, m); > + mpz_t mstep, tmp, mmax; > + mpz_init (mstep); > + mpz_init (tmp); > + mpz_init (mmax); > + wi::to_mpz (wi::to_wide (step), mstep, UNSIGNED); > + wi::to_mpz (wi::to_wide (m), mmax, UNSIGNED); > + mpz_add (tmp, mmax, mstep); > + mpz_sub_ui (tmp, tmp, 1); > + mpz_fdiv_q (tmp, tmp, mstep); > + niter->max = widest_int::from (wi::from_mpz (niter_type, tmp, false), > + TYPE_SIGN (niter_type)); > + mpz_clear (mstep); > + mpz_clear (tmp); You forget to clear mmax? I wonder why you need to involve GMP here at all since this is just add/sub/divide - you could use widest_int directly I think. At least the back-and-forth between trees, wide-int and mpz looks odd. Richard. > + niter->may_be_zero > + = fold_build2 (LE_EXPR, boolean_type_node, iv1->base, iv0->base); > + > + niter->control.no_overflow = false; > + > + return true; > +} > + > /* Determines number of iterations of loop whose ending condition > is IV0 < IV1. TYPE is the type of the iv. The number of > iterations is stored to NITER. BNDS bounds the difference > @@ -1501,6 +1581,11 @@ number_of_iterations_lt (class loop *loop, tree type, affine_iv *iv0, > niter->bound = iv0->base; > } > > + /* {base, -C} < n, or n < {base, C} */ > + if (tree_int_cst_sign_bit (iv0->step) > + || (!integer_zerop (iv1->step) && !tree_int_cst_sign_bit (iv1->step))) > + return number_of_iterations_until_wrap (loop, type, iv0, iv1, niter); > + > delta = fold_build2 (MINUS_EXPR, niter_type, > fold_convert (niter_type, iv1->base), > fold_convert (niter_type, iv0->base)); > @@ -1665,62 +1750,6 @@ dump_affine_iv (FILE *file, affine_iv *iv) > } > } > > -/* Given exit condition IV0 CODE IV1 in TYPE, this function adjusts > - the condition for loop-until-wrap cases. For example: > - (unsigned){8, -1}_loop < 10 => {0, 1} != 9 > - 10 < (unsigned){0, max - 7}_loop => {0, 1} != 8 > - Return true if condition is successfully adjusted. */ > - > -static bool > -adjust_cond_for_loop_until_wrap (tree type, affine_iv *iv0, tree_code *code, > - affine_iv *iv1) > -{ > - /* Only support simple cases for the moment. */ > - if (TREE_CODE (iv0->base) != INTEGER_CST > - || TREE_CODE (iv1->base) != INTEGER_CST) > - return false; > - > - tree niter_type = unsigned_type_for (type), high, low; > - /* Case: i-- < 10. */ > - if (integer_zerop (iv1->step)) > - { > - /* TODO: Should handle case in which abs(step) != 1. */ > - if (!integer_minus_onep (iv0->step)) > - return false; > - /* Give up on infinite loop. */ > - if (*code == LE_EXPR > - && tree_int_cst_equal (iv1->base, TYPE_MAX_VALUE (type))) > - return false; > - high = fold_build2 (PLUS_EXPR, niter_type, > - fold_convert (niter_type, iv0->base), > - build_int_cst (niter_type, 1)); > - low = fold_convert (niter_type, TYPE_MIN_VALUE (type)); > - } > - else if (integer_zerop (iv0->step)) > - { > - /* TODO: Should handle case in which abs(step) != 1. */ > - if (!integer_onep (iv1->step)) > - return false; > - /* Give up on infinite loop. */ > - if (*code == LE_EXPR > - && tree_int_cst_equal (iv0->base, TYPE_MIN_VALUE (type))) > - return false; > - high = fold_convert (niter_type, TYPE_MAX_VALUE (type)); > - low = fold_build2 (MINUS_EXPR, niter_type, > - fold_convert (niter_type, iv1->base), > - build_int_cst (niter_type, 1)); > - } > - else > - gcc_unreachable (); > - > - iv0->base = low; > - iv0->step = fold_convert (niter_type, integer_one_node); > - iv1->base = high; > - iv1->step = build_int_cst (niter_type, 0); > - *code = NE_EXPR; > - return true; > -} > - > /* Determine the number of iterations according to condition (for staying > inside loop) which compares two induction variables using comparison > operator CODE. The induction variable on left side of the comparison > @@ -1855,15 +1884,6 @@ number_of_iterations_cond (class loop *loop, > return true; > } > > - /* Handle special case loops: while (i-- < 10) and while (10 < i++) by > - adjusting iv0, iv1 and code. */ > - if (code != NE_EXPR > - && (tree_int_cst_sign_bit (iv0->step) > - || (!integer_zerop (iv1->step) > - && !tree_int_cst_sign_bit (iv1->step))) > - && !adjust_cond_for_loop_until_wrap (type, iv0, &code, iv1)) > - return false; > - > /* OK, now we know we have a senseful loop. Handle several cases, depending > on what comparison operator is used. */ > bound_difference (loop, iv1->base, iv0->base, &bnds); > -- Richard Biener <rguenther@suse.de> SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg, Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg) ^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] Analyze niter for until-wrap condition [PR101145] 2021-07-01 12:35 ` Richard Biener @ 2021-07-01 14:12 ` guojiufu 2021-07-02 0:51 ` Bin.Cheng 2021-07-02 7:55 ` guojiufu 1 sibling, 1 reply; 8+ messages in thread From: guojiufu @ 2021-07-01 14:12 UTC (permalink / raw) To: Richard Biener; +Cc: gcc-patches, wschmidt, segher, dje.gcc, jlaw, amker On 2021-07-01 20:35, Richard Biener wrote: > On Thu, 1 Jul 2021, Jiufu Guo wrote: > >> For code like: >> unsigned foo(unsigned val, unsigned start) >> { >> unsigned cnt = 0; >> for (unsigned i = start; i > val; ++i) >> cnt++; >> return cnt; >> } >> >> The number of iterations should be about UINT_MAX - start. > > For > > unsigned foo(unsigned val, unsigned start) > { > unsigned cnt = 0; > for (unsigned i = start; i >= val; ++i) > cnt++; > return cnt; > } > > and val == 0 the loop never terminates. I don't see anywhere > in the patch that you disregard GE_EXPR and I remember > the code handles GE as well as GT? From a quick look this is > also not covered by a testcase you add - not exactly sure > how it would materialize in a miscompilation. In number_of_iterations_cond, there is code: if (code == GE_EXPR || code == GT_EXPR || (code == NE_EXPR && integer_zerop (iv0->step))) { std::swap (iv0, iv1); code = swap_tree_comparison (code); } It converts "GT/GE" (i >= val) to "LT/LE" (val <= i), and LE (val <= i) is converted to LT (val - 1 < i). So, the code is added to number_of_iterations_lt. But, this patch leads mis-compilation for unsigned "i >= val" as above transforms: converting LE (val <= i) to LT (val - 1 < i) seems not appropriate (e.g where val=0). Thanks for pointing out this!!! I would investigate a way to handle this correctly. A possible way maybe just to return false for this kind of LE. Any suggestions? > >> There is function adjust_cond_for_loop_until_wrap which >> handles similar work for const bases. >> Like adjust_cond_for_loop_until_wrap, this patch enhance >> function number_of_iterations_cond/number_of_iterations_lt >> to analyze number of iterations for this kind of loop. >> >> Bootstrap and regtest pass on powerpc64le, is this ok for trunk? >> >> gcc/ChangeLog: >> >> PR tree-optimization/101145 >> * tree-ssa-loop-niter.c >> (number_of_iterations_until_wrap): New function. >> (number_of_iterations_lt): Invoke above function. >> (adjust_cond_for_loop_until_wrap): >> Merge to number_of_iterations_until_wrap. >> (number_of_iterations_cond): Update invokes for >> adjust_cond_for_loop_until_wrap and number_of_iterations_lt. >> >> gcc/testsuite/ChangeLog: >> >> PR tree-optimization/101145 >> * gcc.dg/vect/pr101145.c: New test. >> * gcc.dg/vect/pr101145.inc: New test. >> * gcc.dg/vect/pr101145_1.c: New test. >> * gcc.dg/vect/pr101145_2.c: New test. >> * gcc.dg/vect/pr101145_3.c: New test. >> --- >> gcc/testsuite/gcc.dg/vect/pr101145.c | 187 >> +++++++++++++++++++++++++ >> gcc/testsuite/gcc.dg/vect/pr101145.inc | 63 +++++++++ >> gcc/testsuite/gcc.dg/vect/pr101145_1.c | 15 ++ >> gcc/testsuite/gcc.dg/vect/pr101145_2.c | 15 ++ >> gcc/testsuite/gcc.dg/vect/pr101145_3.c | 15 ++ >> gcc/tree-ssa-loop-niter.c | 150 +++++++++++--------- >> 6 files changed, 380 insertions(+), 65 deletions(-) >> create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145.c >> create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145.inc >> create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145_1.c >> create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145_2.c >> create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145_3.c >> >> diff --git a/gcc/testsuite/gcc.dg/vect/pr101145.c >> b/gcc/testsuite/gcc.dg/vect/pr101145.c >> new file mode 100644 >> index 00000000000..74031b031cf >> --- /dev/null >> +++ b/gcc/testsuite/gcc.dg/vect/pr101145.c >> @@ -0,0 +1,187 @@ >> +/* { dg-require-effective-target vect_int } */ >> +/* { dg-options "-O3 -fdump-tree-vect-details" } */ >> +#include <limits.h> >> + >> +unsigned __attribute__ ((noinline)) >> +foo (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned >> n) >> +{ >> + while (n < ++l) >> + *a++ = *b++ + 1; >> + return l; >> +} >> + >> +unsigned __attribute__ ((noinline)) >> +foo_1 (int *__restrict__ a, int *__restrict__ b, unsigned l, >> unsigned) >> +{ >> + while (UINT_MAX - 64 < ++l) >> + *a++ = *b++ + 1; >> + return l; >> +} >> + >> +unsigned __attribute__ ((noinline)) >> +foo_2 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned >> n) >> +{ >> + l = UINT_MAX - 32; >> + while (n < ++l) >> + *a++ = *b++ + 1; >> + return l; >> +} >> + >> +unsigned __attribute__ ((noinline)) >> +foo_3 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned >> n) >> +{ >> + while (n <= ++l) >> + *a++ = *b++ + 1; >> + return l; >> +} >> + >> +unsigned __attribute__ ((noinline)) >> +foo_4 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned >> n) >> +{ // infininate >> + while (0 <= ++l) >> + *a++ = *b++ + 1; >> + return l; >> +} >> + >> +unsigned __attribute__ ((noinline)) >> +foo_5 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned >> n) >> +{ >> + //no loop >> + l = UINT_MAX; >> + while (n < ++l) >> + *a++ = *b++ + 1; >> + return l; >> +} >> + >> +unsigned __attribute__ ((noinline)) >> +bar (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned >> n) >> +{ >> + while (--l < n) >> + *a++ = *b++ + 1; >> + return l; >> +} >> + >> +unsigned __attribute__ ((noinline)) >> +bar_1 (int *__restrict__ a, int *__restrict__ b, unsigned l, >> unsigned) >> +{ >> + while (--l < 64) >> + *a++ = *b++ + 1; >> + return l; >> +} >> + >> +unsigned __attribute__ ((noinline)) >> +bar_2 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned >> n) >> +{ >> + l = 32; >> + while (--l < n) >> + *a++ = *b++ + 1; >> + return l; >> +} >> + >> + >> +int a[3200], b[3200]; >> +int fail; >> + >> +int >> +main () >> +{ >> + unsigned l, n; >> + unsigned res; >> + /* l > n*/ >> + n = UINT_MAX - 64; >> + l = n + 32; >> + res = foo (a, b, l, n); >> + if (res != 0) >> + fail++; >> + >> + l = n; >> + res = foo (a, b, l, n); >> + if (res != 0) >> + fail++; >> + >> + l = n - 1; >> + res = foo (a, b, l, n); >> + if (res != l + 1) >> + fail++; >> + >> + l = n - 32; >> + res = foo (a, b, l, n); >> + if (res != l + 1) >> + fail++; >> + >> + l = UINT_MAX; >> + res = foo (a, b, l, n); >> + if (res != 0) >> + fail++; >> + >> + l = n + 32; >> + res = foo_1 (a, b, l, n); >> + if (res != 0) >> + fail++; >> + >> + l = n + 32; >> + res = foo_2 (a, b, l, n); >> + if (res != 0) >> + fail++; >> + >> + l = n; >> + res = foo_3 (a, b, l, n); >> + if (res != 0) >> + fail++; >> + >> + l = n - 1; >> + res = foo_3 (a, b, l, n); >> + if (res != 0) >> + fail++; >> + >> + l = n - 2; >> + res = foo_3 (a, b, l, n); >> + if (res != l + 1) >> + fail++; >> + >> + res = foo_5 (a, b, l, n); >> + if (res != 0) >> + fail++; >> + >> + n = 64; >> + l = n - 32; >> + res = bar (a, b, l, n); >> + res++; >> + if (res != 0) >> + fail++; >> + >> + l = n; >> + res = bar (a, b, l, n); >> + res++; >> + if (res != 0) >> + fail++; >> + >> + l = n + 1; >> + res = bar (a, b, l, n); >> + res++; >> + if (res != l) >> + fail++; >> + >> + l = 0; >> + res = bar (a, b, l, n); >> + res++; >> + if (res != l) >> + fail++; >> + >> + l = 32; >> + res = bar_1 (a, b, l, n); >> + res++; >> + if (res != 0) >> + fail++; >> + >> + res = bar_1 (a, b, l, n); >> + res++; >> + if (res != 0) >> + fail++; >> + >> + if (fail) >> + __builtin_abort (); >> + return 0; >> +} >> + >> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 7 "vect" } >> } */ >> diff --git a/gcc/testsuite/gcc.dg/vect/pr101145.inc >> b/gcc/testsuite/gcc.dg/vect/pr101145.inc >> new file mode 100644 >> index 00000000000..6eed3fa8aca >> --- /dev/null >> +++ b/gcc/testsuite/gcc.dg/vect/pr101145.inc >> @@ -0,0 +1,63 @@ >> +TYPE __attribute__ ((noinline)) >> +foo_sign (int *__restrict__ a, int *__restrict__ b, TYPE l, TYPE n) >> +{ >> + for (l = L_BASE; n < l; l += C) >> + *a++ = *b++ + 1; >> + return l; >> +} >> + >> +TYPE __attribute__ ((noinline)) >> +bar_sign (int *__restrict__ a, int *__restrict__ b, TYPE l, TYPE n) >> +{ >> + for (l = L_BASE_DOWN; l < n; l -= C) >> + *a++ = *b++ + 1; >> + return l; >> +} >> + >> +int __attribute__ ((noinline)) neq (int a, int b) { return a != b; } >> + >> +int a[1000], b[1000]; >> +int fail; >> + >> +int >> +main () >> +{ >> + TYPE res; >> + TYPE l; >> + TYPE n; >> + n = N_BASE; >> + l = n - C; >> + res = foo_sign (a, b, l, n); >> + if (res != l) >> + fail++; >> + >> + l = n; >> + res = foo_sign (a, b, l, n); >> + if (res != l) >> + fail++; >> + >> + l = n + C; >> + res = foo_sign (a, b, l, n); >> + if (neq ((res - MIN) / C, 0)) >> + fail++; >> + >> + n = N_BASE_DOWN; >> + l = n - C; >> + res = bar_sign (a, b, l, n); >> + if (neq ((MAX - res) / C, 0)) >> + fail++; >> + >> + l = n; >> + res = bar_sign (a, b, l, n); >> + if (res != l) >> + fail++; >> + >> + l = n + C; >> + res = bar_sign (a, b, l, n); >> + if (res != l) >> + fail++; >> + >> + if (fail) >> + __builtin_abort (); >> + return 0; >> +} >> diff --git a/gcc/testsuite/gcc.dg/vect/pr101145_1.c >> b/gcc/testsuite/gcc.dg/vect/pr101145_1.c >> new file mode 100644 >> index 00000000000..94f6b99b893 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.dg/vect/pr101145_1.c >> @@ -0,0 +1,15 @@ >> +/* { dg-require-effective-target vect_int } */ >> +/* { dg-options "-O3 -fdump-tree-vect-details" } */ >> +#define TYPE signed char >> +#define MIN -128 >> +#define MAX 127 >> +#define N_BASE (MAX - 32) >> +#define N_BASE_DOWN (MIN + 32) >> + >> +#define C 3 >> +#define L_BASE l >> +#define L_BASE_DOWN l >> + >> +#include "pr101145.inc" >> + >> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } >> } */ >> diff --git a/gcc/testsuite/gcc.dg/vect/pr101145_2.c >> b/gcc/testsuite/gcc.dg/vect/pr101145_2.c >> new file mode 100644 >> index 00000000000..d3cfc9e01e1 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.dg/vect/pr101145_2.c >> @@ -0,0 +1,15 @@ >> +/* { dg-require-effective-target vect_int } */ >> +/* { dg-options "-O3 -fdump-tree-vect-details" } */ >> +#define TYPE unsigned char >> +#define MIN 0 >> +#define MAX 255 >> +#define N_BASE (MAX - 32 + 1) >> +#define N_BASE_DOWN (MIN + 32) >> + >> +#define C 2 >> +#define L_BASE l >> +#define L_BASE_DOWN l >> + >> +#include "pr101145.inc" >> + >> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } >> } */ >> diff --git a/gcc/testsuite/gcc.dg/vect/pr101145_3.c >> b/gcc/testsuite/gcc.dg/vect/pr101145_3.c >> new file mode 100644 >> index 00000000000..e2cf9b1f7e6 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.dg/vect/pr101145_3.c >> @@ -0,0 +1,15 @@ >> +/* { dg-require-effective-target vect_int } */ >> +/* { dg-options "-O3 -fdump-tree-vect-details" } */ >> +#define TYPE int * >> +#define MIN ((TYPE)0) >> +#define MAX ((TYPE)((long long)-1)) >> +#define N_BASE ((TYPE) (-32)) >> +#define N_BASE_DOWN (MIN + 32) >> + >> +#define C 1 >> +#define L_BASE l >> +#define L_BASE_DOWN l >> + >> +#include "pr101145.inc" >> + >> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } >> } */ >> diff --git a/gcc/tree-ssa-loop-niter.c b/gcc/tree-ssa-loop-niter.c >> index b5add827018..06db6a36ef8 100644 >> --- a/gcc/tree-ssa-loop-niter.c >> +++ b/gcc/tree-ssa-loop-niter.c >> @@ -1473,6 +1473,86 @@ assert_loop_rolls_lt (tree type, affine_iv >> *iv0, affine_iv *iv1, >> } >> } >> >> +/* Determines number of iterations of loop whose ending condition >> + is IV0 < IV1 which likes: {base, -C} < n, or n < {base, C}. >> + The number of iterations is stored to NITER. */ >> + >> +static bool >> +number_of_iterations_until_wrap (class loop *, tree type, affine_iv >> *iv0, >> + affine_iv *iv1, class tree_niter_desc *niter) >> +{ >> + tree niter_type = unsigned_type_for (type); >> + tree max, min; >> + >> + if (POINTER_TYPE_P (type)) >> + { >> + max = fold_convert (type, TYPE_MAX_VALUE (niter_type)); >> + min = fold_convert (type, TYPE_MIN_VALUE (niter_type)); >> + } >> + else >> + { >> + max = TYPE_MAX_VALUE (type); >> + min = TYPE_MIN_VALUE (type); >> + } > > It would be cheaper to use wide_int here - wi::min/max_value > (TYPE_PRECISION (...), TYPE_SIGN (...)) and in the simplification > code below. Thanks! I think you mean to use wide_int for "tree high = max, low = min,". I would check and update the patch to use wide_int. The MAX/MIN would be in type 'tree' when binding to MINUS_EXPR with iv0/iv1->base, for this, we may still need to use "TYPE_MAX/MIN_VALUE (type)", right? > >> + tree high = max, low = min, one = build_int_cst (niter_type, 1); >> + tree step; >> + >> + /* n < {base, C}. */ >> + if (integer_zerop (iv0->step) && TREE_CODE (iv1->step) == >> INTEGER_CST >> + && !tree_int_cst_sign_bit (iv1->step)) >> + { >> + step = iv1->step; >> + niter->niter = fold_build2 (MINUS_EXPR, niter_type, max, >> iv1->base); >> + if (TREE_CODE (iv1->base) == INTEGER_CST) >> + low = fold_build2 (MINUS_EXPR, type, iv1->base, one); >> + else if (TREE_CODE (iv0->base) == INTEGER_CST) >> + low = iv0->base; >> + } >> + /* {base, -C} < n. */ >> + else if (TREE_CODE (iv0->step) == INTEGER_CST >> + && tree_int_cst_sign_bit (iv0->step) && integer_zerop >> (iv1->step)) >> + { >> + step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv0->step), >> iv0->step); >> + niter->niter = fold_build2 (MINUS_EXPR, niter_type, iv0->base, >> min); >> + if (TREE_CODE (iv0->base) == INTEGER_CST) >> + high = fold_build2 (PLUS_EXPR, type, iv0->base, one); >> + else if (TREE_CODE (iv1->base) == INTEGER_CST) >> + high = iv1->base; >> + } >> + else >> + return false; >> + >> + /* (delta + step - 1) / step */ >> + step = fold_convert (niter_type, step); >> + niter->niter = fold_convert (niter_type, niter->niter); >> + niter->niter = fold_build2 (PLUS_EXPR, niter_type, niter->niter, >> step); >> + niter->niter = fold_build2 (FLOOR_DIV_EXPR, niter_type, >> niter->niter, step); >> + >> + tree m = fold_build2 (MINUS_EXPR, niter_type, high, low); >> + m = fold_convert (niter_type, m); >> + mpz_t mstep, tmp, mmax; >> + mpz_init (mstep); >> + mpz_init (tmp); >> + mpz_init (mmax); >> + wi::to_mpz (wi::to_wide (step), mstep, UNSIGNED); >> + wi::to_mpz (wi::to_wide (m), mmax, UNSIGNED); >> + mpz_add (tmp, mmax, mstep); >> + mpz_sub_ui (tmp, tmp, 1); >> + mpz_fdiv_q (tmp, tmp, mstep); >> + niter->max = widest_int::from (wi::from_mpz (niter_type, tmp, >> false), >> + TYPE_SIGN (niter_type)); >> + mpz_clear (mstep); >> + mpz_clear (tmp); > > You forget to clear mmax? I wonder why you need to involve GMP here Oh, thanks for pointing out this :) need to clear mmax. > at all since this is just add/sub/divide - you could use > widest_int directly I think. At least the back-and-forth between > trees, wide-int and mpz looks odd. I once use 'trees' to do the calculation, while I encounter the issue that the may not hold the precision for some cases: "mmax" is MAX(1<<64-1); step is 4, then mmax + 4 becomes 3. I would check to use widest_int (e.g. long[4]) which seems enough to hold all types. BR, Jiufu Guo. > > Richard. > >> + niter->may_be_zero >> + = fold_build2 (LE_EXPR, boolean_type_node, iv1->base, iv0->base); >> + >> + niter->control.no_overflow = false; >> + >> + return true; >> +} >> + >> /* Determines number of iterations of loop whose ending condition >> is IV0 < IV1. TYPE is the type of the iv. The number of >> iterations is stored to NITER. BNDS bounds the difference >> @@ -1501,6 +1581,11 @@ number_of_iterations_lt (class loop *loop, tree >> type, affine_iv *iv0, >> niter->bound = iv0->base; >> } >> >> + /* {base, -C} < n, or n < {base, C} */ >> + if (tree_int_cst_sign_bit (iv0->step) >> + || (!integer_zerop (iv1->step) && !tree_int_cst_sign_bit >> (iv1->step))) >> + return number_of_iterations_until_wrap (loop, type, iv0, iv1, >> niter); >> + >> delta = fold_build2 (MINUS_EXPR, niter_type, >> fold_convert (niter_type, iv1->base), >> fold_convert (niter_type, iv0->base)); >> @@ -1665,62 +1750,6 @@ dump_affine_iv (FILE *file, affine_iv *iv) >> } >> } >> >> -/* Given exit condition IV0 CODE IV1 in TYPE, this function adjusts >> - the condition for loop-until-wrap cases. For example: >> - (unsigned){8, -1}_loop < 10 => {0, 1} != 9 >> - 10 < (unsigned){0, max - 7}_loop => {0, 1} != 8 >> - Return true if condition is successfully adjusted. */ >> - >> -static bool >> -adjust_cond_for_loop_until_wrap (tree type, affine_iv *iv0, tree_code >> *code, >> - affine_iv *iv1) >> -{ >> - /* Only support simple cases for the moment. */ >> - if (TREE_CODE (iv0->base) != INTEGER_CST >> - || TREE_CODE (iv1->base) != INTEGER_CST) >> - return false; >> - >> - tree niter_type = unsigned_type_for (type), high, low; >> - /* Case: i-- < 10. */ >> - if (integer_zerop (iv1->step)) >> - { >> - /* TODO: Should handle case in which abs(step) != 1. */ >> - if (!integer_minus_onep (iv0->step)) >> - return false; >> - /* Give up on infinite loop. */ >> - if (*code == LE_EXPR >> - && tree_int_cst_equal (iv1->base, TYPE_MAX_VALUE (type))) >> - return false; >> - high = fold_build2 (PLUS_EXPR, niter_type, >> - fold_convert (niter_type, iv0->base), >> - build_int_cst (niter_type, 1)); >> - low = fold_convert (niter_type, TYPE_MIN_VALUE (type)); >> - } >> - else if (integer_zerop (iv0->step)) >> - { >> - /* TODO: Should handle case in which abs(step) != 1. */ >> - if (!integer_onep (iv1->step)) >> - return false; >> - /* Give up on infinite loop. */ >> - if (*code == LE_EXPR >> - && tree_int_cst_equal (iv0->base, TYPE_MIN_VALUE (type))) >> - return false; >> - high = fold_convert (niter_type, TYPE_MAX_VALUE (type)); >> - low = fold_build2 (MINUS_EXPR, niter_type, >> - fold_convert (niter_type, iv1->base), >> - build_int_cst (niter_type, 1)); >> - } >> - else >> - gcc_unreachable (); >> - >> - iv0->base = low; >> - iv0->step = fold_convert (niter_type, integer_one_node); >> - iv1->base = high; >> - iv1->step = build_int_cst (niter_type, 0); >> - *code = NE_EXPR; >> - return true; >> -} >> - >> /* Determine the number of iterations according to condition (for >> staying >> inside loop) which compares two induction variables using >> comparison >> operator CODE. The induction variable on left side of the >> comparison >> @@ -1855,15 +1884,6 @@ number_of_iterations_cond (class loop *loop, >> return true; >> } >> >> - /* Handle special case loops: while (i-- < 10) and while (10 < i++) >> by >> - adjusting iv0, iv1 and code. */ >> - if (code != NE_EXPR >> - && (tree_int_cst_sign_bit (iv0->step) >> - || (!integer_zerop (iv1->step) >> - && !tree_int_cst_sign_bit (iv1->step))) >> - && !adjust_cond_for_loop_until_wrap (type, iv0, &code, iv1)) >> - return false; >> - >> /* OK, now we know we have a senseful loop. Handle several cases, >> depending >> on what comparison operator is used. */ >> bound_difference (loop, iv1->base, iv0->base, &bnds); >> ^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] Analyze niter for until-wrap condition [PR101145] 2021-07-01 14:12 ` guojiufu @ 2021-07-02 0:51 ` Bin.Cheng 2021-07-02 4:06 ` guojiufu 0 siblings, 1 reply; 8+ messages in thread From: Bin.Cheng @ 2021-07-02 0:51 UTC (permalink / raw) To: guojiufu Cc: Richard Biener, Segher Boessenkool, amker, gcc-patches List, Bill Schmidt, jlaw, dje.gcc On Thu, Jul 1, 2021 at 10:15 PM guojiufu via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > On 2021-07-01 20:35, Richard Biener wrote: > > On Thu, 1 Jul 2021, Jiufu Guo wrote: > > > >> For code like: > >> unsigned foo(unsigned val, unsigned start) > >> { > >> unsigned cnt = 0; > >> for (unsigned i = start; i > val; ++i) > >> cnt++; > >> return cnt; > >> } > >> > >> The number of iterations should be about UINT_MAX - start. > > > > For > > > > unsigned foo(unsigned val, unsigned start) > > { > > unsigned cnt = 0; > > for (unsigned i = start; i >= val; ++i) > > cnt++; > > return cnt; > > } > > > > and val == 0 the loop never terminates. I don't see anywhere > > in the patch that you disregard GE_EXPR and I remember > > the code handles GE as well as GT? From a quick look this is > > also not covered by a testcase you add - not exactly sure > > how it would materialize in a miscompilation. > > In number_of_iterations_cond, there is code: > if (code == GE_EXPR || code == GT_EXPR > || (code == NE_EXPR && integer_zerop (iv0->step))) > { > std::swap (iv0, iv1); > code = swap_tree_comparison (code); > } > It converts "GT/GE" (i >= val) to "LT/LE" (val <= i), > and LE (val <= i) is converted to LT (val - 1 < i). > So, the code is added to number_of_iterations_lt. > > But, this patch leads mis-compilation for unsigned "i >= val" as > above transforms: converting LE (val <= i) to LT (val - 1 < i) > seems not appropriate (e.g where val=0). I don't know where the exact code is, but IIRC, number_of_iteration handles boundary conditions when transforming <= into <. You may check it out. > Thanks for pointing out this!!! > > I would investigate a way to handle this correctly. > A possible way maybe just to return false for this kind of LE. IIRC, it checks the boundary conditions, either returns false or simply introduces more assumptions. > > Any suggestions? > > > > >> There is function adjust_cond_for_loop_until_wrap which > >> handles similar work for const bases. > >> Like adjust_cond_for_loop_until_wrap, this patch enhance > >> function number_of_iterations_cond/number_of_iterations_lt > >> to analyze number of iterations for this kind of loop. > >> > >> Bootstrap and regtest pass on powerpc64le, is this ok for trunk? > >> > >> gcc/ChangeLog: > >> > >> PR tree-optimization/101145 > >> * tree-ssa-loop-niter.c > >> (number_of_iterations_until_wrap): New function. > >> (number_of_iterations_lt): Invoke above function. > >> (adjust_cond_for_loop_until_wrap): > >> Merge to number_of_iterations_until_wrap. > >> (number_of_iterations_cond): Update invokes for > >> adjust_cond_for_loop_until_wrap and number_of_iterations_lt. > >> > >> gcc/testsuite/ChangeLog: > >> > >> PR tree-optimization/101145 > >> * gcc.dg/vect/pr101145.c: New test. > >> * gcc.dg/vect/pr101145.inc: New test. > >> * gcc.dg/vect/pr101145_1.c: New test. > >> * gcc.dg/vect/pr101145_2.c: New test. > >> * gcc.dg/vect/pr101145_3.c: New test. > >> --- > >> gcc/testsuite/gcc.dg/vect/pr101145.c | 187 > >> +++++++++++++++++++++++++ > >> gcc/testsuite/gcc.dg/vect/pr101145.inc | 63 +++++++++ > >> gcc/testsuite/gcc.dg/vect/pr101145_1.c | 15 ++ > >> gcc/testsuite/gcc.dg/vect/pr101145_2.c | 15 ++ > >> gcc/testsuite/gcc.dg/vect/pr101145_3.c | 15 ++ > >> gcc/tree-ssa-loop-niter.c | 150 +++++++++++--------- > >> 6 files changed, 380 insertions(+), 65 deletions(-) > >> create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145.c > >> create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145.inc > >> create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145_1.c > >> create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145_2.c > >> create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145_3.c > >> > >> diff --git a/gcc/testsuite/gcc.dg/vect/pr101145.c > >> b/gcc/testsuite/gcc.dg/vect/pr101145.c > >> new file mode 100644 > >> index 00000000000..74031b031cf > >> --- /dev/null > >> +++ b/gcc/testsuite/gcc.dg/vect/pr101145.c > >> @@ -0,0 +1,187 @@ > >> +/* { dg-require-effective-target vect_int } */ > >> +/* { dg-options "-O3 -fdump-tree-vect-details" } */ > >> +#include <limits.h> > >> + > >> +unsigned __attribute__ ((noinline)) > >> +foo (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned > >> n) > >> +{ > >> + while (n < ++l) > >> + *a++ = *b++ + 1; > >> + return l; > >> +} > >> + > >> +unsigned __attribute__ ((noinline)) > >> +foo_1 (int *__restrict__ a, int *__restrict__ b, unsigned l, > >> unsigned) > >> +{ > >> + while (UINT_MAX - 64 < ++l) > >> + *a++ = *b++ + 1; > >> + return l; > >> +} > >> + > >> +unsigned __attribute__ ((noinline)) > >> +foo_2 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned > >> n) > >> +{ > >> + l = UINT_MAX - 32; > >> + while (n < ++l) > >> + *a++ = *b++ + 1; > >> + return l; > >> +} > >> + > >> +unsigned __attribute__ ((noinline)) > >> +foo_3 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned > >> n) > >> +{ > >> + while (n <= ++l) > >> + *a++ = *b++ + 1; > >> + return l; > >> +} > >> + > >> +unsigned __attribute__ ((noinline)) > >> +foo_4 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned > >> n) > >> +{ // infininate > >> + while (0 <= ++l) > >> + *a++ = *b++ + 1; > >> + return l; > >> +} > >> + > >> +unsigned __attribute__ ((noinline)) > >> +foo_5 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned > >> n) > >> +{ > >> + //no loop > >> + l = UINT_MAX; > >> + while (n < ++l) > >> + *a++ = *b++ + 1; > >> + return l; > >> +} > >> + > >> +unsigned __attribute__ ((noinline)) > >> +bar (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned > >> n) > >> +{ > >> + while (--l < n) > >> + *a++ = *b++ + 1; > >> + return l; > >> +} > >> + > >> +unsigned __attribute__ ((noinline)) > >> +bar_1 (int *__restrict__ a, int *__restrict__ b, unsigned l, > >> unsigned) > >> +{ > >> + while (--l < 64) > >> + *a++ = *b++ + 1; > >> + return l; > >> +} > >> + > >> +unsigned __attribute__ ((noinline)) > >> +bar_2 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned > >> n) > >> +{ > >> + l = 32; > >> + while (--l < n) > >> + *a++ = *b++ + 1; > >> + return l; > >> +} > >> + > >> + > >> +int a[3200], b[3200]; > >> +int fail; > >> + > >> +int > >> +main () > >> +{ > >> + unsigned l, n; > >> + unsigned res; > >> + /* l > n*/ > >> + n = UINT_MAX - 64; > >> + l = n + 32; > >> + res = foo (a, b, l, n); > >> + if (res != 0) > >> + fail++; > >> + > >> + l = n; > >> + res = foo (a, b, l, n); > >> + if (res != 0) > >> + fail++; > >> + > >> + l = n - 1; > >> + res = foo (a, b, l, n); > >> + if (res != l + 1) > >> + fail++; > >> + > >> + l = n - 32; > >> + res = foo (a, b, l, n); > >> + if (res != l + 1) > >> + fail++; > >> + > >> + l = UINT_MAX; > >> + res = foo (a, b, l, n); > >> + if (res != 0) > >> + fail++; > >> + > >> + l = n + 32; > >> + res = foo_1 (a, b, l, n); > >> + if (res != 0) > >> + fail++; > >> + > >> + l = n + 32; > >> + res = foo_2 (a, b, l, n); > >> + if (res != 0) > >> + fail++; > >> + > >> + l = n; > >> + res = foo_3 (a, b, l, n); > >> + if (res != 0) > >> + fail++; > >> + > >> + l = n - 1; > >> + res = foo_3 (a, b, l, n); > >> + if (res != 0) > >> + fail++; > >> + > >> + l = n - 2; > >> + res = foo_3 (a, b, l, n); > >> + if (res != l + 1) > >> + fail++; > >> + > >> + res = foo_5 (a, b, l, n); > >> + if (res != 0) > >> + fail++; > >> + > >> + n = 64; > >> + l = n - 32; > >> + res = bar (a, b, l, n); > >> + res++; > >> + if (res != 0) > >> + fail++; > >> + > >> + l = n; > >> + res = bar (a, b, l, n); > >> + res++; > >> + if (res != 0) > >> + fail++; > >> + > >> + l = n + 1; > >> + res = bar (a, b, l, n); > >> + res++; > >> + if (res != l) > >> + fail++; > >> + > >> + l = 0; > >> + res = bar (a, b, l, n); > >> + res++; > >> + if (res != l) > >> + fail++; > >> + > >> + l = 32; > >> + res = bar_1 (a, b, l, n); > >> + res++; > >> + if (res != 0) > >> + fail++; > >> + > >> + res = bar_1 (a, b, l, n); > >> + res++; > >> + if (res != 0) > >> + fail++; > >> + > >> + if (fail) > >> + __builtin_abort (); > >> + return 0; > >> +} > >> + > >> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 7 "vect" } > >> } */ > >> diff --git a/gcc/testsuite/gcc.dg/vect/pr101145.inc > >> b/gcc/testsuite/gcc.dg/vect/pr101145.inc > >> new file mode 100644 > >> index 00000000000..6eed3fa8aca > >> --- /dev/null > >> +++ b/gcc/testsuite/gcc.dg/vect/pr101145.inc > >> @@ -0,0 +1,63 @@ > >> +TYPE __attribute__ ((noinline)) > >> +foo_sign (int *__restrict__ a, int *__restrict__ b, TYPE l, TYPE n) > >> +{ > >> + for (l = L_BASE; n < l; l += C) > >> + *a++ = *b++ + 1; > >> + return l; > >> +} > >> + > >> +TYPE __attribute__ ((noinline)) > >> +bar_sign (int *__restrict__ a, int *__restrict__ b, TYPE l, TYPE n) > >> +{ > >> + for (l = L_BASE_DOWN; l < n; l -= C) > >> + *a++ = *b++ + 1; > >> + return l; > >> +} > >> + > >> +int __attribute__ ((noinline)) neq (int a, int b) { return a != b; } > >> + > >> +int a[1000], b[1000]; > >> +int fail; > >> + > >> +int > >> +main () > >> +{ > >> + TYPE res; > >> + TYPE l; > >> + TYPE n; > >> + n = N_BASE; > >> + l = n - C; > >> + res = foo_sign (a, b, l, n); > >> + if (res != l) > >> + fail++; > >> + > >> + l = n; > >> + res = foo_sign (a, b, l, n); > >> + if (res != l) > >> + fail++; > >> + > >> + l = n + C; > >> + res = foo_sign (a, b, l, n); > >> + if (neq ((res - MIN) / C, 0)) > >> + fail++; > >> + > >> + n = N_BASE_DOWN; > >> + l = n - C; > >> + res = bar_sign (a, b, l, n); > >> + if (neq ((MAX - res) / C, 0)) > >> + fail++; > >> + > >> + l = n; > >> + res = bar_sign (a, b, l, n); > >> + if (res != l) > >> + fail++; > >> + > >> + l = n + C; > >> + res = bar_sign (a, b, l, n); > >> + if (res != l) > >> + fail++; > >> + > >> + if (fail) > >> + __builtin_abort (); > >> + return 0; > >> +} > >> diff --git a/gcc/testsuite/gcc.dg/vect/pr101145_1.c > >> b/gcc/testsuite/gcc.dg/vect/pr101145_1.c > >> new file mode 100644 > >> index 00000000000..94f6b99b893 > >> --- /dev/null > >> +++ b/gcc/testsuite/gcc.dg/vect/pr101145_1.c > >> @@ -0,0 +1,15 @@ > >> +/* { dg-require-effective-target vect_int } */ > >> +/* { dg-options "-O3 -fdump-tree-vect-details" } */ > >> +#define TYPE signed char > >> +#define MIN -128 > >> +#define MAX 127 > >> +#define N_BASE (MAX - 32) > >> +#define N_BASE_DOWN (MIN + 32) > >> + > >> +#define C 3 > >> +#define L_BASE l > >> +#define L_BASE_DOWN l > >> + > >> +#include "pr101145.inc" > >> + > >> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } > >> } */ > >> diff --git a/gcc/testsuite/gcc.dg/vect/pr101145_2.c > >> b/gcc/testsuite/gcc.dg/vect/pr101145_2.c > >> new file mode 100644 > >> index 00000000000..d3cfc9e01e1 > >> --- /dev/null > >> +++ b/gcc/testsuite/gcc.dg/vect/pr101145_2.c > >> @@ -0,0 +1,15 @@ > >> +/* { dg-require-effective-target vect_int } */ > >> +/* { dg-options "-O3 -fdump-tree-vect-details" } */ > >> +#define TYPE unsigned char > >> +#define MIN 0 > >> +#define MAX 255 > >> +#define N_BASE (MAX - 32 + 1) > >> +#define N_BASE_DOWN (MIN + 32) > >> + > >> +#define C 2 > >> +#define L_BASE l > >> +#define L_BASE_DOWN l > >> + > >> +#include "pr101145.inc" > >> + > >> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } > >> } */ > >> diff --git a/gcc/testsuite/gcc.dg/vect/pr101145_3.c > >> b/gcc/testsuite/gcc.dg/vect/pr101145_3.c > >> new file mode 100644 > >> index 00000000000..e2cf9b1f7e6 > >> --- /dev/null > >> +++ b/gcc/testsuite/gcc.dg/vect/pr101145_3.c > >> @@ -0,0 +1,15 @@ > >> +/* { dg-require-effective-target vect_int } */ > >> +/* { dg-options "-O3 -fdump-tree-vect-details" } */ > >> +#define TYPE int * > >> +#define MIN ((TYPE)0) > >> +#define MAX ((TYPE)((long long)-1)) > >> +#define N_BASE ((TYPE) (-32)) > >> +#define N_BASE_DOWN (MIN + 32) > >> + > >> +#define C 1 > >> +#define L_BASE l > >> +#define L_BASE_DOWN l > >> + > >> +#include "pr101145.inc" > >> + > >> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } > >> } */ > >> diff --git a/gcc/tree-ssa-loop-niter.c b/gcc/tree-ssa-loop-niter.c > >> index b5add827018..06db6a36ef8 100644 > >> --- a/gcc/tree-ssa-loop-niter.c > >> +++ b/gcc/tree-ssa-loop-niter.c > >> @@ -1473,6 +1473,86 @@ assert_loop_rolls_lt (tree type, affine_iv > >> *iv0, affine_iv *iv1, > >> } > >> } > >> > >> +/* Determines number of iterations of loop whose ending condition > >> + is IV0 < IV1 which likes: {base, -C} < n, or n < {base, C}. > >> + The number of iterations is stored to NITER. */ > >> + > >> +static bool > >> +number_of_iterations_until_wrap (class loop *, tree type, affine_iv > >> *iv0, > >> + affine_iv *iv1, class tree_niter_desc *niter) > >> +{ > >> + tree niter_type = unsigned_type_for (type); > >> + tree max, min; > >> + > >> + if (POINTER_TYPE_P (type)) > >> + { > >> + max = fold_convert (type, TYPE_MAX_VALUE (niter_type)); > >> + min = fold_convert (type, TYPE_MIN_VALUE (niter_type)); > >> + } > >> + else > >> + { > >> + max = TYPE_MAX_VALUE (type); > >> + min = TYPE_MIN_VALUE (type); > >> + } > > > > It would be cheaper to use wide_int here - wi::min/max_value > > (TYPE_PRECISION (...), TYPE_SIGN (...)) and in the simplification > > code below. > > Thanks! I think you mean to use wide_int for "tree high = max, low = > min,". > I would check and update the patch to use wide_int. > > The MAX/MIN would be in type 'tree' when binding to MINUS_EXPR with > iv0/iv1->base, > for this, we may still need to use "TYPE_MAX/MIN_VALUE (type)", right? > > > > >> + tree high = max, low = min, one = build_int_cst (niter_type, 1); > >> + tree step; > >> + > >> + /* n < {base, C}. */ > >> + if (integer_zerop (iv0->step) && TREE_CODE (iv1->step) == > >> INTEGER_CST > >> + && !tree_int_cst_sign_bit (iv1->step)) > >> + { > >> + step = iv1->step; > >> + niter->niter = fold_build2 (MINUS_EXPR, niter_type, max, > >> iv1->base); > >> + if (TREE_CODE (iv1->base) == INTEGER_CST) > >> + low = fold_build2 (MINUS_EXPR, type, iv1->base, one); > >> + else if (TREE_CODE (iv0->base) == INTEGER_CST) > >> + low = iv0->base; > >> + } > >> + /* {base, -C} < n. */ > >> + else if (TREE_CODE (iv0->step) == INTEGER_CST > >> + && tree_int_cst_sign_bit (iv0->step) && integer_zerop > >> (iv1->step)) > >> + { > >> + step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv0->step), > >> iv0->step); > >> + niter->niter = fold_build2 (MINUS_EXPR, niter_type, iv0->base, > >> min); > >> + if (TREE_CODE (iv0->base) == INTEGER_CST) > >> + high = fold_build2 (PLUS_EXPR, type, iv0->base, one); > >> + else if (TREE_CODE (iv1->base) == INTEGER_CST) > >> + high = iv1->base; > >> + } > >> + else > >> + return false; > >> + > >> + /* (delta + step - 1) / step */ > >> + step = fold_convert (niter_type, step); > >> + niter->niter = fold_convert (niter_type, niter->niter); > >> + niter->niter = fold_build2 (PLUS_EXPR, niter_type, niter->niter, > >> step); > >> + niter->niter = fold_build2 (FLOOR_DIV_EXPR, niter_type, > >> niter->niter, step); > >> + > >> + tree m = fold_build2 (MINUS_EXPR, niter_type, high, low); > >> + m = fold_convert (niter_type, m); > >> + mpz_t mstep, tmp, mmax; > >> + mpz_init (mstep); > >> + mpz_init (tmp); > >> + mpz_init (mmax); > >> + wi::to_mpz (wi::to_wide (step), mstep, UNSIGNED); > >> + wi::to_mpz (wi::to_wide (m), mmax, UNSIGNED); > >> + mpz_add (tmp, mmax, mstep); > >> + mpz_sub_ui (tmp, tmp, 1); > >> + mpz_fdiv_q (tmp, tmp, mstep); > >> + niter->max = widest_int::from (wi::from_mpz (niter_type, tmp, > >> false), > >> + TYPE_SIGN (niter_type)); > >> + mpz_clear (mstep); > >> + mpz_clear (tmp); > > > > You forget to clear mmax? I wonder why you need to involve GMP here > Oh, thanks for pointing out this :) need to clear mmax. > > > at all since this is just add/sub/divide - you could use > > widest_int directly I think. At least the back-and-forth between > > trees, wide-int and mpz looks odd. > I once use 'trees' to do the calculation, while I encounter the issue > that the may not hold the precision for some cases: > "mmax" is MAX(1<<64-1); step is 4, then mmax + 4 becomes 3. > > I would check to use widest_int (e.g. long[4]) which seems enough to > hold all types. > > > BR, > Jiufu Guo. > > > > > Richard. > > > >> + niter->may_be_zero > >> + = fold_build2 (LE_EXPR, boolean_type_node, iv1->base, iv0->base); > >> + > >> + niter->control.no_overflow = false; > >> + > >> + return true; > >> +} > >> + > >> /* Determines number of iterations of loop whose ending condition > >> is IV0 < IV1. TYPE is the type of the iv. The number of > >> iterations is stored to NITER. BNDS bounds the difference > >> @@ -1501,6 +1581,11 @@ number_of_iterations_lt (class loop *loop, tree > >> type, affine_iv *iv0, > >> niter->bound = iv0->base; > >> } > >> > >> + /* {base, -C} < n, or n < {base, C} */ > >> + if (tree_int_cst_sign_bit (iv0->step) > >> + || (!integer_zerop (iv1->step) && !tree_int_cst_sign_bit > >> (iv1->step))) > >> + return number_of_iterations_until_wrap (loop, type, iv0, iv1, > >> niter); > >> + > >> delta = fold_build2 (MINUS_EXPR, niter_type, > >> fold_convert (niter_type, iv1->base), > >> fold_convert (niter_type, iv0->base)); > >> @@ -1665,62 +1750,6 @@ dump_affine_iv (FILE *file, affine_iv *iv) > >> } > >> } > >> > >> -/* Given exit condition IV0 CODE IV1 in TYPE, this function adjusts > >> - the condition for loop-until-wrap cases. For example: > >> - (unsigned){8, -1}_loop < 10 => {0, 1} != 9 > >> - 10 < (unsigned){0, max - 7}_loop => {0, 1} != 8 > >> - Return true if condition is successfully adjusted. */ > >> - > >> -static bool > >> -adjust_cond_for_loop_until_wrap (tree type, affine_iv *iv0, tree_code > >> *code, > >> - affine_iv *iv1) > >> -{ > >> - /* Only support simple cases for the moment. */ > >> - if (TREE_CODE (iv0->base) != INTEGER_CST > >> - || TREE_CODE (iv1->base) != INTEGER_CST) > >> - return false; > >> - > >> - tree niter_type = unsigned_type_for (type), high, low; > >> - /* Case: i-- < 10. */ > >> - if (integer_zerop (iv1->step)) > >> - { > >> - /* TODO: Should handle case in which abs(step) != 1. */ > >> - if (!integer_minus_onep (iv0->step)) > >> - return false; > >> - /* Give up on infinite loop. */ > >> - if (*code == LE_EXPR > >> - && tree_int_cst_equal (iv1->base, TYPE_MAX_VALUE (type))) > >> - return false; > >> - high = fold_build2 (PLUS_EXPR, niter_type, > >> - fold_convert (niter_type, iv0->base), > >> - build_int_cst (niter_type, 1)); > >> - low = fold_convert (niter_type, TYPE_MIN_VALUE (type)); > >> - } > >> - else if (integer_zerop (iv0->step)) > >> - { > >> - /* TODO: Should handle case in which abs(step) != 1. */ > >> - if (!integer_onep (iv1->step)) > >> - return false; > >> - /* Give up on infinite loop. */ > >> - if (*code == LE_EXPR > >> - && tree_int_cst_equal (iv0->base, TYPE_MIN_VALUE (type))) > >> - return false; > >> - high = fold_convert (niter_type, TYPE_MAX_VALUE (type)); > >> - low = fold_build2 (MINUS_EXPR, niter_type, > >> - fold_convert (niter_type, iv1->base), > >> - build_int_cst (niter_type, 1)); > >> - } > >> - else > >> - gcc_unreachable (); > >> - > >> - iv0->base = low; > >> - iv0->step = fold_convert (niter_type, integer_one_node); > >> - iv1->base = high; > >> - iv1->step = build_int_cst (niter_type, 0); > >> - *code = NE_EXPR; > >> - return true; > >> -} > >> - > >> /* Determine the number of iterations according to condition (for > >> staying > >> inside loop) which compares two induction variables using > >> comparison > >> operator CODE. The induction variable on left side of the > >> comparison > >> @@ -1855,15 +1884,6 @@ number_of_iterations_cond (class loop *loop, > >> return true; > >> } > >> > >> - /* Handle special case loops: while (i-- < 10) and while (10 < i++) > >> by > >> - adjusting iv0, iv1 and code. */ > >> - if (code != NE_EXPR > >> - && (tree_int_cst_sign_bit (iv0->step) > >> - || (!integer_zerop (iv1->step) > >> - && !tree_int_cst_sign_bit (iv1->step))) > >> - && !adjust_cond_for_loop_until_wrap (type, iv0, &code, iv1)) > >> - return false; > >> - > >> /* OK, now we know we have a senseful loop. Handle several cases, > >> depending > >> on what comparison operator is used. */ > >> bound_difference (loop, iv1->base, iv0->base, &bnds); > >> ^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] Analyze niter for until-wrap condition [PR101145] 2021-07-02 0:51 ` Bin.Cheng @ 2021-07-02 4:06 ` guojiufu 0 siblings, 0 replies; 8+ messages in thread From: guojiufu @ 2021-07-02 4:06 UTC (permalink / raw) To: Bin.Cheng Cc: Richard Biener, Segher Boessenkool, amker, gcc-patches List, Bill Schmidt, jlaw, dje.gcc On 2021-07-02 08:51, Bin.Cheng wrote: > On Thu, Jul 1, 2021 at 10:15 PM guojiufu via Gcc-patches > <gcc-patches@gcc.gnu.org> wrote: >> >> On 2021-07-01 20:35, Richard Biener wrote: >> > On Thu, 1 Jul 2021, Jiufu Guo wrote: >> > >> >> For code like: >> >> unsigned foo(unsigned val, unsigned start) >> >> { >> >> unsigned cnt = 0; >> >> for (unsigned i = start; i > val; ++i) >> >> cnt++; >> >> return cnt; >> >> } >> >> >> >> The number of iterations should be about UINT_MAX - start. >> > >> > For >> > >> > unsigned foo(unsigned val, unsigned start) >> > { >> > unsigned cnt = 0; >> > for (unsigned i = start; i >= val; ++i) >> > cnt++; >> > return cnt; >> > } >> > >> > and val == 0 the loop never terminates. I don't see anywhere >> > in the patch that you disregard GE_EXPR and I remember >> > the code handles GE as well as GT? From a quick look this is >> > also not covered by a testcase you add - not exactly sure >> > how it would materialize in a miscompilation. >> >> In number_of_iterations_cond, there is code: >> if (code == GE_EXPR || code == GT_EXPR >> || (code == NE_EXPR && integer_zerop (iv0->step))) >> { >> std::swap (iv0, iv1); >> code = swap_tree_comparison (code); >> } >> It converts "GT/GE" (i >= val) to "LT/LE" (val <= i), >> and LE (val <= i) is converted to LT (val - 1 < i). >> So, the code is added to number_of_iterations_lt. >> >> But, this patch leads mis-compilation for unsigned "i >= val" as >> above transforms: converting LE (val <= i) to LT (val - 1 < i) >> seems not appropriate (e.g where val=0). > I don't know where the exact code is, but IIRC, number_of_iteration > handles boundary conditions when transforming <= into <. You may > check it out. Yes, in number_of_iterations_le, there is code to check MAX/MIN if (integer_nonzerop (iv0->step)) assumption = fold_build2 (NE_EXPR, boolean_type_node, iv1->base, TYPE_MAX_VALUE (type)); else assumption = fold_build2 (NE_EXPR, boolean_type_node, iv0->base, TYPE_MIN_VALUE (type)); Checking why this code does not help. > >> Thanks for pointing out this!!! >> >> I would investigate a way to handle this correctly. >> A possible way maybe just to return false for this kind of LE. > IIRC, it checks the boundary conditions, either returns false or > simply introduces more assumptions. Thanks! Adding more assumptions would help. The below code also runs into infinite, more assumptions may help this code. __attribute__ ((noinline)) unsigned foo(unsigned val, unsigned start) { unsigned cnt = 0; for (unsigned i = start; val <= i; i+=16) cnt++; return cnt; } foo (4, 8); Thanks again! BR, Jiufu Guo >> >> Any suggestions? >> >> > >> >> There is function adjust_cond_for_loop_until_wrap which >> >> handles similar work for const bases. >> >> Like adjust_cond_for_loop_until_wrap, this patch enhance >> >> function number_of_iterations_cond/number_of_iterations_lt >> >> to analyze number of iterations for this kind of loop. >> >> >> >> Bootstrap and regtest pass on powerpc64le, is this ok for trunk? >> >> >> >> gcc/ChangeLog: >> >> >> >> PR tree-optimization/101145 >> >> * tree-ssa-loop-niter.c >> >> (number_of_iterations_until_wrap): New function. >> >> (number_of_iterations_lt): Invoke above function. >> >> (adjust_cond_for_loop_until_wrap): >> >> Merge to number_of_iterations_until_wrap. >> >> (number_of_iterations_cond): Update invokes for >> >> adjust_cond_for_loop_until_wrap and number_of_iterations_lt. >> >> >> >> gcc/testsuite/ChangeLog: >> >> >> >> PR tree-optimization/101145 >> >> * gcc.dg/vect/pr101145.c: New test. >> >> * gcc.dg/vect/pr101145.inc: New test. >> >> * gcc.dg/vect/pr101145_1.c: New test. >> >> * gcc.dg/vect/pr101145_2.c: New test. >> >> * gcc.dg/vect/pr101145_3.c: New test. >> >> --- >> >> gcc/testsuite/gcc.dg/vect/pr101145.c | 187 >> >> +++++++++++++++++++++++++ >> >> gcc/testsuite/gcc.dg/vect/pr101145.inc | 63 +++++++++ >> >> gcc/testsuite/gcc.dg/vect/pr101145_1.c | 15 ++ >> >> gcc/testsuite/gcc.dg/vect/pr101145_2.c | 15 ++ >> >> gcc/testsuite/gcc.dg/vect/pr101145_3.c | 15 ++ >> >> gcc/tree-ssa-loop-niter.c | 150 +++++++++++--------- >> >> 6 files changed, 380 insertions(+), 65 deletions(-) >> >> create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145.c >> >> create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145.inc >> >> create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145_1.c >> >> create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145_2.c >> >> create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145_3.c >> >> >> >> diff --git a/gcc/testsuite/gcc.dg/vect/pr101145.c >> >> b/gcc/testsuite/gcc.dg/vect/pr101145.c >> >> new file mode 100644 >> >> index 00000000000..74031b031cf >> >> --- /dev/null >> >> +++ b/gcc/testsuite/gcc.dg/vect/pr101145.c >> >> @@ -0,0 +1,187 @@ >> >> +/* { dg-require-effective-target vect_int } */ >> >> +/* { dg-options "-O3 -fdump-tree-vect-details" } */ >> >> +#include <limits.h> >> >> + >> >> +unsigned __attribute__ ((noinline)) >> >> +foo (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned >> >> n) >> >> +{ >> >> + while (n < ++l) >> >> + *a++ = *b++ + 1; >> >> + return l; >> >> +} >> >> + >> >> +unsigned __attribute__ ((noinline)) >> >> +foo_1 (int *__restrict__ a, int *__restrict__ b, unsigned l, >> >> unsigned) >> >> +{ >> >> + while (UINT_MAX - 64 < ++l) >> >> + *a++ = *b++ + 1; >> >> + return l; >> >> +} >> >> + >> >> +unsigned __attribute__ ((noinline)) >> >> +foo_2 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned >> >> n) >> >> +{ >> >> + l = UINT_MAX - 32; >> >> + while (n < ++l) >> >> + *a++ = *b++ + 1; >> >> + return l; >> >> +} >> >> + >> >> +unsigned __attribute__ ((noinline)) >> >> +foo_3 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned >> >> n) >> >> +{ >> >> + while (n <= ++l) >> >> + *a++ = *b++ + 1; >> >> + return l; >> >> +} >> >> + >> >> +unsigned __attribute__ ((noinline)) >> >> +foo_4 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned >> >> n) >> >> +{ // infininate >> >> + while (0 <= ++l) >> >> + *a++ = *b++ + 1; >> >> + return l; >> >> +} >> >> + >> >> +unsigned __attribute__ ((noinline)) >> >> +foo_5 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned >> >> n) >> >> +{ >> >> + //no loop >> >> + l = UINT_MAX; >> >> + while (n < ++l) >> >> + *a++ = *b++ + 1; >> >> + return l; >> >> +} >> >> + >> >> +unsigned __attribute__ ((noinline)) >> >> +bar (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned >> >> n) >> >> +{ >> >> + while (--l < n) >> >> + *a++ = *b++ + 1; >> >> + return l; >> >> +} >> >> + >> >> +unsigned __attribute__ ((noinline)) >> >> +bar_1 (int *__restrict__ a, int *__restrict__ b, unsigned l, >> >> unsigned) >> >> +{ >> >> + while (--l < 64) >> >> + *a++ = *b++ + 1; >> >> + return l; >> >> +} >> >> + >> >> +unsigned __attribute__ ((noinline)) >> >> +bar_2 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned >> >> n) >> >> +{ >> >> + l = 32; >> >> + while (--l < n) >> >> + *a++ = *b++ + 1; >> >> + return l; >> >> +} >> >> + >> >> + >> >> +int a[3200], b[3200]; >> >> +int fail; >> >> + >> >> +int >> >> +main () >> >> +{ >> >> + unsigned l, n; >> >> + unsigned res; >> >> + /* l > n*/ >> >> + n = UINT_MAX - 64; >> >> + l = n + 32; >> >> + res = foo (a, b, l, n); >> >> + if (res != 0) >> >> + fail++; >> >> + >> >> + l = n; >> >> + res = foo (a, b, l, n); >> >> + if (res != 0) >> >> + fail++; >> >> + >> >> + l = n - 1; >> >> + res = foo (a, b, l, n); >> >> + if (res != l + 1) >> >> + fail++; >> >> + >> >> + l = n - 32; >> >> + res = foo (a, b, l, n); >> >> + if (res != l + 1) >> >> + fail++; >> >> + >> >> + l = UINT_MAX; >> >> + res = foo (a, b, l, n); >> >> + if (res != 0) >> >> + fail++; >> >> + >> >> + l = n + 32; >> >> + res = foo_1 (a, b, l, n); >> >> + if (res != 0) >> >> + fail++; >> >> + >> >> + l = n + 32; >> >> + res = foo_2 (a, b, l, n); >> >> + if (res != 0) >> >> + fail++; >> >> + >> >> + l = n; >> >> + res = foo_3 (a, b, l, n); >> >> + if (res != 0) >> >> + fail++; >> >> + >> >> + l = n - 1; >> >> + res = foo_3 (a, b, l, n); >> >> + if (res != 0) >> >> + fail++; >> >> + >> >> + l = n - 2; >> >> + res = foo_3 (a, b, l, n); >> >> + if (res != l + 1) >> >> + fail++; >> >> + >> >> + res = foo_5 (a, b, l, n); >> >> + if (res != 0) >> >> + fail++; >> >> + >> >> + n = 64; >> >> + l = n - 32; >> >> + res = bar (a, b, l, n); >> >> + res++; >> >> + if (res != 0) >> >> + fail++; >> >> + >> >> + l = n; >> >> + res = bar (a, b, l, n); >> >> + res++; >> >> + if (res != 0) >> >> + fail++; >> >> + >> >> + l = n + 1; >> >> + res = bar (a, b, l, n); >> >> + res++; >> >> + if (res != l) >> >> + fail++; >> >> + >> >> + l = 0; >> >> + res = bar (a, b, l, n); >> >> + res++; >> >> + if (res != l) >> >> + fail++; >> >> + >> >> + l = 32; >> >> + res = bar_1 (a, b, l, n); >> >> + res++; >> >> + if (res != 0) >> >> + fail++; >> >> + >> >> + res = bar_1 (a, b, l, n); >> >> + res++; >> >> + if (res != 0) >> >> + fail++; >> >> + >> >> + if (fail) >> >> + __builtin_abort (); >> >> + return 0; >> >> +} >> >> + >> >> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 7 "vect" } >> >> } */ >> >> diff --git a/gcc/testsuite/gcc.dg/vect/pr101145.inc >> >> b/gcc/testsuite/gcc.dg/vect/pr101145.inc >> >> new file mode 100644 >> >> index 00000000000..6eed3fa8aca >> >> --- /dev/null >> >> +++ b/gcc/testsuite/gcc.dg/vect/pr101145.inc >> >> @@ -0,0 +1,63 @@ >> >> +TYPE __attribute__ ((noinline)) >> >> +foo_sign (int *__restrict__ a, int *__restrict__ b, TYPE l, TYPE n) >> >> +{ >> >> + for (l = L_BASE; n < l; l += C) >> >> + *a++ = *b++ + 1; >> >> + return l; >> >> +} >> >> + >> >> +TYPE __attribute__ ((noinline)) >> >> +bar_sign (int *__restrict__ a, int *__restrict__ b, TYPE l, TYPE n) >> >> +{ >> >> + for (l = L_BASE_DOWN; l < n; l -= C) >> >> + *a++ = *b++ + 1; >> >> + return l; >> >> +} >> >> + >> >> +int __attribute__ ((noinline)) neq (int a, int b) { return a != b; } >> >> + >> >> +int a[1000], b[1000]; >> >> +int fail; >> >> + >> >> +int >> >> +main () >> >> +{ >> >> + TYPE res; >> >> + TYPE l; >> >> + TYPE n; >> >> + n = N_BASE; >> >> + l = n - C; >> >> + res = foo_sign (a, b, l, n); >> >> + if (res != l) >> >> + fail++; >> >> + >> >> + l = n; >> >> + res = foo_sign (a, b, l, n); >> >> + if (res != l) >> >> + fail++; >> >> + >> >> + l = n + C; >> >> + res = foo_sign (a, b, l, n); >> >> + if (neq ((res - MIN) / C, 0)) >> >> + fail++; >> >> + >> >> + n = N_BASE_DOWN; >> >> + l = n - C; >> >> + res = bar_sign (a, b, l, n); >> >> + if (neq ((MAX - res) / C, 0)) >> >> + fail++; >> >> + >> >> + l = n; >> >> + res = bar_sign (a, b, l, n); >> >> + if (res != l) >> >> + fail++; >> >> + >> >> + l = n + C; >> >> + res = bar_sign (a, b, l, n); >> >> + if (res != l) >> >> + fail++; >> >> + >> >> + if (fail) >> >> + __builtin_abort (); >> >> + return 0; >> >> +} >> >> diff --git a/gcc/testsuite/gcc.dg/vect/pr101145_1.c >> >> b/gcc/testsuite/gcc.dg/vect/pr101145_1.c >> >> new file mode 100644 >> >> index 00000000000..94f6b99b893 >> >> --- /dev/null >> >> +++ b/gcc/testsuite/gcc.dg/vect/pr101145_1.c >> >> @@ -0,0 +1,15 @@ >> >> +/* { dg-require-effective-target vect_int } */ >> >> +/* { dg-options "-O3 -fdump-tree-vect-details" } */ >> >> +#define TYPE signed char >> >> +#define MIN -128 >> >> +#define MAX 127 >> >> +#define N_BASE (MAX - 32) >> >> +#define N_BASE_DOWN (MIN + 32) >> >> + >> >> +#define C 3 >> >> +#define L_BASE l >> >> +#define L_BASE_DOWN l >> >> + >> >> +#include "pr101145.inc" >> >> + >> >> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } >> >> } */ >> >> diff --git a/gcc/testsuite/gcc.dg/vect/pr101145_2.c >> >> b/gcc/testsuite/gcc.dg/vect/pr101145_2.c >> >> new file mode 100644 >> >> index 00000000000..d3cfc9e01e1 >> >> --- /dev/null >> >> +++ b/gcc/testsuite/gcc.dg/vect/pr101145_2.c >> >> @@ -0,0 +1,15 @@ >> >> +/* { dg-require-effective-target vect_int } */ >> >> +/* { dg-options "-O3 -fdump-tree-vect-details" } */ >> >> +#define TYPE unsigned char >> >> +#define MIN 0 >> >> +#define MAX 255 >> >> +#define N_BASE (MAX - 32 + 1) >> >> +#define N_BASE_DOWN (MIN + 32) >> >> + >> >> +#define C 2 >> >> +#define L_BASE l >> >> +#define L_BASE_DOWN l >> >> + >> >> +#include "pr101145.inc" >> >> + >> >> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } >> >> } */ >> >> diff --git a/gcc/testsuite/gcc.dg/vect/pr101145_3.c >> >> b/gcc/testsuite/gcc.dg/vect/pr101145_3.c >> >> new file mode 100644 >> >> index 00000000000..e2cf9b1f7e6 >> >> --- /dev/null >> >> +++ b/gcc/testsuite/gcc.dg/vect/pr101145_3.c >> >> @@ -0,0 +1,15 @@ >> >> +/* { dg-require-effective-target vect_int } */ >> >> +/* { dg-options "-O3 -fdump-tree-vect-details" } */ >> >> +#define TYPE int * >> >> +#define MIN ((TYPE)0) >> >> +#define MAX ((TYPE)((long long)-1)) >> >> +#define N_BASE ((TYPE) (-32)) >> >> +#define N_BASE_DOWN (MIN + 32) >> >> + >> >> +#define C 1 >> >> +#define L_BASE l >> >> +#define L_BASE_DOWN l >> >> + >> >> +#include "pr101145.inc" >> >> + >> >> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } >> >> } */ >> >> diff --git a/gcc/tree-ssa-loop-niter.c b/gcc/tree-ssa-loop-niter.c >> >> index b5add827018..06db6a36ef8 100644 >> >> --- a/gcc/tree-ssa-loop-niter.c >> >> +++ b/gcc/tree-ssa-loop-niter.c >> >> @@ -1473,6 +1473,86 @@ assert_loop_rolls_lt (tree type, affine_iv >> >> *iv0, affine_iv *iv1, >> >> } >> >> } >> >> >> >> +/* Determines number of iterations of loop whose ending condition >> >> + is IV0 < IV1 which likes: {base, -C} < n, or n < {base, C}. >> >> + The number of iterations is stored to NITER. */ >> >> + >> >> +static bool >> >> +number_of_iterations_until_wrap (class loop *, tree type, affine_iv >> >> *iv0, >> >> + affine_iv *iv1, class tree_niter_desc *niter) >> >> +{ >> >> + tree niter_type = unsigned_type_for (type); >> >> + tree max, min; >> >> + >> >> + if (POINTER_TYPE_P (type)) >> >> + { >> >> + max = fold_convert (type, TYPE_MAX_VALUE (niter_type)); >> >> + min = fold_convert (type, TYPE_MIN_VALUE (niter_type)); >> >> + } >> >> + else >> >> + { >> >> + max = TYPE_MAX_VALUE (type); >> >> + min = TYPE_MIN_VALUE (type); >> >> + } >> > >> > It would be cheaper to use wide_int here - wi::min/max_value >> > (TYPE_PRECISION (...), TYPE_SIGN (...)) and in the simplification >> > code below. >> >> Thanks! I think you mean to use wide_int for "tree high = max, low = >> min,". >> I would check and update the patch to use wide_int. >> >> The MAX/MIN would be in type 'tree' when binding to MINUS_EXPR with >> iv0/iv1->base, >> for this, we may still need to use "TYPE_MAX/MIN_VALUE (type)", right? >> >> > >> >> + tree high = max, low = min, one = build_int_cst (niter_type, 1); >> >> + tree step; >> >> + >> >> + /* n < {base, C}. */ >> >> + if (integer_zerop (iv0->step) && TREE_CODE (iv1->step) == >> >> INTEGER_CST >> >> + && !tree_int_cst_sign_bit (iv1->step)) >> >> + { >> >> + step = iv1->step; >> >> + niter->niter = fold_build2 (MINUS_EXPR, niter_type, max, >> >> iv1->base); >> >> + if (TREE_CODE (iv1->base) == INTEGER_CST) >> >> + low = fold_build2 (MINUS_EXPR, type, iv1->base, one); >> >> + else if (TREE_CODE (iv0->base) == INTEGER_CST) >> >> + low = iv0->base; >> >> + } >> >> + /* {base, -C} < n. */ >> >> + else if (TREE_CODE (iv0->step) == INTEGER_CST >> >> + && tree_int_cst_sign_bit (iv0->step) && integer_zerop >> >> (iv1->step)) >> >> + { >> >> + step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv0->step), >> >> iv0->step); >> >> + niter->niter = fold_build2 (MINUS_EXPR, niter_type, iv0->base, >> >> min); >> >> + if (TREE_CODE (iv0->base) == INTEGER_CST) >> >> + high = fold_build2 (PLUS_EXPR, type, iv0->base, one); >> >> + else if (TREE_CODE (iv1->base) == INTEGER_CST) >> >> + high = iv1->base; >> >> + } >> >> + else >> >> + return false; >> >> + >> >> + /* (delta + step - 1) / step */ >> >> + step = fold_convert (niter_type, step); >> >> + niter->niter = fold_convert (niter_type, niter->niter); >> >> + niter->niter = fold_build2 (PLUS_EXPR, niter_type, niter->niter, >> >> step); >> >> + niter->niter = fold_build2 (FLOOR_DIV_EXPR, niter_type, >> >> niter->niter, step); >> >> + >> >> + tree m = fold_build2 (MINUS_EXPR, niter_type, high, low); >> >> + m = fold_convert (niter_type, m); >> >> + mpz_t mstep, tmp, mmax; >> >> + mpz_init (mstep); >> >> + mpz_init (tmp); >> >> + mpz_init (mmax); >> >> + wi::to_mpz (wi::to_wide (step), mstep, UNSIGNED); >> >> + wi::to_mpz (wi::to_wide (m), mmax, UNSIGNED); >> >> + mpz_add (tmp, mmax, mstep); >> >> + mpz_sub_ui (tmp, tmp, 1); >> >> + mpz_fdiv_q (tmp, tmp, mstep); >> >> + niter->max = widest_int::from (wi::from_mpz (niter_type, tmp, >> >> false), >> >> + TYPE_SIGN (niter_type)); >> >> + mpz_clear (mstep); >> >> + mpz_clear (tmp); >> > >> > You forget to clear mmax? I wonder why you need to involve GMP here >> Oh, thanks for pointing out this :) need to clear mmax. >> >> > at all since this is just add/sub/divide - you could use >> > widest_int directly I think. At least the back-and-forth between >> > trees, wide-int and mpz looks odd. >> I once use 'trees' to do the calculation, while I encounter the issue >> that the may not hold the precision for some cases: >> "mmax" is MAX(1<<64-1); step is 4, then mmax + 4 becomes 3. >> >> I would check to use widest_int (e.g. long[4]) which seems enough to >> hold all types. >> >> >> BR, >> Jiufu Guo. >> >> > >> > Richard. >> > >> >> + niter->may_be_zero >> >> + = fold_build2 (LE_EXPR, boolean_type_node, iv1->base, iv0->base); >> >> + >> >> + niter->control.no_overflow = false; >> >> + >> >> + return true; >> >> +} >> >> + >> >> /* Determines number of iterations of loop whose ending condition >> >> is IV0 < IV1. TYPE is the type of the iv. The number of >> >> iterations is stored to NITER. BNDS bounds the difference >> >> @@ -1501,6 +1581,11 @@ number_of_iterations_lt (class loop *loop, tree >> >> type, affine_iv *iv0, >> >> niter->bound = iv0->base; >> >> } >> >> >> >> + /* {base, -C} < n, or n < {base, C} */ >> >> + if (tree_int_cst_sign_bit (iv0->step) >> >> + || (!integer_zerop (iv1->step) && !tree_int_cst_sign_bit >> >> (iv1->step))) >> >> + return number_of_iterations_until_wrap (loop, type, iv0, iv1, >> >> niter); >> >> + >> >> delta = fold_build2 (MINUS_EXPR, niter_type, >> >> fold_convert (niter_type, iv1->base), >> >> fold_convert (niter_type, iv0->base)); >> >> @@ -1665,62 +1750,6 @@ dump_affine_iv (FILE *file, affine_iv *iv) >> >> } >> >> } >> >> >> >> -/* Given exit condition IV0 CODE IV1 in TYPE, this function adjusts >> >> - the condition for loop-until-wrap cases. For example: >> >> - (unsigned){8, -1}_loop < 10 => {0, 1} != 9 >> >> - 10 < (unsigned){0, max - 7}_loop => {0, 1} != 8 >> >> - Return true if condition is successfully adjusted. */ >> >> - >> >> -static bool >> >> -adjust_cond_for_loop_until_wrap (tree type, affine_iv *iv0, tree_code >> >> *code, >> >> - affine_iv *iv1) >> >> -{ >> >> - /* Only support simple cases for the moment. */ >> >> - if (TREE_CODE (iv0->base) != INTEGER_CST >> >> - || TREE_CODE (iv1->base) != INTEGER_CST) >> >> - return false; >> >> - >> >> - tree niter_type = unsigned_type_for (type), high, low; >> >> - /* Case: i-- < 10. */ >> >> - if (integer_zerop (iv1->step)) >> >> - { >> >> - /* TODO: Should handle case in which abs(step) != 1. */ >> >> - if (!integer_minus_onep (iv0->step)) >> >> - return false; >> >> - /* Give up on infinite loop. */ >> >> - if (*code == LE_EXPR >> >> - && tree_int_cst_equal (iv1->base, TYPE_MAX_VALUE (type))) >> >> - return false; >> >> - high = fold_build2 (PLUS_EXPR, niter_type, >> >> - fold_convert (niter_type, iv0->base), >> >> - build_int_cst (niter_type, 1)); >> >> - low = fold_convert (niter_type, TYPE_MIN_VALUE (type)); >> >> - } >> >> - else if (integer_zerop (iv0->step)) >> >> - { >> >> - /* TODO: Should handle case in which abs(step) != 1. */ >> >> - if (!integer_onep (iv1->step)) >> >> - return false; >> >> - /* Give up on infinite loop. */ >> >> - if (*code == LE_EXPR >> >> - && tree_int_cst_equal (iv0->base, TYPE_MIN_VALUE (type))) >> >> - return false; >> >> - high = fold_convert (niter_type, TYPE_MAX_VALUE (type)); >> >> - low = fold_build2 (MINUS_EXPR, niter_type, >> >> - fold_convert (niter_type, iv1->base), >> >> - build_int_cst (niter_type, 1)); >> >> - } >> >> - else >> >> - gcc_unreachable (); >> >> - >> >> - iv0->base = low; >> >> - iv0->step = fold_convert (niter_type, integer_one_node); >> >> - iv1->base = high; >> >> - iv1->step = build_int_cst (niter_type, 0); >> >> - *code = NE_EXPR; >> >> - return true; >> >> -} >> >> - >> >> /* Determine the number of iterations according to condition (for >> >> staying >> >> inside loop) which compares two induction variables using >> >> comparison >> >> operator CODE. The induction variable on left side of the >> >> comparison >> >> @@ -1855,15 +1884,6 @@ number_of_iterations_cond (class loop *loop, >> >> return true; >> >> } >> >> >> >> - /* Handle special case loops: while (i-- < 10) and while (10 < i++) >> >> by >> >> - adjusting iv0, iv1 and code. */ >> >> - if (code != NE_EXPR >> >> - && (tree_int_cst_sign_bit (iv0->step) >> >> - || (!integer_zerop (iv1->step) >> >> - && !tree_int_cst_sign_bit (iv1->step))) >> >> - && !adjust_cond_for_loop_until_wrap (type, iv0, &code, iv1)) >> >> - return false; >> >> - >> >> /* OK, now we know we have a senseful loop. Handle several cases, >> >> depending >> >> on what comparison operator is used. */ >> >> bound_difference (loop, iv1->base, iv0->base, &bnds); >> >> ^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] Analyze niter for until-wrap condition [PR101145] 2021-07-01 12:35 ` Richard Biener 2021-07-01 14:12 ` guojiufu @ 2021-07-02 7:55 ` guojiufu 1 sibling, 0 replies; 8+ messages in thread From: guojiufu @ 2021-07-02 7:55 UTC (permalink / raw) To: Richard Biener; +Cc: gcc-patches, wschmidt, segher, dje.gcc, jlaw, amker On 2021-07-01 20:35, Richard Biener wrote: > On Thu, 1 Jul 2021, Jiufu Guo wrote: > >> For code like: >> unsigned foo(unsigned val, unsigned start) >> { >> unsigned cnt = 0; >> for (unsigned i = start; i > val; ++i) >> cnt++; >> return cnt; >> } >> >> The number of iterations should be about UINT_MAX - start. > > For > > unsigned foo(unsigned val, unsigned start) > { > unsigned cnt = 0; > for (unsigned i = start; i >= val; ++i) > cnt++; > return cnt; > } > > and val == 0 the loop never terminates. I don't see anywhere > in the patch that you disregard GE_EXPR and I remember > the code handles GE as well as GT? From a quick look this is > also not covered by a testcase you add - not exactly sure > how it would materialize in a miscompilation. > Find a similar issue on the below code with the trunk. The below code should run infinite, but it exits quickly. #include <limits.h> __attribute__ ((noinline)) unsigned foo(unsigned val, unsigned start) { unsigned cnt = 0; for (unsigned i = start; i <= val; i+=16) cnt++; return cnt; } int main() { return foo (UINT_MAX-7, 8); } Just opened https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101291 BR, Jiufu Guo. >> There is function adjust_cond_for_loop_until_wrap which >> handles similar work for const bases. >> Like adjust_cond_for_loop_until_wrap, this patch enhance >> function number_of_iterations_cond/number_of_iterations_lt >> to analyze number of iterations for this kind of loop. >> >> Bootstrap and regtest pass on powerpc64le, is this ok for trunk? >> >> gcc/ChangeLog: >> >> PR tree-optimization/101145 >> * tree-ssa-loop-niter.c >> (number_of_iterations_until_wrap): New function. >> (number_of_iterations_lt): Invoke above function. >> (adjust_cond_for_loop_until_wrap): >> Merge to number_of_iterations_until_wrap. >> (number_of_iterations_cond): Update invokes for >> adjust_cond_for_loop_until_wrap and number_of_iterations_lt. >> >> gcc/testsuite/ChangeLog: >> >> PR tree-optimization/101145 >> * gcc.dg/vect/pr101145.c: New test. >> * gcc.dg/vect/pr101145.inc: New test. >> * gcc.dg/vect/pr101145_1.c: New test. >> * gcc.dg/vect/pr101145_2.c: New test. >> * gcc.dg/vect/pr101145_3.c: New test. >> --- >> gcc/testsuite/gcc.dg/vect/pr101145.c | 187 >> +++++++++++++++++++++++++ >> gcc/testsuite/gcc.dg/vect/pr101145.inc | 63 +++++++++ >> gcc/testsuite/gcc.dg/vect/pr101145_1.c | 15 ++ >> gcc/testsuite/gcc.dg/vect/pr101145_2.c | 15 ++ >> gcc/testsuite/gcc.dg/vect/pr101145_3.c | 15 ++ >> gcc/tree-ssa-loop-niter.c | 150 +++++++++++--------- >> 6 files changed, 380 insertions(+), 65 deletions(-) >> create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145.c >> create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145.inc >> create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145_1.c >> create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145_2.c >> create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145_3.c >> >> diff --git a/gcc/testsuite/gcc.dg/vect/pr101145.c >> b/gcc/testsuite/gcc.dg/vect/pr101145.c >> new file mode 100644 >> index 00000000000..74031b031cf >> --- /dev/null >> +++ b/gcc/testsuite/gcc.dg/vect/pr101145.c >> @@ -0,0 +1,187 @@ >> +/* { dg-require-effective-target vect_int } */ >> +/* { dg-options "-O3 -fdump-tree-vect-details" } */ >> +#include <limits.h> >> + >> +unsigned __attribute__ ((noinline)) >> +foo (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned >> n) >> +{ >> + while (n < ++l) >> + *a++ = *b++ + 1; >> + return l; >> +} >> + >> +unsigned __attribute__ ((noinline)) >> +foo_1 (int *__restrict__ a, int *__restrict__ b, unsigned l, >> unsigned) >> +{ >> + while (UINT_MAX - 64 < ++l) >> + *a++ = *b++ + 1; >> + return l; >> +} >> + >> +unsigned __attribute__ ((noinline)) >> +foo_2 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned >> n) >> +{ >> + l = UINT_MAX - 32; >> + while (n < ++l) >> + *a++ = *b++ + 1; >> + return l; >> +} >> + >> +unsigned __attribute__ ((noinline)) >> +foo_3 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned >> n) >> +{ >> + while (n <= ++l) >> + *a++ = *b++ + 1; >> + return l; >> +} >> + >> +unsigned __attribute__ ((noinline)) >> +foo_4 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned >> n) >> +{ // infininate >> + while (0 <= ++l) >> + *a++ = *b++ + 1; >> + return l; >> +} >> + >> +unsigned __attribute__ ((noinline)) >> +foo_5 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned >> n) >> +{ >> + //no loop >> + l = UINT_MAX; >> + while (n < ++l) >> + *a++ = *b++ + 1; >> + return l; >> +} >> + >> +unsigned __attribute__ ((noinline)) >> +bar (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned >> n) >> +{ >> + while (--l < n) >> + *a++ = *b++ + 1; >> + return l; >> +} >> + >> +unsigned __attribute__ ((noinline)) >> +bar_1 (int *__restrict__ a, int *__restrict__ b, unsigned l, >> unsigned) >> +{ >> + while (--l < 64) >> + *a++ = *b++ + 1; >> + return l; >> +} >> + >> +unsigned __attribute__ ((noinline)) >> +bar_2 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned >> n) >> +{ >> + l = 32; >> + while (--l < n) >> + *a++ = *b++ + 1; >> + return l; >> +} >> + >> + >> +int a[3200], b[3200]; >> +int fail; >> + >> +int >> +main () >> +{ >> + unsigned l, n; >> + unsigned res; >> + /* l > n*/ >> + n = UINT_MAX - 64; >> + l = n + 32; >> + res = foo (a, b, l, n); >> + if (res != 0) >> + fail++; >> + >> + l = n; >> + res = foo (a, b, l, n); >> + if (res != 0) >> + fail++; >> + >> + l = n - 1; >> + res = foo (a, b, l, n); >> + if (res != l + 1) >> + fail++; >> + >> + l = n - 32; >> + res = foo (a, b, l, n); >> + if (res != l + 1) >> + fail++; >> + >> + l = UINT_MAX; >> + res = foo (a, b, l, n); >> + if (res != 0) >> + fail++; >> + >> + l = n + 32; >> + res = foo_1 (a, b, l, n); >> + if (res != 0) >> + fail++; >> + >> + l = n + 32; >> + res = foo_2 (a, b, l, n); >> + if (res != 0) >> + fail++; >> + >> + l = n; >> + res = foo_3 (a, b, l, n); >> + if (res != 0) >> + fail++; >> + >> + l = n - 1; >> + res = foo_3 (a, b, l, n); >> + if (res != 0) >> + fail++; >> + >> + l = n - 2; >> + res = foo_3 (a, b, l, n); >> + if (res != l + 1) >> + fail++; >> + >> + res = foo_5 (a, b, l, n); >> + if (res != 0) >> + fail++; >> + >> + n = 64; >> + l = n - 32; >> + res = bar (a, b, l, n); >> + res++; >> + if (res != 0) >> + fail++; >> + >> + l = n; >> + res = bar (a, b, l, n); >> + res++; >> + if (res != 0) >> + fail++; >> + >> + l = n + 1; >> + res = bar (a, b, l, n); >> + res++; >> + if (res != l) >> + fail++; >> + >> + l = 0; >> + res = bar (a, b, l, n); >> + res++; >> + if (res != l) >> + fail++; >> + >> + l = 32; >> + res = bar_1 (a, b, l, n); >> + res++; >> + if (res != 0) >> + fail++; >> + >> + res = bar_1 (a, b, l, n); >> + res++; >> + if (res != 0) >> + fail++; >> + >> + if (fail) >> + __builtin_abort (); >> + return 0; >> +} >> + >> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 7 "vect" } >> } */ >> diff --git a/gcc/testsuite/gcc.dg/vect/pr101145.inc >> b/gcc/testsuite/gcc.dg/vect/pr101145.inc >> new file mode 100644 >> index 00000000000..6eed3fa8aca >> --- /dev/null >> +++ b/gcc/testsuite/gcc.dg/vect/pr101145.inc >> @@ -0,0 +1,63 @@ >> +TYPE __attribute__ ((noinline)) >> +foo_sign (int *__restrict__ a, int *__restrict__ b, TYPE l, TYPE n) >> +{ >> + for (l = L_BASE; n < l; l += C) >> + *a++ = *b++ + 1; >> + return l; >> +} >> + >> +TYPE __attribute__ ((noinline)) >> +bar_sign (int *__restrict__ a, int *__restrict__ b, TYPE l, TYPE n) >> +{ >> + for (l = L_BASE_DOWN; l < n; l -= C) >> + *a++ = *b++ + 1; >> + return l; >> +} >> + >> +int __attribute__ ((noinline)) neq (int a, int b) { return a != b; } >> + >> +int a[1000], b[1000]; >> +int fail; >> + >> +int >> +main () >> +{ >> + TYPE res; >> + TYPE l; >> + TYPE n; >> + n = N_BASE; >> + l = n - C; >> + res = foo_sign (a, b, l, n); >> + if (res != l) >> + fail++; >> + >> + l = n; >> + res = foo_sign (a, b, l, n); >> + if (res != l) >> + fail++; >> + >> + l = n + C; >> + res = foo_sign (a, b, l, n); >> + if (neq ((res - MIN) / C, 0)) >> + fail++; >> + >> + n = N_BASE_DOWN; >> + l = n - C; >> + res = bar_sign (a, b, l, n); >> + if (neq ((MAX - res) / C, 0)) >> + fail++; >> + >> + l = n; >> + res = bar_sign (a, b, l, n); >> + if (res != l) >> + fail++; >> + >> + l = n + C; >> + res = bar_sign (a, b, l, n); >> + if (res != l) >> + fail++; >> + >> + if (fail) >> + __builtin_abort (); >> + return 0; >> +} >> diff --git a/gcc/testsuite/gcc.dg/vect/pr101145_1.c >> b/gcc/testsuite/gcc.dg/vect/pr101145_1.c >> new file mode 100644 >> index 00000000000..94f6b99b893 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.dg/vect/pr101145_1.c >> @@ -0,0 +1,15 @@ >> +/* { dg-require-effective-target vect_int } */ >> +/* { dg-options "-O3 -fdump-tree-vect-details" } */ >> +#define TYPE signed char >> +#define MIN -128 >> +#define MAX 127 >> +#define N_BASE (MAX - 32) >> +#define N_BASE_DOWN (MIN + 32) >> + >> +#define C 3 >> +#define L_BASE l >> +#define L_BASE_DOWN l >> + >> +#include "pr101145.inc" >> + >> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } >> } */ >> diff --git a/gcc/testsuite/gcc.dg/vect/pr101145_2.c >> b/gcc/testsuite/gcc.dg/vect/pr101145_2.c >> new file mode 100644 >> index 00000000000..d3cfc9e01e1 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.dg/vect/pr101145_2.c >> @@ -0,0 +1,15 @@ >> +/* { dg-require-effective-target vect_int } */ >> +/* { dg-options "-O3 -fdump-tree-vect-details" } */ >> +#define TYPE unsigned char >> +#define MIN 0 >> +#define MAX 255 >> +#define N_BASE (MAX - 32 + 1) >> +#define N_BASE_DOWN (MIN + 32) >> + >> +#define C 2 >> +#define L_BASE l >> +#define L_BASE_DOWN l >> + >> +#include "pr101145.inc" >> + >> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } >> } */ >> diff --git a/gcc/testsuite/gcc.dg/vect/pr101145_3.c >> b/gcc/testsuite/gcc.dg/vect/pr101145_3.c >> new file mode 100644 >> index 00000000000..e2cf9b1f7e6 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.dg/vect/pr101145_3.c >> @@ -0,0 +1,15 @@ >> +/* { dg-require-effective-target vect_int } */ >> +/* { dg-options "-O3 -fdump-tree-vect-details" } */ >> +#define TYPE int * >> +#define MIN ((TYPE)0) >> +#define MAX ((TYPE)((long long)-1)) >> +#define N_BASE ((TYPE) (-32)) >> +#define N_BASE_DOWN (MIN + 32) >> + >> +#define C 1 >> +#define L_BASE l >> +#define L_BASE_DOWN l >> + >> +#include "pr101145.inc" >> + >> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } >> } */ >> diff --git a/gcc/tree-ssa-loop-niter.c b/gcc/tree-ssa-loop-niter.c >> index b5add827018..06db6a36ef8 100644 >> --- a/gcc/tree-ssa-loop-niter.c >> +++ b/gcc/tree-ssa-loop-niter.c >> @@ -1473,6 +1473,86 @@ assert_loop_rolls_lt (tree type, affine_iv >> *iv0, affine_iv *iv1, >> } >> } >> >> +/* Determines number of iterations of loop whose ending condition >> + is IV0 < IV1 which likes: {base, -C} < n, or n < {base, C}. >> + The number of iterations is stored to NITER. */ >> + >> +static bool >> +number_of_iterations_until_wrap (class loop *, tree type, affine_iv >> *iv0, >> + affine_iv *iv1, class tree_niter_desc *niter) >> +{ >> + tree niter_type = unsigned_type_for (type); >> + tree max, min; >> + >> + if (POINTER_TYPE_P (type)) >> + { >> + max = fold_convert (type, TYPE_MAX_VALUE (niter_type)); >> + min = fold_convert (type, TYPE_MIN_VALUE (niter_type)); >> + } >> + else >> + { >> + max = TYPE_MAX_VALUE (type); >> + min = TYPE_MIN_VALUE (type); >> + } > > It would be cheaper to use wide_int here - wi::min/max_value > (TYPE_PRECISION (...), TYPE_SIGN (...)) and in the simplification > code below. > >> + tree high = max, low = min, one = build_int_cst (niter_type, 1); >> + tree step; >> + >> + /* n < {base, C}. */ >> + if (integer_zerop (iv0->step) && TREE_CODE (iv1->step) == >> INTEGER_CST >> + && !tree_int_cst_sign_bit (iv1->step)) >> + { >> + step = iv1->step; >> + niter->niter = fold_build2 (MINUS_EXPR, niter_type, max, >> iv1->base); >> + if (TREE_CODE (iv1->base) == INTEGER_CST) >> + low = fold_build2 (MINUS_EXPR, type, iv1->base, one); >> + else if (TREE_CODE (iv0->base) == INTEGER_CST) >> + low = iv0->base; >> + } >> + /* {base, -C} < n. */ >> + else if (TREE_CODE (iv0->step) == INTEGER_CST >> + && tree_int_cst_sign_bit (iv0->step) && integer_zerop >> (iv1->step)) >> + { >> + step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv0->step), >> iv0->step); >> + niter->niter = fold_build2 (MINUS_EXPR, niter_type, iv0->base, >> min); >> + if (TREE_CODE (iv0->base) == INTEGER_CST) >> + high = fold_build2 (PLUS_EXPR, type, iv0->base, one); >> + else if (TREE_CODE (iv1->base) == INTEGER_CST) >> + high = iv1->base; >> + } >> + else >> + return false; >> + >> + /* (delta + step - 1) / step */ >> + step = fold_convert (niter_type, step); >> + niter->niter = fold_convert (niter_type, niter->niter); >> + niter->niter = fold_build2 (PLUS_EXPR, niter_type, niter->niter, >> step); >> + niter->niter = fold_build2 (FLOOR_DIV_EXPR, niter_type, >> niter->niter, step); >> + >> + tree m = fold_build2 (MINUS_EXPR, niter_type, high, low); >> + m = fold_convert (niter_type, m); >> + mpz_t mstep, tmp, mmax; >> + mpz_init (mstep); >> + mpz_init (tmp); >> + mpz_init (mmax); >> + wi::to_mpz (wi::to_wide (step), mstep, UNSIGNED); >> + wi::to_mpz (wi::to_wide (m), mmax, UNSIGNED); >> + mpz_add (tmp, mmax, mstep); >> + mpz_sub_ui (tmp, tmp, 1); >> + mpz_fdiv_q (tmp, tmp, mstep); >> + niter->max = widest_int::from (wi::from_mpz (niter_type, tmp, >> false), >> + TYPE_SIGN (niter_type)); >> + mpz_clear (mstep); >> + mpz_clear (tmp); > > You forget to clear mmax? I wonder why you need to involve GMP here > at all since this is just add/sub/divide - you could use > widest_int directly I think. At least the back-and-forth between > trees, wide-int and mpz looks odd. > > Richard. > >> + niter->may_be_zero >> + = fold_build2 (LE_EXPR, boolean_type_node, iv1->base, iv0->base); >> + >> + niter->control.no_overflow = false; >> + >> + return true; >> +} >> + >> /* Determines number of iterations of loop whose ending condition >> is IV0 < IV1. TYPE is the type of the iv. The number of >> iterations is stored to NITER. BNDS bounds the difference >> @@ -1501,6 +1581,11 @@ number_of_iterations_lt (class loop *loop, tree >> type, affine_iv *iv0, >> niter->bound = iv0->base; >> } >> >> + /* {base, -C} < n, or n < {base, C} */ >> + if (tree_int_cst_sign_bit (iv0->step) >> + || (!integer_zerop (iv1->step) && !tree_int_cst_sign_bit >> (iv1->step))) >> + return number_of_iterations_until_wrap (loop, type, iv0, iv1, >> niter); >> + >> delta = fold_build2 (MINUS_EXPR, niter_type, >> fold_convert (niter_type, iv1->base), >> fold_convert (niter_type, iv0->base)); >> @@ -1665,62 +1750,6 @@ dump_affine_iv (FILE *file, affine_iv *iv) >> } >> } >> >> -/* Given exit condition IV0 CODE IV1 in TYPE, this function adjusts >> - the condition for loop-until-wrap cases. For example: >> - (unsigned){8, -1}_loop < 10 => {0, 1} != 9 >> - 10 < (unsigned){0, max - 7}_loop => {0, 1} != 8 >> - Return true if condition is successfully adjusted. */ >> - >> -static bool >> -adjust_cond_for_loop_until_wrap (tree type, affine_iv *iv0, tree_code >> *code, >> - affine_iv *iv1) >> -{ >> - /* Only support simple cases for the moment. */ >> - if (TREE_CODE (iv0->base) != INTEGER_CST >> - || TREE_CODE (iv1->base) != INTEGER_CST) >> - return false; >> - >> - tree niter_type = unsigned_type_for (type), high, low; >> - /* Case: i-- < 10. */ >> - if (integer_zerop (iv1->step)) >> - { >> - /* TODO: Should handle case in which abs(step) != 1. */ >> - if (!integer_minus_onep (iv0->step)) >> - return false; >> - /* Give up on infinite loop. */ >> - if (*code == LE_EXPR >> - && tree_int_cst_equal (iv1->base, TYPE_MAX_VALUE (type))) >> - return false; >> - high = fold_build2 (PLUS_EXPR, niter_type, >> - fold_convert (niter_type, iv0->base), >> - build_int_cst (niter_type, 1)); >> - low = fold_convert (niter_type, TYPE_MIN_VALUE (type)); >> - } >> - else if (integer_zerop (iv0->step)) >> - { >> - /* TODO: Should handle case in which abs(step) != 1. */ >> - if (!integer_onep (iv1->step)) >> - return false; >> - /* Give up on infinite loop. */ >> - if (*code == LE_EXPR >> - && tree_int_cst_equal (iv0->base, TYPE_MIN_VALUE (type))) >> - return false; >> - high = fold_convert (niter_type, TYPE_MAX_VALUE (type)); >> - low = fold_build2 (MINUS_EXPR, niter_type, >> - fold_convert (niter_type, iv1->base), >> - build_int_cst (niter_type, 1)); >> - } >> - else >> - gcc_unreachable (); >> - >> - iv0->base = low; >> - iv0->step = fold_convert (niter_type, integer_one_node); >> - iv1->base = high; >> - iv1->step = build_int_cst (niter_type, 0); >> - *code = NE_EXPR; >> - return true; >> -} >> - >> /* Determine the number of iterations according to condition (for >> staying >> inside loop) which compares two induction variables using >> comparison >> operator CODE. The induction variable on left side of the >> comparison >> @@ -1855,15 +1884,6 @@ number_of_iterations_cond (class loop *loop, >> return true; >> } >> >> - /* Handle special case loops: while (i-- < 10) and while (10 < i++) >> by >> - adjusting iv0, iv1 and code. */ >> - if (code != NE_EXPR >> - && (tree_int_cst_sign_bit (iv0->step) >> - || (!integer_zerop (iv1->step) >> - && !tree_int_cst_sign_bit (iv1->step))) >> - && !adjust_cond_for_loop_until_wrap (type, iv0, &code, iv1)) >> - return false; >> - >> /* OK, now we know we have a senseful loop. Handle several cases, >> depending >> on what comparison operator is used. */ >> bound_difference (loop, iv1->base, iv0->base, &bnds); >> ^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2021-07-02 7:56 UTC | newest] Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2021-07-01 2:05 [PATCH] Analyze niter for until-wrap condition [PR101145] Jiufu Guo 2021-07-01 7:22 ` Bin.Cheng 2021-07-01 11:32 ` guojiufu 2021-07-01 12:35 ` Richard Biener 2021-07-01 14:12 ` guojiufu 2021-07-02 0:51 ` Bin.Cheng 2021-07-02 4:06 ` guojiufu 2021-07-02 7:55 ` guojiufu
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).