diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h
index 528b1219bc37ad8f114d5cf381c0cff899db31ee..9c7f019a51abfe2de8e1dd7135dea2463b0256a0 100644
--- a/gcc/cfgloop.h
+++ b/gcc/cfgloop.h
@@ -385,6 +385,7 @@ extern basic_block *get_loop_body_in_custom_order (const class loop *, void *,
 extern auto_vec<edge> get_loop_exit_edges (const class loop *, basic_block * = NULL);
 extern edge single_exit (const class loop *);
+extern edge normal_exit (const class loop *);
 extern edge single_likely_exit (class loop *loop, const vec<edge> &);
 extern unsigned num_loop_branches (const class loop *);
diff --git a/gcc/cfgloop.cc b/gcc/cfgloop.cc
index 57bf7b1855d4dd20fb3f42388124932d0ca2b48a..97a7373fb6d9514da602d5be01050f2ec66094bc 100644
--- a/gcc/cfgloop.cc
+++ b/gcc/cfgloop.cc
@@ -1812,6 +1812,20 @@ single_exit (const class loop *loop)
   return NULL;
 }
 
+/* Returns the normal exit edge of LOOP, i.e. the exit that would be taken
+   were the loop to run to completion.  If the exits of loops are not
+   recorded, NULL is always returned.  */
+
+edge
+normal_exit (const class loop *loop)
+{
+  if (!loops_state_satisfies_p (LOOPS_HAVE_RECORDED_EXITS))
+    return NULL;
+
+  return loop->exits->next->e;
+}
+
 /* Returns true when BB has an incoming edge exiting LOOP.  */
 
 bool
diff --git a/gcc/doc/loop.texi b/gcc/doc/loop.texi
index 6e8657a074d2447db7ae9b75cbfbb71282b84287..e1de2ac40f87f879ab691f68bd41b3bc21a83bf7 100644
--- a/gcc/doc/loop.texi
+++ b/gcc/doc/loop.texi
@@ -211,6 +211,9 @@ relation, and breadth-first search order, respectively.
 @item @code{single_exit}: Returns the single exit edge of the loop, or
 @code{NULL} if the loop has more than one exit.  You can only use this
 function if @code{LOOPS_HAVE_RECORDED_EXITS} is used.
+@item @code{normal_exit}: Returns the natural exit edge of the loop,
+even if the loop has more than one exit.  The natural exit is the exit
+that would normally be taken were the loop to be fully executed.
 @item @code{get_loop_exit_edges}: Enumerates the exit edges of a loop.
 @item @code{just_once_each_iteration_p}: Returns true if the basic block
 is executed exactly once during each iteration of a loop (that is, it
diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi
index e21a1d381e05da1bfccb555068ea1dbeabd9fc79..16fa94ebf532d27cd9a3a45a7aad578ca6920496 100644
--- a/gcc/doc/sourcebuild.texi
+++ b/gcc/doc/sourcebuild.texi
@@ -1640,6 +1640,10 @@ Target supports hardware vectors of @code{float} when
 @option{-funsafe-math-optimizations} is not in effect.
 This implies @code{vect_float}.
 
+@item vect_early_break
+Target supports hardware vectorization of loops with early breaks.
+This requires an implementation of the cbranch optab for vectors.
+
 @item vect_int
 Target supports hardware vectors of @code{int}.
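By way of illustration, the following sketch (not part of the patch; the function name and bound are made up) shows the shape of loop the new vect_early_break selector gates: a counted loop with an additional data-dependent exit, which the vectorizer must now treat as a loop with two exits. The testsuite additions below exercise exactly this shape.

  /* A counted loop with an early break: the return adds a second,
     data-dependent exit alongside the counted exit, which is why the
     target needs a vector implementation of the cbranch optab.  */
  unsigned
  early_break_example (unsigned *a, unsigned x, unsigned n)
  {
    for (unsigned i = 0; i < n; i++)
      {
        if (a[i] > x)   /* Early exit, taken as soon as a match is found.  */
          return i;
        a[i] = x;       /* Otherwise store and continue to the counted exit.  */
      }
    return n;
  }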
diff --git a/gcc/testsuite/g++.dg/vect/vect-early-break_1.cc b/gcc/testsuite/g++.dg/vect/vect-early-break_1.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6a83648ca36e2c8feeb78335fccf3f3b82a97d2e
--- /dev/null
+++ b/gcc/testsuite/g++.dg/vect/vect-early-break_1.cc
@@ -0,0 +1,61 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-additional-options "-w -O2" } */
+
+void fancy_abort(char *, int, const char *) __attribute__((__noreturn__));
+template <int N, typename C> struct poly_int_pod { int coeffs[N]; };
+template <int N, typename C> class poly_int : public poly_int_pod<N, C> {
+public:
+  template <typename Ca> poly_int &operator+=(const poly_int_pod<N, Ca> &);
+};
+template <int N, typename C>
+template <typename Ca>
+poly_int<N, C> &poly_int<N, C>::operator+=(const poly_int_pod<N, Ca> &a) {
+  for (int i = 0; i < N; i++)
+    this->coeffs[i] += a.coeffs[i];
+  return *this;
+}
+template <int N, typename Ca, typename Cb>
+poly_int<N, Ca> exact_div(poly_int_pod<N, Ca>, Cb) {
+  poly_int<N, Ca> r;
+  return r;
+}
+struct vec_prefix {
+  unsigned m_num;
+};
+struct vl_ptr;
+struct va_heap {
+  typedef vl_ptr default_layout;
+};
+template <typename T, typename A = va_heap, typename = typename A::default_layout>
+struct vec;
+template <typename T, typename A> struct vec<T, A, int> {
+  T &operator[](unsigned);
+  vec_prefix m_vecpfx;
+  T m_vecdata[];
+};
+template <typename T, typename A> T &vec<T, A, int>::operator[](unsigned ix) {
+  m_vecpfx.m_num ? fancy_abort("", 9, __FUNCTION__), 0 : 0;
+  return m_vecdata[ix];
+}
+template <typename T> struct vec<T, va_heap, vl_ptr> {
+  T &operator[](unsigned ix) { return m_vec[ix]; }
+  vec<T, va_heap, int> m_vec;
+};
+class auto_vec : public vec<poly_int<2, long>, va_heap> {};
+template <typename T> class vector_builder : public auto_vec {};
+class int_vector_builder : public vector_builder<int> {
+public:
+  int_vector_builder(poly_int<2, long>, int, int);
+};
+bool vect_grouped_store_supported() {
+  int i;
+  poly_int<2, long> nelt;
+  int_vector_builder sel(nelt, 2, 3);
+  for (i = 0; i < 6; i++)
+    sel[i] += exact_div(nelt, 2);
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/g++.dg/vect/vect-early-break_2.cc b/gcc/testsuite/g++.dg/vect/vect-early-break_2.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6a83648ca36e2c8feeb78335fccf3f3b82a97d2e
--- /dev/null
+++ b/gcc/testsuite/g++.dg/vect/vect-early-break_2.cc
@@ -0,0 +1,61 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-additional-options "-w -O2" } */
+
+void fancy_abort(char *, int, const char *) __attribute__((__noreturn__));
+template <int N, typename C> struct poly_int_pod { int coeffs[N]; };
+template <int N, typename C> class poly_int : public poly_int_pod<N, C> {
+public:
+  template <typename Ca> poly_int &operator+=(const poly_int_pod<N, Ca> &);
+};
+template <int N, typename C>
+template <typename Ca>
+poly_int<N, C> &poly_int<N, C>::operator+=(const poly_int_pod<N, Ca> &a) {
+  for (int i = 0; i < N; i++)
+    this->coeffs[i] += a.coeffs[i];
+  return *this;
+}
+template <int N, typename Ca, typename Cb>
+poly_int<N, Ca> exact_div(poly_int_pod<N, Ca>, Cb) {
+  poly_int<N, Ca> r;
+  return r;
+}
+struct vec_prefix {
+  unsigned m_num;
+};
+struct vl_ptr;
+struct va_heap {
+  typedef vl_ptr default_layout;
+};
+template <typename T, typename A = va_heap, typename = typename A::default_layout>
+struct vec;
+template <typename T, typename A> struct vec<T, A, int> {
+  T &operator[](unsigned);
+  vec_prefix m_vecpfx;
+  T m_vecdata[];
+};
+template <typename T, typename A> T &vec<T, A, int>::operator[](unsigned ix) {
+  m_vecpfx.m_num ? fancy_abort("", 9, __FUNCTION__), 0 : 0;
+  return m_vecdata[ix];
+}
+template <typename T> struct vec<T, va_heap, vl_ptr> {
+  T &operator[](unsigned ix) { return m_vec[ix]; }
+  vec<T, va_heap, int> m_vec;
+};
+class auto_vec : public vec<poly_int<2, long>, va_heap> {};
+template <typename T> class vector_builder : public auto_vec {};
+class int_vector_builder : public vector_builder<int> {
+public:
+  int_vector_builder(poly_int<2, long>, int, int);
+};
+bool vect_grouped_store_supported() {
+  int i;
+  poly_int<2, long> nelt;
+  int_vector_builder sel(nelt, 2, 3);
+  for (i = 0; i < 6; i++)
+    sel[i] += exact_div(nelt, 2);
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/g++.dg/vect/vect-early-break_3.cc b/gcc/testsuite/g++.dg/vect/vect-early-break_3.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a12e5ca434b2ac37c03dbaa12273fd8e5aa2018c
--- /dev/null
+++ b/gcc/testsuite/g++.dg/vect/vect-early-break_3.cc
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-additional-options "-w -O2" } */
+
+int aarch64_advsimd_valid_immediate_hs_val32;
+bool aarch64_advsimd_valid_immediate_hs() {
+  for (int shift = 0; shift < 32; shift += 8)
+    if (aarch64_advsimd_valid_immediate_hs_val32 & shift)
+      return aarch64_advsimd_valid_immediate_hs_val32;
+  for (;;)
+    ;
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break-run_1.c b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..2495b36a72eae94cb7abc4a0d17a5c979fd78083
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_1.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-additional-options "-Ofast -save-temps" } */
+
+#define N 803
+#define P 0
+#include "vect-early-break-template_1.c"
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break-run_10.c b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_10.c
new file mode 100644
index 0000000000000000000000000000000000000000..9bcd7f7e57ef9a1d4649d18569b3406050e54603
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_10.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-additional-options "-Ofast -save-temps" } */
+
+#define N 800
+#define P 799
+#include "vect-early-break-template_2.c"
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break-run_2.c b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..63f63101a467909f328be7f3acbc5bcb721967ff
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_2.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-additional-options "-Ofast -save-temps" } */
+
+#define N 803
+#define P 802
+#include "vect-early-break-template_1.c"
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break-run_3.c b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..626b95e9b8517081d41d794e9e0264d6301c8589
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_3.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-additional-options "-Ofast -save-temps" } */
+
+#define N 803
+#define P 5
+#include "vect-early-break-template_1.c"
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break-run_4.c b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_4.c
new file mode 100644
index 0000000000000000000000000000000000000000..7e0e6426120551152a7bd800c15d9ed6ab15bada
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_4.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-additional-options "-Ofast -save-temps" } */
+
+#define N 803
+#define P 278
+#include "vect-early-break-template_1.c"
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break-run_5.c b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_5.c
new file mode 100644
index 0000000000000000000000000000000000000000..242cf486f9c40055df0aef5fd238d1aff7a7c7da
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_5.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-additional-options "-Ofast -save-temps" } */
+
+#define N 800
+#define P 799
+#include "vect-early-break-template_1.c"
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break-run_6.c b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_6.c
new file mode 100644
index 0000000000000000000000000000000000000000..9fe7136b7213a463ca6573c60476b7c8f531ddcb
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_6.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-additional-options "-Ofast -save-temps" } */
+
+#define N 803
+#define P 0
+#include "vect-early-break-template_2.c"
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break-run_7.c b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_7.c
new file mode 100644
index 0000000000000000000000000000000000000000..02f93d77dba31b938f6fd9e8c7f5e4acde4aeec9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_7.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-additional-options "-Ofast -save-temps" } */
+
+#define N 803
+#define P 802
+#include "vect-early-break-template_2.c"
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break-run_8.c b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_8.c
new file mode 100644
index 0000000000000000000000000000000000000000..a614925465606b54c638221ffb95a5e8d3bee797
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_8.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-additional-options "-Ofast -save-temps" } */
+
+#define N 803
+#define P 5
+#include "vect-early-break-template_2.c"
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break-run_9.c b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_9.c
new file mode 100644
index 0000000000000000000000000000000000000000..94e2b9c301456eda8f9ad7eaa67604563f0afee7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_9.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-additional-options "-Ofast -save-temps" } */
+
+#define N 803
+#define P 278
+#include "vect-early-break-template_2.c"
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break-template_1.c b/gcc/testsuite/gcc.dg/vect/vect-early-break-template_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..af70a8e2a5a9dc9756edb5580f2de02ddcc95de9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break-template_1.c
@@ -0,0 +1,47 @@
+#ifndef N
+#define N 803
+#endif
+
+#ifndef P
+#define P 0
+#endif
+
+unsigned vect_a[N] = {0};
+unsigned vect_b[N] = {0};
+
+__attribute__((noipa, noinline))
+unsigned test4(unsigned x)
+{
+ unsigned ret = 0;
+ for (int i = 0; i < N; i++)
+ {
+   vect_b[i] = x + i;
+   if (vect_a[i] > x)
+     break;
+   vect_a[i] = x;
+
+ }
+ return ret;
+}
+
+extern void abort ();
+
+int main ()
+{
+
+  int x = 1;
+  int idx = P;
+  vect_a[idx] = x + 1;
+
+  test4(x);
+
+  if (vect_b[idx] != (x + idx))
+    abort ();
+
+  if (vect_a[idx] != x + 1)
+    abort ();
+
+  if (idx > 0 && vect_a[idx-1] != x)
+    abort ();
+
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break-template_2.c b/gcc/testsuite/gcc.dg/vect/vect-early-break-template_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..d0f924d904437e71567d27cc1f1089e5607dca0d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break-template_2.c
@@ -0,0 +1,50 @@
+#ifndef N
+#define N 803
+#endif
+
+#ifndef P
+#define P 0
+#endif
+
+unsigned vect_a[N] = {0};
+unsigned vect_b[N] = {0};
+
+__attribute__((noipa, noinline))
+unsigned test4(unsigned x)
+{
+ unsigned ret = 0;
+ for (int i = 0; i < N; i++)
+ {
+   vect_b[i] = x + i;
+   if (vect_a[i] > x)
+     return i;
+   vect_a[i] = x;
+
+ }
+ return ret;
+}
+
+extern void abort ();
+
+int main ()
+{
+
+  int x = 1;
+  int idx = P;
+  vect_a[idx] = x + 1;
+
+  unsigned res = test4(x);
+
+  if (res != idx)
+    abort ();
+
+  if (vect_b[idx] != (x + idx))
+    abort ();
+
+  if (vect_a[idx] != x + 1)
+    abort ();
+
+  if (idx > 0 && vect_a[idx-1] != x)
+    abort ();
+
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_1.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..51e7d6489b99c25b9b4b3d1c839f98562b6d4dd7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_1.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-additional-options "-Ofast" } */
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+
+#ifndef N
+#define N 803
+#endif
+unsigned vect_a[N];
+unsigned vect_b[N];
+
+unsigned test4(unsigned x)
+{
+ unsigned ret = 0;
+ for (int i = 0; i < N; i++)
+ {
+   vect_b[i] = x + i;
+   if (vect_a[i] > x)
+     break;
+   vect_a[i] = x;
+
+ }
+ return ret;
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_10.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_10.c
new file mode 100644
index 0000000000000000000000000000000000000000..9e4ad1763202dfdab3ed7961ead5114fcc61a11b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_10.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-additional-options "-Ofast" } */
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+
+#ifndef N
+#define N 803
+#endif
+unsigned vect_a[N];
+unsigned vect_b[N];
+
+unsigned test4(unsigned x, int y, int z)
+{
+ unsigned ret = 0;
+ for (int i = 0; i < N; i++)
+ {
+   vect_b[i] = x + i;
+   if (vect_a[i] > x)
+     break;
+   vect_a[i] = x;
+ }
+
+ ret = x + y * z;
+ return ret;
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_11.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_11.c
new file mode 100644
index 0000000000000000000000000000000000000000..a613dd9909fb09278dd92a81a24ef854994a9890
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_11.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-additional-options "-Ofast" } */
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+
+#ifndef N
+#define N 803
+#endif
+unsigned vect_a[N];
+unsigned vect_b[N];
+
+unsigned test4(unsigned x, int y)
+{
+ unsigned ret = 0;
+for (int o = 0; o < y; o++)
+{
+ ret += o;
+ for (int i = 0; i < N; i++)
+ {
+   vect_b[i] = x + i;
+   if (vect_a[i] > x)
+     break;
+   vect_a[i] = x;
+
+ }
+}
+ return ret;
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_12.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_12.c
new file mode 100644
index 0000000000000000000000000000000000000000..cc10f3238f1cb8e1307e024a3ebcb5c25a39d1b2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_12.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-additional-options "-Ofast" } */
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+
+#ifndef N
+#define N 803
+#endif
+unsigned vect_a[N];
+unsigned vect_b[N];
+
+unsigned test4(unsigned x, int y)
+{
+ unsigned ret = 0;
+for (int o = 0; o < y; o++)
+{
+ ret += o;
+ for (int i = 0; i < N; i++)
+ {
+   vect_b[i] = x + i;
+   if (vect_a[i] > x)
+     return vect_a[i];
+   vect_a[i] = x;
+
+ }
+}
+ return ret;
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_13.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_13.c
new file mode 100644
index 0000000000000000000000000000000000000000..6967b7395ed7c19e38a436d6edcfe7c1580c7113
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_13.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-additional-options "-Ofast" } */
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+
+#ifndef N
+#define N 803
+#endif
+unsigned vect_a[N];
+unsigned vect_b[N];
+
+unsigned test4(unsigned x)
+{
+ unsigned ret = 0;
+ for (int i = 0; i < N; i++)
+ {
+   vect_b[i] = x + i;
+   if (vect_a[i] > x)
+     return vect_a[i] * x;
+   vect_a[i] = x;
+
+ }
+ return ret;
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_14.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_14.c
new file mode 100644
index 0000000000000000000000000000000000000000..03cce5cf6cadecb520b46be666bf608e3bc6a511
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_14.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-additional-options "-Ofast" } */
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+
+#define N 803
+unsigned vect_a[N];
+unsigned vect_b[N];
+
+int test4(unsigned x)
+{
+ unsigned ret = 0;
+ for (int i = 0; i < N; i++)
+ {
+   vect_b[i] = x + i;
+   if (vect_a[i] > x)
+     return i;
+   vect_a[i] += x * vect_b[i];
+
+ }
+ return ret;
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_15.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_15.c
new file mode 100644
index 0000000000000000000000000000000000000000..dec6872e1115ff66695f5a500ffa7ca01c0f8d3a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_15.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-additional-options "-Ofast" } */
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+
+#define N 803
+unsigned vect_a[N];
+unsigned vect_b[N];
+
+int test4(unsigned x)
+{
+ int ret = 0;
+ for (int i = 0; i < N; i++)
+ {
+   vect_b[i] = x + i;
+   if (vect_a[i] > x)
+     return i;
+   vect_a[i] += x * vect_b[i];
+
+ }
+ return ret;
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..7268f6ae2485d0274fd85ea53cc1e44ef4b84d5c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-additional-options "-Ofast" } */
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+
+#include <complex.h>
+
+#define N 1024
+complex double vect_a[N];
+complex double vect_b[N];
+
+complex double test4(complex double x)
+{
+ complex double ret = 0;
+ for (int i = 0; i < N; i++)
+ {
+   vect_b[i] += x + i;
+   if (vect_a[i] == x)
+     return i;
+   vect_a[i] += x * vect_b[i];
+
+ }
+ return ret;
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_3.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..3c6d28bd2d6e6e794146baf89e43c3b70293b7d9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_3.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-additional-options "-Ofast" } */
+
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
+
+unsigned test4(char x, char *vect, int n)
+{
+ unsigned ret = 0;
+ for (int i = 0; i < n; i++)
+ {
+   if (vect[i] > x)
+     return 1;
+
+   vect[i] = x;
+ }
+ return ret;
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_4.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_4.c
new file mode 100644
index 0000000000000000000000000000000000000000..216c56faf330449bf1969b7e51ff1e94270dc861
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_4.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-additional-options "-Ofast" } */
+
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
+
+#define N 1024
+unsigned vect[N];
+
+unsigned test4(unsigned x)
+{
+ unsigned ret = 0;
+ for (int i = 0; i < N; i++)
+ {
+   if (i > 16 && vect[i] > x)
+     break;
+
+   vect[i] = x;
+ }
+ return ret;
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_5.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_5.c
new file mode 100644
index 0000000000000000000000000000000000000000..4a36d6979db1fd1f97ba2a290f78ac3b84f6de24
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_5.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-additional-options "-Ofast" } */
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+
+#define N 1024
+unsigned vect_a[N];
+unsigned vect_b[N];
+
+unsigned test4(unsigned x)
+{
+ unsigned ret = 0;
+ for (int i = 0; i < N; i++)
+ {
+   vect_b[i] = x + i;
+   if (vect_a[i] > x)
+     return vect_a[i];
+   vect_a[i] = x;
+ }
+ return ret;
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_6.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_6.c
new file mode 100644
index 0000000000000000000000000000000000000000..09632d9afda7e07f1a8417514ef77356f00045bd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_6.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-additional-options "-Ofast" } */
+
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
+
+#define N 1024
+unsigned vect_a[N];
+unsigned vect_b[N];
+
+unsigned test4(unsigned x)
+{
+ unsigned ret = 0;
+ for (int i = 0; i < (N/2); i+=2)
+ {
+   vect_b[i] = x + i;
+   vect_b[i+1] = x + i+1;
+   if (vect_a[i] > x || vect_a[i+1] > x)
+     break;
+   vect_a[i] += x * vect_b[i];
+   vect_a[i+1] += x * vect_b[i+1];
+ }
+ return ret;
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
new file mode 100644
index 0000000000000000000000000000000000000000..10fd8b42952c42f3d3a014da103931ca394423d5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-additional-options "-Ofast" } */
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+
+#include <complex.h>
+
+#define N 1024
+complex double vect_a[N];
+complex double vect_b[N];
+
+complex double test4(complex double x)
+{
+ complex double ret = 0;
+ for (int i = 0; i < N; i++)
+ {
+   vect_b[i] += x + i;
+   if (vect_a[i] == x)
+     break;
+   vect_a[i] += x * vect_b[i];
+
+ }
+ return ret;
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_8.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_8.c
new file mode 100644
index 0000000000000000000000000000000000000000..ae706b2952cfcecf20546a67a735b8d902cbb607
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_8.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-additional-options "-Ofast" } */
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+
+#include <complex.h>
+
+#define N 1024
+char vect_a[N];
+char vect_b[N];
+
+char test4(char x, char * restrict res)
+{
+ char ret = 0;
+ for (int i = 0; i < N; i++)
+ {
+   vect_b[i] += x + i;
+   if (vect_a[i] > x)
+     break;
+   vect_a[i] += x * vect_b[i];
+   res[i] *= vect_b[i];
+ }
+ return ret;
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_9.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_9.c
new file mode 100644
index 0000000000000000000000000000000000000000..350f02f3c7caef457adbe1be802bba51cd818393
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_9.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-additional-options "-Ofast" } */
+
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
+
+#ifndef N
+#define N 803
+#endif
+unsigned vect_a[N];
+unsigned vect_b[N];
+
+unsigned test4(unsigned x)
+{
+ unsigned ret = 0;
+ for (int i = 0; i < N; i++)
+ {
+   vect_a[i] = x + i;
+   if (vect_a[i] > x)
+     break;
+   vect_a[i] = x;
+
+ }
+ return ret;
+}
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index ccbbee847f755d6f30116d5b38e4027a998b48fd..5cbf54bd2a23dfdc5dc7b148b0dc6ed4c63814ae 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -3645,6 +3645,18 @@ proc check_effective_target_vect_int { } {
	}}]
 }
 
+# Return 1 if the target supports hardware vectorization of early breaks,
+# 0 otherwise.
+#
+# This won't change for different subtargets so cache the result.
+
+proc check_effective_target_vect_early_break { } {
+    return [check_cached_effective_target_indexed vect_early_break {
+      expr {
+	([istarget aarch64*-*-*]
+	 && [check_effective_target_aarch64_sve])
+	}}]
+}
 # Return 1 if the target supports hardware vectorization of complex additions of
 # byte, 0 otherwise.
 #
diff --git a/gcc/tree-scalar-evolution.h b/gcc/tree-scalar-evolution.h
index 0f90207bc733db3cf85979d9b0b962aefa0831d6..5af7d2bba0d62195704a8d41ef6e600327169770 100644
--- a/gcc/tree-scalar-evolution.h
+++ b/gcc/tree-scalar-evolution.h
@@ -23,6 +23,7 @@ along with GCC; see the file COPYING3.  If not see
 
 extern tree number_of_latch_executions (class loop *);
 extern gcond *get_loop_exit_condition (const class loop *);
+extern gcond *get_edge_condition (edge);
 extern void scev_initialize (void);
 extern bool scev_initialized_p (void);
diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc
index 7e2a3e986619de87e4ae9daf16198be1f13b917c..3012871dd7f9a7d1897f96a29b1b0b28d90cb63f 100644
--- a/gcc/tree-scalar-evolution.cc
+++ b/gcc/tree-scalar-evolution.cc
@@ -884,7 +884,7 @@ scev_dfs::add_to_evolution (tree chrec_before, enum tree_code code,
 
   return res;
 }
- 
+
 /* Follow the ssa edge into the binary expression RHS0 CODE RHS1.
    Return true if the strongly connected component has been found.  */
 
@@ -1295,8 +1295,15 @@ tail_recurse:
 gcond *
 get_loop_exit_condition (const class loop *loop)
 {
+  return get_edge_condition (normal_exit (loop));
+}
+
+/* If the statement just before EXIT_EDGE contains a condition then
+   return the condition, otherwise return NULL.  */
+
+gcond *
+get_edge_condition (edge exit_edge)
+{
   gcond *res = NULL;
-  edge exit_edge = single_exit (loop);
 
   if (dump_file && (dump_flags & TDF_SCEV))
     fprintf (dump_file, "(get_loop_exit_condition \n  ");
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index 4a23d6172aaa12ad7049dc626e5c4afbd5ca3f74..02e373171675432cd32c4a72440eebdff988bdcf 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -2072,7 +2072,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
   /* Check if we can possibly peel the loop.  */
   if (!vect_can_advance_ivs_p (loop_vinfo)
-      || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
+      || !slpeel_can_duplicate_loop_p (loop_vinfo, normal_exit (loop))
       || loop->inner)
     do_peeling = false;
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 1d96130c985e2defd141cfdf602224c73b4b41f2..0b2a4920754d83aeb3795b435693d61adcfe92b6 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -770,7 +770,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
   add_header_seq (loop, header_seq);
 
   /* Get a boolean result that tells us whether to iterate.  */
-  edge exit_edge = single_exit (loop);
+  edge exit_edge = normal_exit (loop);
   tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
   tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
   gcond *cond_stmt = gimple_build_cond (code, test_ctrl, zero_ctrl,
@@ -789,7 +789,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
   if (final_iv)
     {
       gassign *assign = gimple_build_assign (final_iv, orig_niters);
-      gsi_insert_on_edge_immediate (single_exit (loop), assign);
+      gsi_insert_on_edge_immediate (exit_edge, assign);
     }
 
   return cond_stmt;
@@ -799,7 +799,8 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
    loop handles exactly VF scalars per iteration.  */
 
 static gcond *
-vect_set_loop_condition_normal (class loop *loop, tree niters, tree step,
+vect_set_loop_condition_normal (loop_vec_info loop_vinfo,
+				class loop *loop, tree niters, tree step,
				tree final_iv, bool niters_maybe_zero,
				gimple_stmt_iterator loop_cond_gsi)
 {
@@ -807,7 +808,7 @@ vect_set_loop_condition_normal (class loop *loop, tree niters, tree step,
   gcond *cond_stmt;
   gcond *orig_cond;
   edge pe = loop_preheader_edge (loop);
-  edge exit_edge = single_exit (loop);
+  edge exit_edge = normal_exit (loop);
   gimple_stmt_iterator incr_gsi;
   bool insert_after;
   enum tree_code code;
@@ -872,7 +873,11 @@ vect_set_loop_condition_normal (class loop *loop, tree niters, tree step,
	 In both cases the loop limit is NITERS - STEP.  */
       gimple_seq seq = NULL;
       limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
-      limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
+      /* For VLA, leave limit == niters.  Though I wonder if maybe I should
+	 force partial loops here and use vect_set_loop_condition_partial_vectors
+	 instead.  The problem is that the VL check is useless here.  */
+      if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo) && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+	limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
       if (seq)
	{
	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
@@ -907,7 +912,8 @@ vect_set_loop_condition_normal (class loop *loop, tree niters, tree step,
   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
 
   /* Record the number of latch iterations.  */
-  if (limit == niters)
+  if (limit == niters
+      || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
     /* Case A: the loop iterates NITERS times.  Subtract one to get the
        latch count.  */
     loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
@@ -918,10 +924,17 @@ vect_set_loop_condition_normal (class loop *loop, tree niters, tree step,
     loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
				       limit, step);
 
-  if (final_iv)
+  auto_vec<edge> exits = get_loop_exit_edges (loop);
+  /* For multiple exits we've already maintained LCSSA form and handled
+     the scalar iteration update in the code that deals with the merge
+     block and its updated guard.  I could move that code here instead
+     of in vect_update_ivs_after_early_break but I still have to deal
+     with the updates to the counter `i`.  So for now I'll keep them
+     together.  */
+  if (final_iv && exits.length () == 1)
     {
       gassign *assign;
-      edge exit = single_exit (loop);
+      edge exit = normal_exit (loop);
       gcc_assert (single_pred_p (exit->dest));
       tree phi_dest
	= integer_zerop (init) ? final_iv : copy_ssa_name (indx_after_incr);
@@ -972,13 +985,15 @@ vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
   gcond *orig_cond = get_loop_exit_condition (loop);
   gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
 
-  if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
+  if (loop_vinfo
+      && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
     cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_vinfo,
							 niters, final_iv,
							 niters_maybe_zero,
							 loop_cond_gsi);
   else
-    cond_stmt = vect_set_loop_condition_normal (loop, niters, step, final_iv,
+    cond_stmt = vect_set_loop_condition_normal (loop_vinfo, loop, niters,
+						step, final_iv,
						niters_maybe_zero,
						loop_cond_gsi);
@@ -1066,7 +1081,7 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
   edge exit, new_exit;
   bool duplicate_outer_loop = false;
 
-  exit = single_exit (loop);
+  exit = normal_exit (loop);
   at_exit = (e == exit);
   if (!at_exit && e != loop_preheader_edge (loop))
     return NULL;
@@ -1104,11 +1119,11 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
       bbs[0] = preheader;
       new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
 
-      exit = single_exit (scalar_loop);
+      exit = normal_exit (scalar_loop);
       copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
		&exit, 1, &new_exit, NULL,
		at_exit ? loop->latch : e->src, true);
-      exit = single_exit (loop);
+      exit = normal_exit (loop);
       basic_block new_preheader = new_bbs[0];
 
       /* Before installing PHI arguments make sure that the edges
@@ -1176,11 +1191,53 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
	      add_phi_arg (phi, orig_arg, new_exit, orig_locus);
	    }
	}
+
+      /* If we have multiple exits, we now need to point the additional exits
+	 from the old loop to the loop pre-header of the new copied loop.
+	 Currently we only support simple early break vectorization so all
+	 additional exits must exit the loop.  Additionally we can only place
+	 copies at the end, i.e. we cannot do prologue peeling.  */
+      auto_vec<edge> exits = get_loop_exit_edges (loop);
+      bool multiple_exits_p = exits.length () > 1;
+
+      /* Check to see if all of the exits point to the loop header.  If they
+	 don't then we have an intermediate BB that's no longer useful after
+	 the copy and we should remove it.  */
+      bool imm_exit = true;
+      for (auto exit : exits)
+	{
+	  imm_exit = imm_exit && exit->dest == loop->header;
+	  if (!imm_exit)
+	    break;
+	}
+
+      for (unsigned i = 1; i < exits.length (); i++)
+	{
+	  redirect_edge_and_branch (exits[i], new_preheader);
+	  flush_pending_stmts (exits[i]);
+	}
+
+      /* Main exit must be the last to be rewritten as it's the first phi node
+	 entry.  The rest are in array order.  */
       redirect_edge_and_branch_force (e, new_preheader);
       flush_pending_stmts (e);
-      set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
+
+      /* Only update the dominators of the new_preheader to the old exit if
+	 we have effectively a single exit.  */
+      if (!multiple_exits_p
+	  || exits[1]->src != EDGE_PRED (exits[0]->src, 0)->src)
+	set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
+      else
+	set_immediate_dominator (CDI_DOMINATORS, new_preheader,
+				 exits[1]->src);
+
+      auto_vec<edge> new_exits = get_loop_exit_edges (new_loop);
       if (was_imm_dom || duplicate_outer_loop)
-	set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
+	{
+	  if (!multiple_exits_p)
+	    set_immediate_dominator (CDI_DOMINATORS, exit_dest,
+				     new_exit->src);
+	  else
+	    set_immediate_dominator (CDI_DOMINATORS, exit_dest,
+				     new_exits[1]->src);
+	}
 
       /* And remove the non-necessary forwarder again.  Keep the other
	 one so we have a proper pre-header for the loop at the exit edge.  */
@@ -1189,6 +1246,39 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
       delete_basic_block (preheader);
       set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
			       loop_preheader_edge (scalar_loop)->src);
+
+      /* Finally, after wiring the new epilogue we need to update its main
+	 exit to the original function exit we recorded.  Other exits are
+	 already correct.  */
+      if (!imm_exit && multiple_exits_p)
+	{
+	  /* For now we expect at most a single successor here, but we might
+	     be able to extend this to multiple.  */
+	  if (single_succ_p (new_exit->dest) && single_pred_p (new_exit->dest))
+	    {
+	      edge exit_edge = single_succ_edge (new_exit->dest);
+	      /* Now correct the dominators that were messed up during the
+		 copying as the CFG was tweaked a bit.  */
+	      /* The main exit is now dominated by a new fall through edge.  */
+	      set_immediate_dominator (CDI_DOMINATORS, exit_edge->src,
+				       new_exits[0]->src);
+	      /* If this is a fall through edge then don't update doms.  */
+	      if (!empty_block_p (exit_edge->src))
+		set_immediate_dominator (CDI_DOMINATORS, exit_edge->dest,
+					 new_exits[1]->src);
+	    }
+
+	  /* The exits from the BB with the early exit dominate the new
+	     function exit edge and also the second part of the loop.  The
+	     edges were copied correctly but the doms are wrong because
+	     during the copying some of the intermediate edges are
+	     rewritten.  */
+	  set_immediate_dominator (CDI_DOMINATORS, new_exits[0]->src,
+				   new_exits[1]->src);
+	  set_immediate_dominator (CDI_DOMINATORS, new_exits[0]->dest,
+				   new_exits[0]->src);
+	  set_immediate_dominator (CDI_DOMINATORS, new_exits[1]->dest,
+				   new_exits[1]->src);
+	}
     }
   else /* Add the copy at entry.  */
     {
@@ -1310,20 +1400,24 @@ slpeel_add_loop_guard (basic_block guard_bb, tree cond,
  */
 
 bool
-slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
+slpeel_can_duplicate_loop_p (const loop_vec_info loop_vinfo, const_edge e)
 {
-  edge exit_e = single_exit (loop);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  edge exit_e = normal_exit (loop);
   edge entry_e = loop_preheader_edge (loop);
   gcond *orig_cond = get_loop_exit_condition (loop);
   gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
   unsigned int num_bb = loop->inner? 5 : 2;
 
+  if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+    num_bb += 1;
+
   /* All loops have an outer scope; the only case loop->outer is NULL is for
      the function itself.  */
   if (!loop_outer (loop)
       || loop->num_nodes != num_bb
       || !empty_block_p (loop->latch)
-      || !single_exit (loop)
+      || (!single_exit (loop) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
      /* Verify that new loop exit condition can be trivially modified.  */
       || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
       || (e != exit_e && e != entry_e))
@@ -1528,6 +1622,12 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
   gphi_iterator gsi, gsi1;
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   basic_block update_bb = update_e->dest;
+
+  /* For early exits we'll update the IVs in
+     vect_update_ivs_after_early_break.  */
+  if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+    return;
+
   basic_block exit_bb = single_exit (loop)->dest;
 
   /* Make sure there exists a single-predecessor exit bb:  */
@@ -1613,6 +1713,186 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
       /* Fix phi expressions in the successor bb.  */
       adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
     }
+  return;
+}
+
+/* Function vect_update_ivs_after_early_break.
+
+   "Advance" the induction variables of LOOP to the value they should take
+   after the execution of LOOP.  This is currently necessary because the
+   vectorizer does not handle induction variables that are used after the
+   loop.  Such a situation occurs when the last iterations of LOOP are
+   peeled, because of the early exit.  With an early exit we always peel
+   the loop.
+
+   Input:
+   - LOOP_VINFO - a loop info structure for the loop that is going to be
+		  vectorized.  The last few iterations of LOOP were peeled.
+   - LOOP - a loop that is going to be vectorized.  The last few iterations
+	    of LOOP were peeled.
+   - VF - The loop vectorization factor.
+   - NITERS_ORIG - the number of iterations that LOOP executes (before it is
+		   vectorized), i.e., the number of times the ivs should be
+		   bumped.
+   - NITERS_VECTOR - The number of iterations that the vector LOOP executes.
+   - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
+		coming out from LOOP on which there are uses of the LOOP ivs
+		(this is the path from LOOP->exit to epilog_loop->preheader).
+
+		The new definitions of the ivs are placed in LOOP->exit.
+		The phi args associated with the edge UPDATE_E in the bb
+		UPDATE_E->dest are updated accordingly.
+
+   Output:
+       - If available, the LCSSA phi node for the loop IV temp.
+
+   Assumption 1: Like the rest of the vectorizer, this function assumes
+   a single loop exit that has a single predecessor.
+
+   Assumption 2: The phi nodes in the LOOP header and in update_bb are
+   organized in the same order.
+
+   Assumption 3: The access function of the ivs is simple enough (see
+   vect_can_advance_ivs_p).  This assumption will be relaxed in the future.
+
+   Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
+   coming out of LOOP on which the ivs of LOOP are used (this is the path
+   that leads to the epilog loop; other paths skip the epilog loop).  This
+   path starts with the edge UPDATE_E, and its destination (denoted
+   update_bb) needs to have its phis updated.  */
+
+static tree
+vect_update_ivs_after_early_break (loop_vec_info loop_vinfo, class loop *,
+				   poly_int64 vf, tree niters_orig,
+				   tree niters_vector, edge update_e)
+{
+  gphi_iterator gsi, gsi1;
+  tree ni_name, ivtmp = NULL;
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  basic_block update_bb = update_e->dest;
+  auto_vec<edge> exits = get_loop_exit_edges (loop);
+
+  basic_block exit_bb = exits[0]->dest;
+
+  if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+    return NULL;
+
+  for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
+       !gsi_end_p (gsi) && !gsi_end_p (gsi1);
+       gsi_next (&gsi), gsi_next (&gsi1))
+    {
+      tree init_expr;
+      tree step_expr;
+      tree type;
+      tree var, ni;
+      gimple_stmt_iterator last_gsi;
+
+      gphi *phi = gsi1.phi ();
+      tree phi_ssa = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
+      gphi *phi1 = as_a <gphi *> (SSA_NAME_DEF_STMT (phi_ssa));
+      stmt_vec_info phi_info = loop_vinfo->lookup_stmt (gsi.phi ());
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "vect_update_ivs_after_early_break: phi: %G",
+			 (gimple *)phi);
+
+      /* Skip reduction and virtual phis.  */
+      if (!iv_phi_p (phi_info))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "reduc or virtual phi. skip.\n");
+	  continue;
+	}
+
+      /* For multiple exits where we handle early exits we need to carry on
+	 with the previous IV as the loop iteration was not completed because
+	 we exited early.  As such just grab the original IV.  */
+      if (STMT_VINFO_TYPE (phi_info) != undef_vec_info_type)
+	{
+	  type = TREE_TYPE (gimple_phi_result (phi));
+	  step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
+	  step_expr = unshare_expr (step_expr);
+
+	  /* We previously generated the new merged phi in the same BB as
+	     the guard.  So use that to perform the scaling on rather than
+	     the normal loop phi, which doesn't take the early breaks into
+	     account.  */
+	  init_expr = gimple_phi_result (phi1);
+
+	  ni = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
+			    fold_convert (TREE_TYPE (step_expr), init_expr),
+			    build_int_cst (TREE_TYPE (step_expr), vf));
+
+	  var = create_tmp_var (type, "tmp");
+
+	  last_gsi = gsi_last_bb (exit_bb);
+	  gimple_seq new_stmts = NULL;
+	  ni_name = force_gimple_operand (ni, &new_stmts, false, var);
+	  /* Exit_bb shouldn't be empty.  */
+	  if (!gsi_end_p (last_gsi))
+	    gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
+	  else
+	    gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
+
+	  /* Fix phi expressions in the successor bb.  */
+	  adjust_phi_and_debug_stmts (phi, update_e, ni_name);
+	}
+      else if (STMT_VINFO_TYPE (phi_info) == undef_vec_info_type)
+	{
+	  type = TREE_TYPE (gimple_phi_result (phi));
+	  step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
+	  step_expr = unshare_expr (step_expr);
+
+	  /* We previously generated the new merged phi in the same BB as
+	     the guard.  So use that to perform the scaling on rather than
+	     the normal loop phi, which doesn't take the early breaks into
+	     account.  */
+	  init_expr = PHI_ARG_DEF_FROM_EDGE (phi1, loop_preheader_edge (loop));
+
+	  if (vf.is_constant ())
+	    {
+	      ni = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
+				fold_convert (TREE_TYPE (step_expr),
					      niters_vector),
+				build_int_cst (TREE_TYPE (step_expr), vf));
+
+	      ni = fold_build2 (MINUS_EXPR, TREE_TYPE (step_expr),
+				fold_convert (TREE_TYPE (step_expr),
					      niters_orig),
+				fold_convert (TREE_TYPE (step_expr), ni));
+	    }
+	  else
+	    /* If the loop's VF isn't constant then the loop must have been
+	       masked, so at the end of the loop we know we have finished
+	       the entire loop and found nothing.  */
+	    ni = build_zero_cst (TREE_TYPE (step_expr));
+
+	  gcc_assert (TREE_CODE (ni) == INTEGER_CST);
+
+	  var = create_tmp_var (type, "tmp");
+
+	  last_gsi = gsi_last_bb (exit_bb);
+	  gimple_seq new_stmts = NULL;
+	  ni_name = force_gimple_operand (ni, &new_stmts, false, var);
+	  /* Exit_bb shouldn't be empty.  */
+	  if (!gsi_end_p (last_gsi))
+	    gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
+	  else
+	    gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
+
+	  adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
+
+	  for (unsigned i = 1; i < exits.length (); i++)
+	    adjust_phi_and_debug_stmts (phi1, exits[i],
+					build_int_cst (TREE_TYPE (step_expr),
						       vf));
+	  ivtmp = gimple_phi_result (phi1);
+	}
+      else
+	continue;
+    }
+
+  return ivtmp;
 }
 
 /* Return a gimple value containing the misalignment (measured in vector
@@ -2096,7 +2376,7 @@ vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   tree type = TREE_TYPE (niters_vector);
   tree log_vf = build_int_cst (type, exact_log2 (vf));
-  basic_block exit_bb = single_exit (loop)->dest;
+  basic_block exit_bb = normal_exit (loop)->dest;
 
   gcc_assert (niters_vector_mult_vf_ptr != NULL);
   tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
@@ -2123,19 +2403,46 @@ find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
		gphi *lcssa_phi)
 {
   gphi_iterator gsi;
-  edge e = single_exit (loop);
+  edge e = normal_exit (loop);
 
-  gcc_assert (single_pred_p (e->dest));
   for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
     {
       gphi *phi = gsi.phi ();
-      if (operand_equal_p (PHI_ARG_DEF (phi, 0),
-			   PHI_ARG_DEF (lcssa_phi, 0), 0))
+      /* Nested loops with multiple exits can have a different number of phi
+	 node arguments between the main loop and the epilog, as the epilog
+	 falls through to the second loop.  */
+      if (gimple_phi_num_args (phi) > e->dest_idx
+	  && operand_equal_p (PHI_ARG_DEF (phi, e->dest_idx),
+			      PHI_ARG_DEF (lcssa_phi, 0), 0))
	return PHI_RESULT (phi);
     }
   return NULL_TREE;
 }
 
+/* Starting from the edge E, walk the statements of its source block
+   backwards and find the last VUSE/VDEF in the basic block.  */
+
+static tree
+gimple_find_last_mem_use (edge e)
+{
+  basic_block bb = e->src;
+  tree res = NULL;
+  gimple_stmt_iterator iter = gsi_last_bb (bb);
+  do
+    {
+      gimple *stmt = gsi_stmt (iter);
+      if ((res = gimple_vdef (stmt)))
+	return res;
+
+      if ((res = gimple_vuse (stmt)))
+	return res;
+
+      gsi_prev (&iter);
+    } while (!gsi_end_p (iter));
+
+  return NULL;
+}
+
 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
    from SECOND/FIRST and puts it at the original loop's preheader/exit
    edge, the two loops are arranged as below:
@@ -2185,6 +2492,7 @@ find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
 static void
 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
				   class loop *first, class loop *second,
+				   tree *lcssa_ivtmp,
				   bool create_lcssa_for_iv_phis)
 {
   gphi_iterator gsi_update, gsi_orig;
@@ -2192,10 +2500,18 @@ slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
 
   edge first_latch_e = EDGE_SUCC (first->latch, 0);
   edge second_preheader_e = loop_preheader_edge (second);
-  basic_block between_bb = single_exit (first)->dest;
+  auto_vec<edge> exits = get_loop_exit_edges (first);
+  basic_block between_bb = exits[0]->dest;
+
+  bool early_exit = LOOP_VINFO_EARLY_BREAKS (loop_vinfo);
+  /* For early exits when we create the merge BB we must maintain it in
+     LCSSA form, otherwise the final vectorizer passes will create the
+     wrong PHI nodes here.  */
+  create_lcssa_for_iv_phis = create_lcssa_for_iv_phis || early_exit;
 
   gcc_assert (between_bb == second_preheader_e->src);
-  gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
+  gcc_assert ((single_pred_p (between_bb) && single_succ_p (between_bb))
+	      || early_exit);
   /* Either the first loop or the second is the loop to be vectorized.  */
   gcc_assert (loop == first || loop == second);
@@ -2215,10 +2531,40 @@ slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
	{
	  tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
	  gphi *lcssa_phi = create_phi_node (new_res, between_bb);
-	  add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
+
+	  /* The first exit is always the loop latch, so handle that
+	     separately.  */
+	  gcc_assert (arg);
+	  add_phi_arg (lcssa_phi, arg, exits[0], UNKNOWN_LOCATION);
+
+	  /* The early exits are processed in order starting from exit 1.  */
+	  for (unsigned i = 1; i < exits.length (); i++)
+	    {
+	      tree phi_arg;
+	      if (iv_phi_p (vect_phi_info))
+		/* For induction values just copy the previous one as the
+		   current iteration did not finish.  We'll update as needed
+		   later on.  */
+		phi_arg = gimple_phi_result (orig_phi);
+	      else
+		phi_arg = gimple_find_last_mem_use (exits[i]);
+	      /* If we didn't find any just copy the existing one and leave
+		 it to the others to fix it up.  */
+	      if (!phi_arg)
+		phi_arg = gimple_phi_result (orig_phi);
+	      add_phi_arg (lcssa_phi, phi_arg, exits[i], UNKNOWN_LOCATION);
+	    }
	  arg = new_res;
	}
 
+      /* Normally we would be able to distinguish between the iteration
+	 counter and the IV temporaries by looking at the STMT_VINFO_TYPE
+	 of the phi node, however for some reason this isn't consistently
+	 set.  Is there a better way?  */
+      if (lcssa_ivtmp
+	  && iv_phi_p (vect_phi_info))
+	*lcssa_ivtmp = arg;
+
       /* Update PHI node in the second loop by replacing arg on the loop's
	 incoming edge.  */
       adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
@@ -2228,7 +2574,8 @@ slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
	 for correct vectorization of live stmts.  */
       if (loop == first)
	{
-	  basic_block orig_exit = single_exit (second)->dest;
+	  auto_vec<edge> new_exits = get_loop_exit_edges (second);
+	  basic_block orig_exit = new_exits[0]->dest;
	  for (gsi_orig = gsi_start_phis (orig_exit);
	       !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
	    {
@@ -2243,7 +2590,15 @@ slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
 
	      tree new_res = copy_ssa_name (orig_arg);
	      gphi *lcphi = create_phi_node (new_res, between_bb);
-	      add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
+	      /* The first exit is always the loop latch, so handle that
+		 separately.  */
+	      add_phi_arg (lcphi, orig_arg, new_exits[0], UNKNOWN_LOCATION);
+	      /* The early exits are processed in order starting from
+		 exit 1.  */
+	      for (unsigned i = 1; i < new_exits.length (); i++)
+		{
+		  tree phi_arg = gimple_phi_result (orig_phi);
+		  add_phi_arg (lcphi, phi_arg, exits[i], UNKNOWN_LOCATION);
+		}
	    }
	}
     }
@@ -2393,13 +2748,11 @@ slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
   gcc_assert (single_succ_p (merge_bb));
   edge e = single_succ_edge (merge_bb);
   basic_block exit_bb = e->dest;
-  gcc_assert (single_pred_p (exit_bb));
-  gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
 
   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
     {
       gphi *update_phi = gsi.phi ();
-      tree old_arg = PHI_ARG_DEF (update_phi, 0);
+      tree old_arg = PHI_ARG_DEF (update_phi, e->dest_idx);
 
       tree merge_arg = NULL_TREE;
@@ -2438,12 +2791,14 @@ static void
 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
 {
   gphi_iterator gsi;
-  basic_block exit_bb = single_exit (epilog)->dest;
+  auto_vec<edge> exits = get_loop_exit_edges (epilog);
 
-  gcc_assert (single_pred_p (exit_bb));
-  edge e = EDGE_PRED (exit_bb, 0);
-  for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
-    rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
+  for (unsigned i = 0; i < exits.length (); i++)
+    {
+      basic_block exit_bb = exits[i]->dest;
+      for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
+	rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), exits[i]));
+    }
 }
 
 /* EPILOGUE_VINFO is an epilogue loop that we now know would need to
@@ -2621,6 +2976,14 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
     bound_epilog += vf - 1;
   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
     bound_epilog += 1;
+  /* For early breaks the scalar loop needs to execute at most VF times
+     to find the element that caused the break.  */
+  if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+    {
+      bound_epilog = vf;
+      /* Force a scalar epilogue as we can't vectorize the index finding.  */
+      vect_epilogues = false;
+    }
   bool epilog_peeling = maybe_ne (bound_epilog, 0U);
   poly_uint64 bound_scalar = bound_epilog;
@@ -2780,16 +3143,24 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
				  bound_prolog + bound_epilog)
		      : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
			 || vect_epilogues));
+
+  /* We only support early break vectorization on known bounds at this time.
+     This means that if the vector loop can't be entered then we won't
+     generate it at all.  So for now force skip_vector off because the
+     additional control flow messes with the BB exits and we've already
+     analyzed them.  */
+  skip_vector = skip_vector && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo);
+
   /* Epilog loop must be executed if the number of iterations for epilog
      loop is known at compile time, otherwise we need to add a check at
     the end of vector loop and skip to the end of epilog loop.  */
*/ bool skip_epilog = (prolog_peeling < 0 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || !vf.is_constant ()); - /* PEELING_FOR_GAPS is special because epilog loop must be executed. */ - if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) + /* PEELING_FOR_GAPS and peeling for early breaks are special because epilog + loop must be executed. */ + if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) + || LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) skip_epilog = false; - class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); auto_vec original_counts; basic_block *original_bbs = NULL; @@ -2828,7 +3199,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, if (prolog_peeling) { e = loop_preheader_edge (loop); - if (!slpeel_can_duplicate_loop_p (loop, e)) + if (!slpeel_can_duplicate_loop_p (loop_vinfo, e)) { dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, "loop can't be duplicated to preheader edge.\n"); @@ -2843,7 +3214,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, gcc_unreachable (); } prolog->force_vectorize = false; - slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true); + slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, NULL, true); first_loop = prolog; reset_original_copy_tables (); @@ -2902,11 +3273,13 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, if (epilog_peeling) { - e = single_exit (loop); - if (!slpeel_can_duplicate_loop_p (loop, e)) + auto_vec exits = get_loop_exit_edges (loop); + e = exits[0]; + if (!slpeel_can_duplicate_loop_p (loop_vinfo, e)) { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, - "loop can't be duplicated to exit edge.\n"); + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, + "loop can't be duplicated to exit edge.\n"); gcc_unreachable (); } /* Peel epilog and put it on exit edge of loop. If we are vectorizing @@ -2920,12 +3293,16 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e); if (!epilog) { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, - "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n"); + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, + "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n"); gcc_unreachable (); } epilog->force_vectorize = false; - slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false); + + tree early_break_iv_name; + slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, + &early_break_iv_name, false); /* Scalar version loop may be preferred. In this case, add guard and skip to epilog. Note this only happens when the number of @@ -2978,6 +3355,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, vect_gen_vector_loop_niters (loop_vinfo, niters, niters_vector, step_vector, niters_no_overflow); + if (!integer_onep (*step_vector)) { /* On exit from the loop we will have an easy way of calcalating @@ -2987,9 +3365,13 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop (); *niters_vector_mult_vf_var = niters_vector_mult_vf; } + else if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) + vect_gen_vector_loop_niters_mult_vf (loop_vinfo, early_break_iv_name, + &niters_vector_mult_vf); else vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector, &niters_vector_mult_vf); + /* Update IVs of original loop as if they were advanced by niters_vector_mult_vf steps. 
      gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
@@ -2997,12 +3379,97 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
       vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
 					update_e);
 
+      /* For early breaks we must create a guard to check how many iterations
+	 of the scalar loop are yet to be performed.  */
+      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+	{
+	  gcc_assert (early_break_iv_name);
+	  tree ivtmp =
+	    vect_update_ivs_after_early_break (loop_vinfo, epilog, vf, niters,
+					       *niters_vector, update_e);
+
+	  tree guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
+					 fold_convert (TREE_TYPE (niters),
+						       ivtmp),
+					 build_zero_cst (TREE_TYPE (niters)));
+	  basic_block guard_bb = normal_exit (loop)->dest;
+	  auto_vec<edge> new_exits = get_loop_exit_edges (epilog);
+	  /* If we had a fallthrough edge, the guard will be threaded through
+	     and so we may need to find the actual final edge.  */
+	  edge final_edge = new_exits[0];
+	  basic_block guard_to;
+	  bool fn_exit_p = false;
+	  if (gsi_end_p (gsi_start_nondebug_bb (final_edge->dest))
+	      && !gsi_end_p (gsi_start_phis (final_edge->dest))
+	      && single_succ_p (final_edge->dest))
+	    {
+	      auto gsi = gsi_start_phis (final_edge->dest);
+	      while (!gsi_end_p (gsi))
+		gsi_remove (&gsi, true);
+	      guard_to = final_edge->dest;
+	      fn_exit_p = true;
+	    }
+	  else
+	    guard_to = split_edge (normal_exit (epilog));
+
+	  edge guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
+						guard_bb,
+						prob_epilog.invert (),
+						irred_flag);
+
+	  basic_block dest = single_succ (guard_to);
+	  /* If we have a single pred then the previous block is the immediate
+	     dominator.  This may or may not be the guard bb.  However if we
+	     have multiple preds then the guard BB must be the dominator as all
+	     previous exits got rewritten to the guard BB.  */
+	  if (single_pred_p (dest))
+	    set_immediate_dominator (CDI_DOMINATORS, dest, guard_to);
+	  else
+	    set_immediate_dominator (CDI_DOMINATORS, dest, guard_bb);
+
+	  /* We must update all the edges from the new guard_bb.  */
+	  slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
+					      final_edge);
+
+	  /* If we have an additional function exit block, then thread the updates
+	     through to the block.  Leaving it up to the LCSSA cleanup pass will
+	     get the wrong values here as it can't handle the merge block we just
	     made correctly. 
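	     As an illustrative sketch (not part of the patch, names invented),
	     the guard built above amounts to

	       if (remaining_niters == 0)
		 goto function_exit;    // the vector loop handled everything
	       else
		 goto scalar_epilogue;  // replay at most VF scalar iterations

	     where remaining_niters stands for the IV value produced by
	     vect_update_ivs_after_early_break.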
*/ + if (fn_exit_p) + { + gphi_iterator gsi_update, gsi_orig, gsi_vect; + for (gsi_orig = gsi_start_phis (epilog->header), + gsi_update = gsi_start_phis (guard_e->dest), + gsi_vect = gsi_start_phis (loop->header); + !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update) + && !gsi_end_p (gsi_vect); + gsi_next (&gsi_orig), gsi_next (&gsi_update), + gsi_next (&gsi_vect)) + { + gphi *orig_phi = gsi_orig.phi (); + gphi *update_phi = gsi_update.phi (); + gphi *vect_phi = gsi_vect.phi (); + stmt_vec_info phi_info = loop_vinfo->lookup_stmt (vect_phi); + + if (iv_phi_p (phi_info)) + continue; + + tree phi_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, update_e); + SET_PHI_ARG_DEF (update_phi, update_e->dest_idx, phi_arg); + + phi_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, guard_e); + SET_PHI_ARG_DEF (update_phi, guard_e->dest_idx, phi_arg); + } + } + flush_pending_stmts (guard_e); + } + if (skip_epilog) { guard_cond = fold_build2 (EQ_EXPR, boolean_type_node, niters, niters_vector_mult_vf); - guard_bb = single_exit (loop)->dest; - guard_to = split_edge (single_exit (epilog)); + guard_bb = normal_exit (loop)->dest; + guard_to = split_edge (normal_exit (epilog)); guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to, skip_vector ? anchor : guard_bb, prob_epilog.invert (), @@ -3010,7 +3477,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, if (vect_epilogues) epilogue_vinfo->skip_this_loop_edge = guard_e; slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e, - single_exit (epilog)); + normal_exit (epilog)); /* Only need to handle basic block before epilog loop if it's not the guard_bb, which is the case when skip_vector is true. */ if (guard_bb != bb_before_epilog) @@ -3023,7 +3490,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, } else slpeel_update_phi_nodes_for_lcssa (epilog); - unsigned HOST_WIDE_INT bound; if (bound_scalar.is_constant (&bound)) { @@ -3114,7 +3580,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, adjust_vec.release (); free_original_copy_tables (); - return vect_epilogues ? epilog : NULL; } diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index d5c2bff80be9be152707eb9d3932c863948daa73..548946a6bbf8892086a17fe3003da2c3dceadf5b 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -844,80 +844,106 @@ vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo) in NUMBER_OF_ITERATIONSM1. Place the condition under which the niter information holds in ASSUMPTIONS. - Return the loop exit condition. */ + Return the loop exit conditions. 
*/
 
-static gcond *
+static vec<gcond *>
 vect_get_loop_niters (class loop *loop, tree *assumptions,
 		      tree *number_of_iterations, tree *number_of_iterationsm1)
 {
-  edge exit = single_exit (loop);
+  auto_vec<edge> exits = get_loop_exit_edges (loop);
+  vec<gcond *> conds;
+  conds.create (exits.length ());
   class tree_niter_desc niter_desc;
   tree niter_assumptions, niter, may_be_zero;
-  gcond *cond = get_loop_exit_condition (loop);
 
   *assumptions = boolean_true_node;
   *number_of_iterationsm1 = chrec_dont_know;
   *number_of_iterations = chrec_dont_know;
+
   DUMP_VECT_SCOPE ("get_loop_niters");
 
-  if (!exit)
-    return cond;
+  if (exits.is_empty ())
+    return conds;
 
-  may_be_zero = NULL_TREE;
-  if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
-      || chrec_contains_undetermined (niter_desc.niter))
-    return cond;
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
+		     exits.length ());
 
-  niter_assumptions = niter_desc.assumptions;
-  may_be_zero = niter_desc.may_be_zero;
-  niter = niter_desc.niter;
+  edge exit;
+  unsigned int i;
+  FOR_EACH_VEC_ELT (exits, i, exit)
+    {
+      gcond *cond = get_edge_condition (exit);
+      if (cond)
+	conds.safe_push (cond);
 
-  if (may_be_zero && integer_zerop (may_be_zero))
-    may_be_zero = NULL_TREE;
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
 
-  if (may_be_zero)
-    {
-      if (COMPARISON_CLASS_P (may_be_zero))
+      may_be_zero = NULL_TREE;
+      if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
+	  || chrec_contains_undetermined (niter_desc.niter))
+	continue;
+
+      niter_assumptions = niter_desc.assumptions;
+      may_be_zero = niter_desc.may_be_zero;
+      niter = niter_desc.niter;
+
+      if (may_be_zero && integer_zerop (may_be_zero))
+	may_be_zero = NULL_TREE;
+
+      if (may_be_zero)
 	{
-	  /* Try to combine may_be_zero with assumptions, this can simplify
-	     computation of niter expression.  */
-	  if (niter_assumptions && !integer_nonzerop (niter_assumptions))
-	    niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
-					     niter_assumptions,
-					     fold_build1 (TRUTH_NOT_EXPR,
-							  boolean_type_node,
-							  may_be_zero));
+	  if (COMPARISON_CLASS_P (may_be_zero))
+	    {
+	      /* Try to combine may_be_zero with assumptions, this can simplify
+		 computation of niter expression.  */
+	      if (niter_assumptions && !integer_nonzerop (niter_assumptions))
+		niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
+						 niter_assumptions,
+						 fold_build1 (TRUTH_NOT_EXPR,
+							      boolean_type_node,
+							      may_be_zero));
+	      else
+		niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
+				     build_int_cst (TREE_TYPE (niter), 0),
+				     rewrite_to_non_trapping_overflow (niter));
+
+	      may_be_zero = NULL_TREE;
+	    }
+	  else if (integer_nonzerop (may_be_zero) && i == 0)
+	    {
+	      *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
+	      *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
+	      continue;
+	    }
 	  else
-	    niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
-				 build_int_cst (TREE_TYPE (niter), 0),
-				 rewrite_to_non_trapping_overflow (niter));
+	    continue;
+	}
 
-	  may_be_zero = NULL_TREE;
-	}
-      else if (integer_nonzerop (may_be_zero))
+      /* Loop assumptions are based on the normal exit.  */
+      if (i == 0)
 	{
-	  *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
-	  *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
-	  return cond;
+	  *assumptions = niter_assumptions;
+	  *number_of_iterationsm1 = niter;
+
+	  /* We want the number of loop header executions which is the number
+	     of latch executions plus one.
+	     ???  For UINT_MAX latch executions this number overflows to zero
	     for loops like do { n++; } while (n != 0);  */
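	  /* Example (illustrative only): for a loop such as

	       for (i = 0; i < n; i++)
		 if (a[i] > 127)
		   break;

	     two gconds are collected: the latch test i < n, exit 0, which
	     the assumptions and iteration counts here are based on, and
	     the early test a[i] > 127, which is pushed into CONDS but
	     contributes no iteration count.  */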
+	  if (niter && !chrec_contains_undetermined (niter))
+	    niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
+				 unshare_expr (niter),
+				 build_int_cst (TREE_TYPE (niter), 1));
+	  *number_of_iterations = niter;
 	}
-      else
-	return cond;
     }
 
-  *assumptions = niter_assumptions;
-  *number_of_iterationsm1 = niter;
-
-  /* We want the number of loop header executions which is the number
-     of latch executions plus one.
-     ???  For UINT_MAX latch executions this number overflows to zero
-     for loops like do { n++; } while (n != 0);  */
-  if (niter && !chrec_contains_undetermined (niter))
-    niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
-			  build_int_cst (TREE_TYPE (niter), 1));
-  *number_of_iterations = niter;
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
 
-  return cond;
+  return conds;
 }
 
 /* Function bb_in_loop_p
@@ -1455,7 +1481,8 @@ vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
 
    Verify that certain CFG restrictions hold, including:
    - the loop has a pre-header
-   - the loop has a single entry and exit
+   - the loop has a single entry
+   - nested loops can have only a single exit.
    - the loop exit condition is simple enough
    - the number of iterations can be analyzed, i.e., a countable loop.  The
     niter could be analyzed under some assumptions.  */
@@ -1484,11 +1511,6 @@ vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
                            |
                         (exit-bb)  */
 
-  if (loop->num_nodes != 2)
-    return opt_result::failure_at (vect_location,
-				   "not vectorized:"
-				   " control flow in loop.\n");
-
   if (empty_block_p (loop->header))
     return opt_result::failure_at (vect_location,
 				   "not vectorized: empty loop.\n");
@@ -1559,11 +1581,13 @@ vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
 	dump_printf_loc (MSG_NOTE, vect_location,
 			 "Considering outer-loop vectorization.\n");
       info->inner_loop_cond = inner.loop_cond;
+
+      if (!single_exit (loop))
+	return opt_result::failure_at (vect_location,
+				       "not vectorized: multiple exits.\n");
+
     }
 
-  if (!single_exit (loop))
-    return opt_result::failure_at (vect_location,
-				   "not vectorized: multiple exits.\n");
   if (EDGE_COUNT (loop->header->preds) != 2)
     return opt_result::failure_at (vect_location,
 				   "not vectorized:"
@@ -1579,21 +1603,45 @@ vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
 				   "not vectorized: latch block not empty.\n");
 
   /* Make sure the exit is not abnormal.  */
-  edge e = single_exit (loop);
-  if (e->flags & EDGE_ABNORMAL)
+  auto_vec<edge> exits = get_loop_exit_edges (loop);
+  edge nexit = normal_exit (loop);
+  for (edge e : exits)
+    {
+      if (e->flags & EDGE_ABNORMAL)
+	return opt_result::failure_at (vect_location,
+				       "not vectorized:"
+				       " abnormal loop exit edge.\n");
+      /* Early break BB must be after the main exit BB.  In theory we should
+	 be able to vectorize the inverse order, but the current flow in the
+	 vectorizer always assumes you update successor PHI nodes, not
+	 preds.  */
+      if (e != nexit && !dominated_by_p (CDI_DOMINATORS, nexit->src, e->src))
+	return opt_result::failure_at (vect_location,
				       "not vectorized:"
				       " abnormal loop exit edge order.\n");
    }

+  if (exits.length () > 2)
    return opt_result::failure_at (vect_location,
				   "not vectorized:"
-				   " abnormal loop exit edge.\n");
-
-  info->loop_cond
+				   " too many exits. 
Only 1 additional exit" + " supported.\n"); + if (loop->num_nodes != 2 + exits.length () - 1) + return opt_result::failure_at (vect_location, + "not vectorized:" + " unsupported control flow in loop.\n"); + info->conds = vect_get_loop_niters (loop, &info->assumptions, &info->number_of_iterations, &info->number_of_iterationsm1); - if (!info->loop_cond) + + if (info->conds.length () == 0) return opt_result::failure_at (vect_location, "not vectorized: complicated exit condition.\n"); + info->loop_cond = info->conds[0]; + if (integer_zerop (info->assumptions) || !info->number_of_iterations || chrec_contains_undetermined (info->number_of_iterations)) @@ -1638,8 +1686,17 @@ vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared, if (!integer_onep (info->assumptions) && !main_loop_info) LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions; - stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond); - STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type; + unsigned int i; + gcond *cond; + FOR_EACH_VEC_ELT (info->conds, i, cond) + { + stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond); + STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type; + } + + /* Check to see if we're vectorizing multiple exits. */ + LOOP_VINFO_EARLY_BREAKS (loop_vinfo) = info->conds.length () > 1; + if (info->inner_loop_cond) { stmt_vec_info inner_loop_cond_info @@ -2270,10 +2327,13 @@ vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo, bool need_peeling_or_partial_vectors_p = vect_need_peeling_or_partial_vectors_p (loop_vinfo); - /* Decide whether to vectorize the loop with partial vectors. */ + /* Decide whether to vectorize the loop with partial vectors. Currently + early break vectorization does not support partial vectors as we have + to peel a scalar loop that we can't vectorize. */ LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false; LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false; if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) + && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo) && need_peeling_or_partial_vectors_p) { /* For partial-vector-usage=1, try to push the handling of partial @@ -2746,13 +2806,14 @@ start_over: /* If an epilogue loop is required make sure we can create one. 
*/
   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
-      || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
+      || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
+      || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
     {
       if (dump_enabled_p ())
         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
       if (!vect_can_advance_ivs_p (loop_vinfo)
-	  || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
-					   single_exit (LOOP_VINFO_LOOP
+	  || !slpeel_can_duplicate_loop_p (loop_vinfo,
+					   normal_exit (LOOP_VINFO_LOOP
 							 (loop_vinfo))))
         {
 	  ok = opt_result::failure_at (vect_location,
@@ -3239,6 +3300,8 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
 		     "***** Choosing vector mode %s\n",
 		     GET_MODE_NAME (first_loop_vinfo->vector_mode));
 
+  loop_form_info.conds.release ();
+
   /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
      enabled, SIMDUID is not set, it is the innermost loop and we have
      either already found the loop's SIMDLEN or there was no SIMDLEN to
@@ -3350,6 +3413,8 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
 			   (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
     }
 
+  loop_form_info.conds.release ();
+
   return first_loop_vinfo;
 }
 
@@ -7907,6 +7972,237 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   return true;
 }
 
+/* When vectorizing early break statements, instructions that happen before
+   the early break in the current BB need to be moved to after the early
+   break.  This function deals with that and assumes that any validity
+   checks have already been performed.
+
+   While moving the instructions, if it encounters a VUSE or VDEF it then
+   corrects the VUSEs as it moves the statements along.  CHAINED contains
+   the list of SSA_NAMES that belong to the dependency chain of the early
+   break conditional.  GDEST is the location in which to insert the new
+   statements.  GSTMT is the iterator to walk up to find statements to
+   consider moving.  REACHING_VUSE contains the dominating VUSE found so far
+   and CURRENT_VDEF contains the last VDEF we've seen.  These are updated in
+   pre-order and updated in post-order after moving the instruction.  */
+
+static void
+move_early_exit_stmts (hash_set<tree> *chained, gimple_stmt_iterator *gdest,
+		       gimple_stmt_iterator *gstmt, tree *reaching_vuse,
+		       tree *current_vdef)
+{
+  if (gsi_end_p (*gstmt))
+    return;
+
+  gimple *stmt = gsi_stmt (*gstmt);
+  if (gimple_has_ops (stmt))
+    {
+      tree dest = NULL_TREE;
+      /* Try to find the SSA_NAME being defined.  For statements with an LHS
+	 use the LHS, if not, assume that the first argument of a call is the
+	 value being defined.  e.g. MASKED_LOAD etc.  */
+      if (gimple_has_lhs (stmt))
+	{
+	  if (is_gimple_assign (stmt))
+	    dest = gimple_assign_lhs (stmt);
+	  else if (const gcall *call = dyn_cast <const gcall *> (stmt))
+	    dest = gimple_call_lhs (call);
+	}
+      else if (const gcall *call = dyn_cast <const gcall *> (stmt))
+	dest = gimple_arg (call, 0);
+
+      /* Don't move the scalar instructions.  */
+      bool move
+	= dest && (VECTOR_TYPE_P (TREE_TYPE (dest))
+		   || POINTER_TYPE_P (TREE_TYPE (dest)));
+
+      /* If we found the defining statement of something that's part of the
	 chain then expand the chain with the new SSA_VARs being used.  */
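      /* Illustrative example (not part of the patch): for a vectorized body
	 in which a store precedes the early-break test,

	   vect_b = ...;                    if (mask != 0) goto exit;
	   MEM[dest] = vect_b;       ==>    vect_b = ...;
	   if (mask != 0) goto exit;        MEM[dest] = vect_b;

	 the vector statements are moved below the break so that no side
	 effects from lanes logically past the break remain, while the scalar
	 statements feeding the condition (the CHAINED set) stay in place.  */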
+      if (chained->contains (dest))
+	{
+	  for (unsigned x = 0; x < gimple_num_args (stmt); x++)
+	    if (TREE_CODE (gimple_arg (stmt, x)) == SSA_NAME)
+	      chained->add (gimple_arg (stmt, x));
+
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "found chain %G", stmt);
+	  update_stmt (stmt);
+	  move = false;
+	}
+
+      if (move)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "moving stmt %G", stmt);
+	  gsi_move_before (gstmt, gdest);
+	  gsi_prev (gdest);
+	  tree vdef = gimple_vdef (stmt);
+
+	  /* If we've moved a VDEF, extract the defining MEM and update
+	     usages of it.  TODO: I think this may need some constraints?  */
+	  if (vdef)
+	    {
+	      *current_vdef = vdef;
+	      *reaching_vuse = gimple_vuse (stmt);
+	      imm_use_iterator imm_iter;
+	      gimple *use_stmt;
+	      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, vdef)
+		{
+		  if (!is_a <gphi *> (use_stmt))
+		    continue;
+		  gphi *phi_stmt = as_a <gphi *> (use_stmt);
+
+		  if (dump_enabled_p ())
+		    dump_printf_loc (MSG_NOTE, vect_location,
+				     "updating vuse %G", use_stmt);
+		  for (unsigned i = 0; i < gimple_phi_num_args (phi_stmt); i++)
+		    if (gimple_phi_arg_def (phi_stmt, i) == vdef)
+		      {
+			SET_USE (PHI_ARG_DEF_PTR (phi_stmt, i), gimple_vuse (stmt));
+			break;
+		      }
+		}
+	    }
+	  update_stmt (stmt);
+	}
+    }
+
+  gsi_prev (gstmt);
+  move_early_exit_stmts (chained, gdest, gstmt, reaching_vuse, current_vdef);
+
+  if (gimple_vuse (stmt)
+      && reaching_vuse && *reaching_vuse
+      && gimple_vuse (stmt) == *current_vdef)
+    {
+      unlink_stmt_vdef (stmt);
+      gimple_set_vuse (stmt, *reaching_vuse);
+      update_stmt (stmt);
+    }
+}
+
+/* Transform the definition stmt STMT_INFO of an early exit
+   value.  */
+
+bool
+vect_transform_early_break (loop_vec_info loop_vinfo,
+			    stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
+			    gimple **vec_stmt, slp_tree slp_node)
+{
+  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
+  int i;
+  int ncopies;
+  int vec_num;
+
+  if (!STMT_VINFO_RELEVANT_P (stmt_info))
+    return false;
+
+  gimple_match_op op;
+  if (!gimple_extract_op (stmt_info->stmt, &op))
+    gcc_unreachable ();
+  gcc_assert (op.code.is_tree_code ());
+  auto code = tree_code (op.code);
+
+  tree vectype_in = STMT_VINFO_VECTYPE (stmt_info);
+  gcc_assert (vectype_in);
+
+
+  if (slp_node)
+    {
+      ncopies = 1;
+      vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+    }
+  else
+    {
+      ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
+      vec_num = 1;
+    }
+
+  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
+
+  /* Transform.  */
+  tree new_temp = NULL_TREE;
+  auto_vec<tree> vec_oprnds0;
+  auto_vec<tree> vec_oprnds1;
+  tree def0;
+
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location, "transform early-exit.\n");
+
+  /* FORNOW: Multiple types are not supported for condition.  */
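  /* Sketch of the intended code generation (illustrative, names invented):
     a scalar exit test

       if (a[i] == x) break;

     becomes, per vector copy,

       mask_1  = a_vec == x_vec;                  // vector comparison
       vexit_1 = VEC_COND_EXPR <mask_1, -1, 0>;
       if (vexit_1 != 0) goto break_bb;           // branch on the mask

     with mask_1 additionally combined with the loop mask when fully-masked
     loops are in use.  */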
+  if (code == COND_EXPR)
+    gcc_assert (ncopies == 1);
+
+
+  gimple *stmt = STMT_VINFO_STMT (stmt_info);
+  basic_block cond_bb = gimple_bb (stmt);
+  gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb);
+
+  vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
+		     op.ops[0], &vec_oprnds0, op.ops[1], &vec_oprnds1,
+		     NULL, NULL);
+
+  gimple *new_stmt = NULL;
+  tree cst_0 = build_zero_cst (truth_type_for (vectype_out));
+  tree cst_m1 = build_minus_one_cst (truth_type_for (vectype_out));
+
+  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
+    {
+      tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
+      {
+	tree cond = make_temp_ssa_name (truth_type_for (vectype_out), NULL, "mask");
+	gimple *vec_cmp = gimple_build_assign (cond, code, vop[0], vop[1]);
+	vect_finish_stmt_generation (loop_vinfo, stmt_info, vec_cmp, &cond_gsi);
+	if (masked_loop_p)
+	  {
+	    tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
+					    vectype_in, i);
+	    cond = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask,
+				     cond, &cond_gsi);
+	  }
+
+	new_temp = make_temp_ssa_name (truth_type_for (vectype_out), NULL, "vexit");
+	gimple *vec_cond = gimple_build_assign (new_temp, VEC_COND_EXPR,
+						cond, cst_m1, cst_0);
+	vect_finish_stmt_generation (loop_vinfo, stmt_info, vec_cond, &cond_gsi);
+	new_stmt = vec_cond;
+      }
+
+      if (slp_node)
+	SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
+      else
+	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+    }
+
+  gcc_assert (new_stmt);
+  tree lhs = gimple_assign_lhs (new_stmt);
+
+  tree t = fold_build2 (NE_EXPR, boolean_type_node, lhs,
+			build_zero_cst (truth_type_for (vectype_out)));
+  t = canonicalize_cond_expr_cond (t);
+  gimple_cond_set_condition_from_tree ((gcond*)stmt, t);
+  update_stmt (stmt);
+
+  basic_block dest_bb = EDGE_SUCC (cond_bb, 1)->dest;
+  gimple_stmt_iterator dest_gsi = gsi_start_bb (dest_bb);
+
+  hash_set<tree> chained;
+  gimple_stmt_iterator gsi2 = gsi_for_stmt (new_stmt);
+  chained.add (lhs);
+  tree vdef;
+  tree vuse = gimple_vuse (new_stmt);
+  move_early_exit_stmts (&chained, &dest_gsi, &gsi2, &vuse, &vdef);
+
+  if (!slp_node)
+    *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
+
+  return true;
+}
+
+
+
 /* Transform phase of a cycle PHI.  */
 
 bool
@@ -8185,6 +8481,186 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
   return true;
 }
 
+/* This function tries to validate whether an early break vectorization
+   is possible for the current instruction sequence.  Returns true if
+   possible, otherwise false.
+
+   Requirements:
+     - Any memory access must be to a fixed size buffer.
+     - There must not be any loads and stores to the same object.
+     - Multiple loads are allowed as long as they don't alias.
+
+
+   Arguments:
+     - LOOP_VINFO: loop information for the current loop.
+     - CHAIN: Currently detected sequence of instructions that belong
+	      to the current early break.
+     - LOADS: List of all loads found during traversal.
+     - BASES: List of all load datareferences found during traversal.
+     - GSTMT: Current position to inspect for validity.  The sequence
+	      will be moved upwards from this point.  */
+
+static bool
+validate_early_exit_stmts (loop_vec_info loop_vinfo, hash_set<tree> *chain,
+			   vec<tree> *loads, vec<data_reference *> *bases,
+			   gimple_stmt_iterator *gstmt)
+{
+  if (gsi_end_p (*gstmt))
+    return true;
+
+  gimple *stmt = gsi_stmt (*gstmt);
+  if (gimple_has_ops (stmt))
+    {
+      tree dest = NULL_TREE;
+      /* Try to find the SSA_NAME being defined.  For statements with an LHS
+	 use the LHS, if not, assume that the first argument of a call is the
	 value being defined.  e.g. MASKED_LOAD etc.  */
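      /* E.g. (illustrative): for x_1 = .MASK_LOAD (ptr, align, mask) the
	 LHS x_1 is the defined value, while .MASK_STORE (ptr, align, mask,
	 val) has no LHS, so the first call argument identifies what is
	 being defined.  */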
+      if (gimple_has_lhs (stmt))
+	{
+	  if (is_gimple_assign (stmt))
+	    dest = gimple_assign_lhs (stmt);
+	  else if (const gcall *call = dyn_cast <const gcall *> (stmt))
+	    dest = gimple_call_lhs (call);
+	}
+      else if (const gcall *call = dyn_cast <const gcall *> (stmt))
+	dest = gimple_arg (call, 0);
+
+      /* Don't move the scalar instructions.  */
+      bool move
+	= dest && (VECTOR_TYPE_P (TREE_TYPE (dest))
+		   || POINTER_TYPE_P (TREE_TYPE (dest)));
+
+      /* If we found the defining statement of something that's part of the
+	 chain then expand the chain with the new SSA_VARs being used.  */
+      if (chain->contains (dest))
+	{
+	  for (unsigned x = 0; x < gimple_num_args (stmt); x++)
+	    if (TREE_CODE (gimple_arg (stmt, x)) == SSA_NAME)
+	      chain->add (gimple_arg (stmt, x));
+
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "found chain %G", stmt);
+
+	  move = false;
+	}
+
+      stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (stmt);
+      if (!stmt_vinfo)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "early breaks not supported. Unknown"
+			     " statement: %G", stmt);
+	  return false;
+	}
+
+      auto dr_ref = STMT_VINFO_DATA_REF (stmt_vinfo);
+      if (dr_ref)
+	{
+	  /* We currently only support statically allocated objects due to
+	     not having first-faulting loads support or peeling for alignment
+	     support.  Compute the size of the referenced object (it could be
+	     dynamically allocated).  */
+	  tree obj = DR_BASE_ADDRESS (dr_ref);
+	  if (!obj || TREE_CODE (obj) != ADDR_EXPR)
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				 "early breaks only supported on statically"
+				 " allocated objects.\n");
+	      return false;
+	    }
+
+	  tree refop = TREE_OPERAND (obj, 0);
+	  tree refbase = get_base_address (refop);
+	  if (!refbase || !DECL_P (refbase)
+	      || TREE_CODE (DECL_SIZE (refbase)) != INTEGER_CST)
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				 "early breaks only supported on statically"
+				 " allocated objects.\n");
+	      return false;
+	    }
+
+	  if (!move && DR_IS_READ (dr_ref))
+	    {
+	      loads->safe_push (dest);
+	      bases->safe_push (dr_ref);
+	    }
+	  else if (DR_IS_WRITE (dr_ref))
+	    {
+	      for (auto dr : *bases)
+		if (same_data_refs_base_objects (dr, dr_ref))
+		  return false;
+	    }
+	}
+
+      if (move)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "analyzing stmt %G", stmt);
+
+	  for (tree ref : *loads)
+	    if (stmt_may_clobber_ref_p (stmt, ref, true))
+	      {
+	        if (dump_enabled_p ())
+		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				   "early breaks not supported as memory used"
+				   " may alias.\n");
+	        return false;
+	      }
+	}
+    }
+
+  gsi_prev (gstmt);
+  return validate_early_exit_stmts (loop_vinfo, chain, loads, bases, gstmt);
+}
+
+/* Check to see if the current early break given in STMT_INFO is valid for
   vectorization.  */
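/* Illustrative examples of the restrictions checked above (not from the
   patch).  A load from an object of known fixed size is accepted:

     int a[1024];
     for (i = 0; i < n; i++)
       if (a[i] == x)
	 break;

   whereas a load through a pointer parameter is rejected, because the
   size of the accessed object is unknown:

     void f (int *p)
     {
       for (i = 0; i < n; i++)
	 if (p[i] == x)
	   break;
     }  */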
+
+bool
+vectorizable_early_exit (vec_info *vinfo,
+			 stmt_vec_info stmt_info, slp_tree /* slp_node */,
+			 slp_instance /* slp_node_instance */,
+			 stmt_vector_for_cost * /* cost_vec */)
+{
+  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
+
+  if (!loop_vinfo
+      || !is_a <gcond *> (STMT_VINFO_STMT (stmt_info)))
+    return false;
+
+  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_early_exit_def)
+    return false;
+
+  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+  tree truth_type = truth_type_for (vectype);
+
+  auto optab = direct_optab_handler (cbranch_optab, TYPE_MODE (truth_type));
+  if (optab == CODE_FOR_nothing)
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "can't vectorize early exit because the "
+			 "target doesn't support flag setting vector "
+			 "comparisons.\n");
+      return false;
+    }
+
+  hash_set<tree> chain;
+  auto_vec<tree> loads;
+  auto_vec<data_reference *> bases;
+
+  gimple *stmt = STMT_VINFO_STMT (stmt_info);
+  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
+
+  return validate_early_exit_stmts (loop_vinfo, &chain, &loads, &bases, &gsi);
+}
+
 /* Vectorizes LC PHIs.  */
 
 bool
@@ -9993,13 +10469,24 @@ vectorizable_live_operation (vec_info *vinfo,
 	   new_tree = lane_extract <vec_lhs', lane>;
 	   lhs' = new_tree;  */
 
+      /* When vectorizing an early break, any live statements that are used
+	 outside of the loop are dead.  The loop will never get to them.
+	 We could change the liveness value during analysis instead, but since
+	 the code below is invalid anyway, just ignore it during codegen.  */
+      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+	return true;
+
       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
-      basic_block exit_bb = single_exit (loop)->dest;
+      basic_block exit_bb = normal_exit (loop)->dest;
       gcc_assert (single_pred_p (exit_bb));
 
       tree vec_lhs_phi = copy_ssa_name (vec_lhs);
       gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
-      SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
+      /* For early exits we need to compute the right exit.  The current
+	 approach punts to a scalar loop instead.  If we were to vectorize
+	 it, the exit condition below would need to take into account the
+	 difference between a `break` edge and a `return` edge.  */
+      SET_PHI_ARG_DEF (phi, normal_exit (loop)->dest_idx, vec_lhs);
 
       gimple_seq stmts = NULL;
       tree new_tree;
@@ -10438,7 +10925,8 @@ scale_profile_for_vect_loop (class loop *loop, unsigned vf)
       scale_loop_frequencies (loop, p);
     }
 
-  edge exit_e = single_exit (loop);
+  edge exit_e = normal_exit (loop);
+
   exit_e->probability = profile_probability::always () / (new_est_niter + 1);
 
   edge exit_l = single_pred_edge (loop->latch);
@@ -10787,7 +11275,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
   /* Make sure there exists a single-predecessor exit bb.  Do this before
      versioning.   */
   edge e = single_exit (loop);
-  if (! single_pred_p (e->dest))
+  if (e && ! single_pred_p (e->dest))
     {
       split_loop_exit_edge (e, true);
       if (dump_enabled_p ())
@@ -10813,7 +11301,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
     {
       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
-      if (! single_pred_p (e->dest))
+      if (e && ! single_pred_p (e->dest))
 	{
 	  split_loop_exit_edge (e, true);
 	  if (dump_enabled_p ())
@@ -11146,7 +11634,8 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
   /* Loops vectorized with a variable factor won't benefit from
     unrolling/peeling. 
*/
-  if (!vf.is_constant ())
+  if (!vf.is_constant ()
+      && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
     {
       loop->unroll = 1;
       if (dump_enabled_p ())
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 4e0d75e0d7586ad57a37850d8a70f6182ecb13d0..4f9446a5c699288be093c556ec527e87cf788317 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -342,9 +342,28 @@ vect_stmt_relevant_p (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
   *live_p = false;
 
   /* cond stmt other than loop exit cond.  */
-  if (is_ctrl_stmt (stmt_info->stmt)
-      && STMT_VINFO_TYPE (stmt_info) != loop_exit_ctrl_vec_info_type)
-    *relevant = vect_used_in_scope;
+  if (is_ctrl_stmt (stmt_info->stmt))
+    {
+      /* Ideally EDGE_LOOP_EXIT would have been set on the exit edge, but
+	 it looks like loop_manip doesn't do that.  So we have to do it
+	 the hard way.  */
+      basic_block bb = gimple_bb (stmt_info->stmt);
+      basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
+      edge exit = BRANCH_EDGE (bb);
+      unsigned nbbs = loop->num_nodes;
+      bool exit_bb = true;
+      for (unsigned i = 0; i < nbbs; i++)
+	{
+	  if (exit->dest == bbs[i])
+	    {
+	      exit_bb = false;
+	      break;
+	    }
+	}
+
+      if (exit_bb)
+	*relevant = vect_used_in_scope;
+    }
 
   /* changing memory.  */
   if (gimple_code (stmt_info->stmt) != GIMPLE_PHI)
@@ -357,6 +376,11 @@ vect_stmt_relevant_p (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
 	*relevant = vect_used_in_scope;
       }
 
+  auto_vec<edge> exits = get_loop_exit_edges (loop);
+  auto_bitmap exit_bbs;
+  for (edge exit : exits)
+    bitmap_set_bit (exit_bbs, exit->dest->index);
+
   /* uses outside the loop.  */
   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
     {
@@ -375,7 +399,7 @@ vect_stmt_relevant_p (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
 	      /* We expect all such uses to be in the loop exit phis
 		 (because of loop closed form)   */
 	      gcc_assert (gimple_code (USE_STMT (use_p)) == GIMPLE_PHI);
-	      gcc_assert (bb == single_exit (loop)->dest);
+	      gcc_assert (bitmap_bit_p (exit_bbs, bb->index));
 
               *live_p = true;
 	    }
@@ -1845,7 +1869,7 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
    MASK_TYPE is the type of both masks.  If new statements are needed,
   insert them before GSI.  */
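/* Usage sketch for prepare_vec_mask (illustrative): when the loop is
   fully masked, the early-exit comparison is combined with the loop mask
   so that inactive lanes cannot trigger the break:

     vec_mask = a_vec == x_vec;
     mask = prepare_vec_mask (loop_vinfo, mask_type, loop_mask, vec_mask,
			      &cond_gsi);

   yielding loop_mask & vec_mask, with repeated combinations CSEd.  This
   is why the function is exported below rather than kept static.  */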
 
-static tree
+tree
 prepare_vec_mask (loop_vec_info loop_vinfo, tree mask_type, tree loop_mask,
 		  tree vec_mask, gimple_stmt_iterator *gsi)
 {
@@ -11158,11 +11182,14 @@ vect_analyze_stmt (vec_info *vinfo,
 			     node_instance, cost_vec);
       if (!res)
 	return res;
-   }
+    }
+  else if (is_ctrl_stmt (stmt_info->stmt))
+    STMT_VINFO_DEF_TYPE (stmt_info) = vect_early_exit_def;
 
   switch (STMT_VINFO_DEF_TYPE (stmt_info))
     {
       case vect_internal_def:
+      case vect_early_exit_def:
         break;
 
       case vect_reduction_def:
@@ -11195,6 +11222,7 @@ vect_analyze_stmt (vec_info *vinfo,
     {
       gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
       gcc_assert (STMT_VINFO_VECTYPE (stmt_info)
+		  || gimple_code (stmt_info->stmt) == GIMPLE_COND
 		  || (call && gimple_call_lhs (call) == NULL_TREE));
       *need_to_vectorize = true;
     }
@@ -11237,7 +11265,9 @@ vect_analyze_stmt (vec_info *vinfo,
 	  || vectorizable_lc_phi (as_a <loop_vec_info> (vinfo),
 				  stmt_info, NULL, node)
 	  || vectorizable_recurr (as_a <loop_vec_info> (vinfo),
-				   stmt_info, NULL, node, cost_vec));
+				   stmt_info, NULL, node, cost_vec)
+	  || vectorizable_early_exit (vinfo, stmt_info,
+				      node, node_instance, cost_vec));
   else
     {
       if (bb_vinfo)
@@ -11260,7 +11290,10 @@ vect_analyze_stmt (vec_info *vinfo,
 					 NULL, NULL, node, cost_vec)
 	      || vectorizable_comparison (vinfo, stmt_info, NULL, NULL, node,
 					  cost_vec)
-	      || vectorizable_phi (vinfo, stmt_info, NULL, node, cost_vec));
+	      || vectorizable_phi (vinfo, stmt_info, NULL, node, cost_vec)
+	      || vectorizable_early_exit (vinfo, stmt_info, node,
+					  node_instance, cost_vec));
+
     }
 
   if (node)
@@ -11418,6 +11451,12 @@ vect_transform_stmt (vec_info *vinfo,
       gcc_assert (done);
       break;
 
+    case loop_exit_ctrl_vec_info_type:
+      done = vect_transform_early_break (as_a <loop_vec_info> (vinfo), stmt_info,
+					 gsi, &vec_stmt, slp_node);
+      gcc_assert (done);
+      break;
+
     default:
       if (!STMT_VINFO_LIVE_P (stmt_info))
 	{
@@ -11816,6 +11855,9 @@ vect_is_simple_use (tree operand, vec_info *vinfo, enum vect_def_type *dt,
 	case vect_first_order_recurrence:
 	  dump_printf (MSG_NOTE, "first order recurrence\n");
 	  break;
+	case vect_early_exit_def:
+	  dump_printf (MSG_NOTE, "early exit\n");
+	  break;
 	case vect_unknown_def_type:
 	  dump_printf (MSG_NOTE, "unknown\n");
 	  break;
@@ -12486,6 +12528,8 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
   *nunits_vectype_out = NULL_TREE;
 
   if (gimple_get_lhs (stmt) == NULL_TREE
+      /* Allow vector conditionals through here.  */
+      && !is_ctrl_stmt (stmt)
      /* MASK_STORE has no lhs, but is ok. 
*/
       && !gimple_call_internal_p (stmt, IFN_MASK_STORE))
     {
@@ -12502,7 +12546,7 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
 	}
 
       return opt_result::failure_at (stmt,
-				     "not vectorized: irregular stmt.%G", stmt);
+				     "not vectorized: irregular stmt: %G", stmt);
     }
 
   tree vectype;
@@ -12531,6 +12575,8 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
 	scalar_type = TREE_TYPE (DR_REF (dr));
       else if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
 	scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
+      else if (is_ctrl_stmt (stmt))
+	scalar_type = TREE_TYPE (gimple_cond_rhs (stmt));
       else
 	scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 016961da8510ca7dd2d07e716cbe35623ed2d9a5..edbb7228d3aae29b6f51fdab284f49ac57c6612d 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -63,6 +63,7 @@ enum vect_def_type {
   vect_internal_def,
   vect_induction_def,
   vect_reduction_def,
+  vect_early_exit_def,
   vect_double_reduction_def,
   vect_nested_cycle,
   vect_first_order_recurrence,
@@ -836,6 +837,10 @@ public:
      we need to peel off iterations at the end to form an epilogue loop.  */
   bool peeling_for_niter;
 
+  /* When the loop has early breaks that we can vectorize, we need to peel
+     the loop for the scalar loop that locates the break.  */
+  bool early_breaks;
+
   /* True if there are no loop carried data dependencies in the loop.
      If loop->safelen <= 1, then this is always true, either the loop
     didn't have any loop carried data dependencies, or the loop is being
@@ -921,6 +926,7 @@ public:
 #define LOOP_VINFO_REDUCTION_CHAINS(L)     (L)->reduction_chains
 #define LOOP_VINFO_PEELING_FOR_GAPS(L)     (L)->peeling_for_gaps
 #define LOOP_VINFO_PEELING_FOR_NITER(L)    (L)->peeling_for_niter
+#define LOOP_VINFO_EARLY_BREAKS(L)         (L)->early_breaks
 #define LOOP_VINFO_NO_DATA_DEPENDENCIES(L) (L)->no_data_dependencies
 #define LOOP_VINFO_SCALAR_LOOP(L)          (L)->scalar_loop
 #define LOOP_VINFO_SCALAR_LOOP_SCALING(L)  (L)->scalar_loop_scaling
@@ -970,7 +976,7 @@ public:
 typedef opt_pointer_wrapper <loop_vec_info> opt_loop_vec_info;
 
 static inline loop_vec_info
-loop_vec_info_for_loop (class loop *loop)
+loop_vec_info_for_loop (const class loop *loop)
 {
   return (loop_vec_info) loop->aux;
 }
@@ -2107,7 +2113,7 @@ class auto_purge_vect_location
   in tree-vect-loop-manip.cc. 
*/
 extern void vect_set_loop_condition (class loop *, loop_vec_info,
 				     tree, tree, tree, bool);
-extern bool slpeel_can_duplicate_loop_p (const class loop *, const_edge);
+extern bool slpeel_can_duplicate_loop_p (const loop_vec_info, const_edge);
 class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class loop *,
 						     class loop *, edge);
 class loop *vect_loop_versioning (loop_vec_info, gimple *);
@@ -2306,6 +2312,7 @@ struct vect_loop_form_info
   tree number_of_iterations;
   tree number_of_iterationsm1;
   tree assumptions;
+  vec<gcond *> conds;
   gcond *loop_cond;
   gcond *inner_loop_cond;
 };
@@ -2326,6 +2333,9 @@ extern bool vectorizable_induction (loop_vec_info, stmt_vec_info,
 extern bool vect_transform_reduction (loop_vec_info, stmt_vec_info,
 				      gimple_stmt_iterator *,
 				      gimple **, slp_tree);
+extern bool vect_transform_early_break (loop_vec_info, stmt_vec_info,
+					gimple_stmt_iterator *,
+					gimple **, slp_tree);
 extern bool vect_transform_cycle_phi (loop_vec_info, stmt_vec_info,
 				      gimple **,
 				      slp_tree, slp_instance);
@@ -2335,6 +2345,11 @@ extern bool vectorizable_phi (vec_info *, stmt_vec_info, gimple **, slp_tree,
 			      stmt_vector_for_cost *);
 extern bool vectorizable_recurr (loop_vec_info, stmt_vec_info,
 				  gimple **, slp_tree, stmt_vector_for_cost *);
+extern bool vectorizable_early_exit (vec_info *, stmt_vec_info,
+				     slp_tree, slp_instance,
+				     stmt_vector_for_cost *);
+extern tree prepare_vec_mask (loop_vec_info, tree, tree,
+			      tree, gimple_stmt_iterator *);
 extern bool vect_emulated_vector_p (tree);
 extern bool vect_can_vectorize_without_simd_p (tree_code);
 extern bool vect_can_vectorize_without_simd_p (code_helper);
diff --git a/gcc/tree-vectorizer.cc b/gcc/tree-vectorizer.cc
index 6ec49511d74bd2e0e5dd51823a6c41180f08716c..4aa46c7c0d8235d3b783ce930e5df3480e1b3ef9 100644
--- a/gcc/tree-vectorizer.cc
+++ b/gcc/tree-vectorizer.cc
@@ -1382,7 +1382,9 @@ pass_vectorize::execute (function *fun)
 	 predicates that need to be shared for optimal predicate usage.
 	 However reassoc will re-order them and prevent CSE from working
 	 as it should.  CSE only the loop body, not the entry.  */
-      bitmap_set_bit (exit_bbs, single_exit (loop)->dest->index);
+      auto_vec<edge> exits = get_loop_exit_edges (loop);
+      for (edge exit : exits)
+	bitmap_set_bit (exit_bbs, exit->dest->index);
 
       edge entry = EDGE_PRED (loop_preheader_edge (loop)->src, 0);
       do_rpo_vn (fun, entry, exit_bbs);
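
/* Closing illustration (not part of the patch): a loop such as

     for (i = 0; i < n; i++)
       if (p[i] == c)
	 break;

   now contributes two blocks to EXIT_BBS, the destination of the latch
   test and the break target, so that do_rpo_vn only CSEs the loop body
   and entry and does not move computations past either exit.  */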