Hi Richi, This is a respin of the mid-end patch. Changes since previous version: - The mismatch in Boolean types is now fixed, and it generates an OR reduction when it needs to. - I've refactored things around to be a bit neater - I've switched to using iterate_fix_dominators which has simplified the loop peeling code a ton. - I've moved the conditionals into the loop structure and use them from there. - I've moved the analysis part early into vect_analyze_data_ref_dependences - I've switched to moving the scalar code instead of the vector code, as moving vector required us to track a lot more complicated things like internal functions. It was also a lot more work when the loop is unrolled or VF is increased due to unpacking. I have verified as much as I can that we don't seem to run into trouble doing this. Outstanding things: - Split off the SCEV parts from the rest of the patch (and determine the "normal" exit based on the counting IV instead) - Merge vectorizable_early_exit and transform_early_exit I'm sending this patch out for you to take a look at the issue we were discussing on IRC (which you can reproduce with testcase gcc.dg/vect/vect-early-break_16.c) That should be the last outstanding issue. Meanwhile I'll finish up the splitting of SCEV and merging the two functions. Any additional comments are appreciated. Will hopefully finish the refactoring today and send out the split patch tomorrow. 
Thanks, Tamar --- inline copy of patch --- diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h index 528b1219bc37ad8f114d5cf381c0cff899db31ee..9c7f019a51abfe2de8e1dd7135dea2463b0256a0 100644 --- a/gcc/cfgloop.h +++ b/gcc/cfgloop.h @@ -385,6 +385,7 @@ extern basic_block *get_loop_body_in_custom_order (const class loop *, void *, extern auto_vec get_loop_exit_edges (const class loop *, basic_block * = NULL); extern edge single_exit (const class loop *); +extern edge normal_exit (const class loop *); extern edge single_likely_exit (class loop *loop, const vec &); extern unsigned num_loop_branches (const class loop *); diff --git a/gcc/cfgloop.cc b/gcc/cfgloop.cc index 57bf7b1855d4dd20fb3f42388124932d0ca2b48a..97a7373fb6d9514da602d5be01050f2ec66094bc 100644 --- a/gcc/cfgloop.cc +++ b/gcc/cfgloop.cc @@ -1812,6 +1812,20 @@ single_exit (const class loop *loop) return NULL; } +/* Returns the normal exit edge of LOOP, or NULL if LOOP has either no exit. + If loops do not have the exits recorded, NULL is returned always. */ + +edge +normal_exit (const class loop *loop) +{ + struct loop_exit *exit = loop->exits->next; + + if (!loops_state_satisfies_p (LOOPS_HAVE_RECORDED_EXITS)) + return NULL; + + return exit->e; +} + /* Returns true when BB has an incoming edge exiting LOOP. */ bool diff --git a/gcc/doc/loop.texi b/gcc/doc/loop.texi index 6e8657a074d2447db7ae9b75cbfbb71282b84287..e1de2ac40f87f879ab691f68bd41b3bc21a83bf7 100644 --- a/gcc/doc/loop.texi +++ b/gcc/doc/loop.texi @@ -211,6 +211,10 @@ relation, and breath-first search order, respectively. @item @code{single_exit}: Returns the single exit edge of the loop, or @code{NULL} if the loop has more than one exit. You can only use this function if @code{LOOPS_HAVE_RECORDED_EXITS} is used. +function if LOOPS_HAVE_MARKED_SINGLE_EXITS property is used. +@item @code{normal_exit}: Returns the natural exit edge of the loop, +even if the loop has more than one exit. 
The natural exit is the exit +that would normally be taken where the loop to be fully executed. @item @code{get_loop_exit_edges}: Enumerates the exit edges of a loop. @item @code{just_once_each_iteration_p}: Returns true if the basic block is executed exactly once during each iteration of a loop (that is, it @@ -623,4 +627,4 @@ maximum verbosity the details of a data dependence relations array, @code{dump_dist_dir_vectors} prints only the classical distance and direction vectors for a data dependence relations array, and @code{dump_data_references} prints the details of the data references -contained in a data reference array. +contained in a data reference array diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi index ffe69d6fcb9c46cf97ba570e85b56e586a0c9b99..a82c7b8f1efa01b02b772c9dd0f5b3dcde817091 100644 --- a/gcc/doc/sourcebuild.texi +++ b/gcc/doc/sourcebuild.texi @@ -1637,6 +1637,10 @@ Target supports hardware vectors of @code{float} when @option{-funsafe-math-optimizations} is not in effect. This implies @code{vect_float}. +@item vect_early_break +Target supports hardware vectorization of loops with early breaks. +This requires an implementation of the cbranch optab for vectors. + @item vect_int Target supports hardware vectors of @code{int}. diff --git a/gcc/genemit.cc b/gcc/genemit.cc index 909ac89a16bf9c7830b1710f174e19cbc4a82a51..cf72a9154243d282bc10674c3c83773b76848840 100644 --- a/gcc/genemit.cc +++ b/gcc/genemit.cc @@ -905,6 +905,7 @@ from the machine description file `md'. */\n\n"); printf ("#include \"tm-constrs.h\"\n"); printf ("#include \"ggc.h\"\n"); printf ("#include \"target.h\"\n\n"); + printf ("#include \"rtx-vector-builder.h\"\n\n"); /* Read the machine description. 
*/ diff --git a/gcc/gimple.h b/gcc/gimple.h index adbeb063186d4a8e009e4dd184d73609d2c5d78c..a68b29a9136026e5b4d7dcd27e7352dc07a17b52 100644 --- a/gcc/gimple.h +++ b/gcc/gimple.h @@ -4756,6 +4756,30 @@ gimple_phi_arg_has_location (const gphi *phi, size_t i) return gimple_phi_arg_location (phi, i) != UNKNOWN_LOCATION; } + +/* Check if the gimple PHI or any arguments to the phi has a VDEF. */ +static inline bool +gimple_phi_has_vdef (const gphi *phi) +{ + for (unsigned i = 0; i < gimple_phi_num_args (phi); i++) + { + tree arg = gimple_phi_arg_def (phi, i); + if (TREE_CODE (arg) != SSA_NAME) + continue; + + gimple *def = SSA_NAME_DEF_STMT (arg); + if (gimple_vdef (def)) + return true; + if (auto phi = dyn_cast (def)) + { + if (gimple_phi_has_vdef (phi)) + return true; + } + } + return false; +} + + /* Return the number of arguments that can be accessed by gimple_arg. */ static inline unsigned diff --git a/gcc/testsuite/g++.dg/vect/vect-early-break_1.cc b/gcc/testsuite/g++.dg/vect/vect-early-break_1.cc new file mode 100644 index 0000000000000000000000000000000000000000..6a83648ca36e2c8feeb78335fccf3f3b82a97d2e --- /dev/null +++ b/gcc/testsuite/g++.dg/vect/vect-early-break_1.cc @@ -0,0 +1,61 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-w -O2" } */ + +void fancy_abort(char *, int, const char *) __attribute__((__noreturn__)); +template struct poly_int_pod { int coeffs[N]; }; +template class poly_int : public poly_int_pod { +public: + template poly_int &operator+=(const poly_int_pod &); +}; +template +template +poly_int &poly_int::operator+=(const poly_int_pod &a) { + for (int i = 0; i < N; i++) + this->coeffs[i] += a.coeffs[i]; + return *this; +} +template +poly_int exact_div(poly_int_pod, Cb) { + poly_int r; + return r; +} +struct vec_prefix { + unsigned m_num; +}; +struct vl_ptr; +struct va_heap { + typedef vl_ptr default_layout; +}; +template +struct 
vec; +template struct vec { + T &operator[](unsigned); + vec_prefix m_vecpfx; + T m_vecdata[]; +}; +template T &vec::operator[](unsigned ix) { + m_vecpfx.m_num ? fancy_abort("", 9, __FUNCTION__), 0 : 0; + return m_vecdata[ix]; +} +template struct vec { + T &operator[](unsigned ix) { return m_vec[ix]; } + vec m_vec; +}; +class auto_vec : public vec, va_heap> {}; +template class vector_builder : public auto_vec {}; +class int_vector_builder : public vector_builder { +public: + int_vector_builder(poly_int<2, long>, int, int); +}; +bool vect_grouped_store_supported() { + int i; + poly_int<2, long> nelt; + int_vector_builder sel(nelt, 2, 3); + for (i = 0; i < 6; i++) + sel[i] += exact_div(nelt, 2); +} + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ diff --git a/gcc/testsuite/g++.dg/vect/vect-early-break_2.cc b/gcc/testsuite/g++.dg/vect/vect-early-break_2.cc new file mode 100644 index 0000000000000000000000000000000000000000..6a83648ca36e2c8feeb78335fccf3f3b82a97d2e --- /dev/null +++ b/gcc/testsuite/g++.dg/vect/vect-early-break_2.cc @@ -0,0 +1,61 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-w -O2" } */ + +void fancy_abort(char *, int, const char *) __attribute__((__noreturn__)); +template struct poly_int_pod { int coeffs[N]; }; +template class poly_int : public poly_int_pod { +public: + template poly_int &operator+=(const poly_int_pod &); +}; +template +template +poly_int &poly_int::operator+=(const poly_int_pod &a) { + for (int i = 0; i < N; i++) + this->coeffs[i] += a.coeffs[i]; + return *this; +} +template +poly_int exact_div(poly_int_pod, Cb) { + poly_int r; + return r; +} +struct vec_prefix { + unsigned m_num; +}; +struct vl_ptr; +struct va_heap { + typedef vl_ptr default_layout; +}; +template +struct vec; +template struct vec { + T &operator[](unsigned); + vec_prefix m_vecpfx; + T m_vecdata[]; +}; +template T 
&vec::operator[](unsigned ix) { + m_vecpfx.m_num ? fancy_abort("", 9, __FUNCTION__), 0 : 0; + return m_vecdata[ix]; +} +template struct vec { + T &operator[](unsigned ix) { return m_vec[ix]; } + vec m_vec; +}; +class auto_vec : public vec, va_heap> {}; +template class vector_builder : public auto_vec {}; +class int_vector_builder : public vector_builder { +public: + int_vector_builder(poly_int<2, long>, int, int); +}; +bool vect_grouped_store_supported() { + int i; + poly_int<2, long> nelt; + int_vector_builder sel(nelt, 2, 3); + for (i = 0; i < 6; i++) + sel[i] += exact_div(nelt, 2); +} + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ diff --git a/gcc/testsuite/g++.dg/vect/vect-early-break_3.cc b/gcc/testsuite/g++.dg/vect/vect-early-break_3.cc new file mode 100644 index 0000000000000000000000000000000000000000..a12e5ca434b2ac37c03dbaa12273fd8e5aa2018c --- /dev/null +++ b/gcc/testsuite/g++.dg/vect/vect-early-break_3.cc @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-w -O2" } */ + +int aarch64_advsimd_valid_immediate_hs_val32; +bool aarch64_advsimd_valid_immediate_hs() { + for (int shift = 0; shift < 32; shift += 8) + if (aarch64_advsimd_valid_immediate_hs_val32 & shift) + return aarch64_advsimd_valid_immediate_hs_val32; + for (;;) + ; +} + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break-run_1.c b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_1.c new file mode 100644 index 0000000000000000000000000000000000000000..2495b36a72eae94cb7abc4a0d17a5c979fd78083 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_1.c @@ -0,0 +1,11 @@ +/* { dg-do run } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-Ofast -save-temps" } */ + +#define N 803 +#define P 0 
+#include "vect-early-break-template_1.c" + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break-run_10.c b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_10.c new file mode 100644 index 0000000000000000000000000000000000000000..9bcd7f7e57ef9a1d4649d18569b3406050e54603 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_10.c @@ -0,0 +1,11 @@ +/* { dg-do run } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-Ofast -save-temps" } */ + +#define N 800 +#define P 799 +#include "vect-early-break-template_2.c" + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break-run_2.c b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_2.c new file mode 100644 index 0000000000000000000000000000000000000000..63f63101a467909f328be7f3acbc5bcb721967ff --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_2.c @@ -0,0 +1,11 @@ +/* { dg-do run } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-Ofast -save-temps" } */ + +#define N 803 +#define P 802 +#include "vect-early-break-template_1.c" + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break-run_3.c b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_3.c new file mode 100644 index 0000000000000000000000000000000000000000..626b95e9b8517081d41d794e9e0264d6301c8589 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_3.c @@ -0,0 +1,11 @@ +/* { dg-do run } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-Ofast -save-temps" } */ + +#define N 803 +#define P 5 +#include "vect-early-break-template_1.c" + +/* { dg-final { scan-tree-dump "LOOP 
VECTORIZED" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break-run_4.c b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_4.c new file mode 100644 index 0000000000000000000000000000000000000000..7e0e6426120551152a7bd800c15d9ed6ab15bada --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_4.c @@ -0,0 +1,11 @@ +/* { dg-do run } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-Ofast -save-temps" } */ + +#define N 803 +#define P 278 +#include "vect-early-break-template_1.c" + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break-run_5.c b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_5.c new file mode 100644 index 0000000000000000000000000000000000000000..242cf486f9c40055df0aef5fd238d1aff7a7c7da --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_5.c @@ -0,0 +1,11 @@ +/* { dg-do run } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-Ofast -save-temps" } */ + +#define N 800 +#define P 799 +#include "vect-early-break-template_1.c" + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break-run_6.c b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_6.c new file mode 100644 index 0000000000000000000000000000000000000000..9fe7136b7213a463ca6573c60476b7c8f531ddcb --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_6.c @@ -0,0 +1,11 @@ +/* { dg-do run } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-Ofast -save-temps" } */ + +#define N 803 +#define P 0 +#include "vect-early-break-template_2.c" + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break-run_7.c 
b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_7.c new file mode 100644 index 0000000000000000000000000000000000000000..02f93d77dba31b938f6fd9e8c7f5e4acde4aeec9 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_7.c @@ -0,0 +1,11 @@ +/* { dg-do run } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-Ofast -save-temps" } */ + +#define N 803 +#define P 802 +#include "vect-early-break-template_2.c" + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break-run_8.c b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_8.c new file mode 100644 index 0000000000000000000000000000000000000000..a614925465606b54c638221ffb95a5e8d3bee797 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_8.c @@ -0,0 +1,11 @@ +/* { dg-do run } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-Ofast -save-temps" } */ + +#define N 803 +#define P 5 +#include "vect-early-break-template_2.c" + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break-run_9.c b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_9.c new file mode 100644 index 0000000000000000000000000000000000000000..94e2b9c301456eda8f9ad7eaa67604563f0afee7 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break-run_9.c @@ -0,0 +1,11 @@ +/* { dg-do run } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-Ofast -save-temps" } */ + +#define N 803 +#define P 278 +#include "vect-early-break-template_2.c" + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break-template_1.c b/gcc/testsuite/gcc.dg/vect/vect-early-break-template_1.c new file mode 100644 index 
0000000000000000000000000000000000000000..af70a8e2a5a9dc9756edb5580f2de02ddcc95de9 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break-template_1.c @@ -0,0 +1,47 @@ +#ifndef N +#define N 803 +#endif + +#ifndef P +#define P 0 +#endif + +unsigned vect_a[N] = {0}; +unsigned vect_b[N] = {0}; + +__attribute__((noipa, noinline)) +unsigned test4(unsigned x) +{ + unsigned ret = 0; + for (int i = 0; i < N; i++) + { + vect_b[i] = x + i; + if (vect_a[i] > x) + break; + vect_a[i] = x; + + } + return ret; +} + +extern void abort (); + +int main () +{ + + int x = 1; + int idx = P; + vect_a[idx] = x + 1; + + test4(x); + + if (vect_b[idx] != (x + idx)) + abort (); + + if (vect_a[idx] != x + 1) + abort (); + + if (idx > 0 && vect_a[idx-1] != x) + abort (); + +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break-template_2.c b/gcc/testsuite/gcc.dg/vect/vect-early-break-template_2.c new file mode 100644 index 0000000000000000000000000000000000000000..d0f924d904437e71567d27cc1f1089e5607dca0d --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break-template_2.c @@ -0,0 +1,50 @@ +#ifndef N +#define N 803 +#endif + +#ifndef P +#define P 0 +#endif + +unsigned vect_a[N] = {0}; +unsigned vect_b[N] = {0}; + +__attribute__((noipa, noinline)) +unsigned test4(unsigned x) +{ + unsigned ret = 0; + for (int i = 0; i < N; i++) + { + vect_b[i] = x + i; + if (vect_a[i] > x) + return i; + vect_a[i] = x; + + } + return ret; +} + +extern void abort (); + +int main () +{ + + int x = 1; + int idx = P; + vect_a[idx] = x + 1; + + unsigned res = test4(x); + + if (res != idx) + abort (); + + if (vect_b[idx] != (x + idx)) + abort (); + + if (vect_a[idx] != x + 1) + abort (); + + if (idx > 0 && vect_a[idx-1] != x) + abort (); + +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_1.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_1.c new file mode 100644 index 0000000000000000000000000000000000000000..51e7d6489b99c25b9b4b3d1c839f98562b6d4dd7 --- /dev/null +++ 
b/gcc/testsuite/gcc.dg/vect/vect-early-break_1.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-Ofast" } */ + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ + +#ifndef N +#define N 803 +#endif +unsigned vect_a[N]; +unsigned vect_b[N]; + +unsigned test4(unsigned x) +{ + unsigned ret = 0; + for (int i = 0; i < N; i++) + { + vect_b[i] = x + i; + if (vect_a[i] > x) + break; + vect_a[i] = x; + + } + return ret; +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_10.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_10.c new file mode 100644 index 0000000000000000000000000000000000000000..9e4ad1763202dfdab3ed7961ead5114fcc61a11b --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_10.c @@ -0,0 +1,28 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-Ofast" } */ + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ + +#ifndef N +#define N 803 +#endif +unsigned vect_a[N]; +unsigned vect_b[N]; + +unsigned test4(unsigned x,int y, int z) +{ + unsigned ret = 0; + for (int i = 0; i < N; i++) + { + vect_b[i] = x + i; + if (vect_a[i] > x) + break; + vect_a[i] = x; + } + + ret = x + y * z; + return ret; +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_11.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_11.c new file mode 100644 index 0000000000000000000000000000000000000000..a613dd9909fb09278dd92a81a24ef854994a9890 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_11.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-Ofast" } */ + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ + +#ifndef N +#define N 803 +#endif +unsigned vect_a[N]; 
+unsigned vect_b[N]; + +unsigned test4(unsigned x, int y) +{ + unsigned ret = 0; +for (int o = 0; o < y; o++) +{ + ret += o; + for (int i = 0; i < N; i++) + { + vect_b[i] = x + i; + if (vect_a[i] > x) + break; + vect_a[i] = x; + + } +} + return ret; +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_12.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_12.c new file mode 100644 index 0000000000000000000000000000000000000000..cc10f3238f1cb8e1307e024a3ebcb5c25a39d1b2 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_12.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-Ofast" } */ + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ + +#ifndef N +#define N 803 +#endif +unsigned vect_a[N]; +unsigned vect_b[N]; + +unsigned test4(unsigned x, int y) +{ + unsigned ret = 0; +for (int o = 0; o < y; o++) +{ + ret += o; + for (int i = 0; i < N; i++) + { + vect_b[i] = x + i; + if (vect_a[i] > x) + return vect_a[i]; + vect_a[i] = x; + + } +} + return ret; +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_13.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_13.c new file mode 100644 index 0000000000000000000000000000000000000000..6967b7395ed7c19e38a436d6edcfe7c1580c7113 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_13.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-Ofast" } */ + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ + +#ifndef N +#define N 803 +#endif +unsigned vect_a[N]; +unsigned vect_b[N]; + +unsigned test4(unsigned x) +{ + unsigned ret = 0; + for (int i = 0; i < N; i++) + { + vect_b[i] = x + i; + if (vect_a[i] > x) + return vect_a[i] * x; + vect_a[i] = x; + + } + return ret; +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_14.c 
b/gcc/testsuite/gcc.dg/vect/vect-early-break_14.c new file mode 100644 index 0000000000000000000000000000000000000000..03cce5cf6cadecb520b46be666bf608e3bc6a511 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_14.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-Ofast" } */ + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ + +#define N 803 +unsigned vect_a[N]; +unsigned vect_b[N]; + +int test4(unsigned x) +{ + unsigned ret = 0; + for (int i = 0; i < N; i++) + { + vect_b[i] = x + i; + if (vect_a[i] > x) + return i; + vect_a[i] += x * vect_b[i]; + + } + return ret; +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_15.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_15.c new file mode 100644 index 0000000000000000000000000000000000000000..dec6872e1115ff66695f5a500ffa7ca01c0f8d3a --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_15.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-Ofast" } */ + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ + +#define N 803 +unsigned vect_a[N]; +unsigned vect_b[N]; + +int test4(unsigned x) +{ + int ret = 0; + for (int i = 0; i < N; i++) + { + vect_b[i] = x + i; + if (vect_a[i] > x) + return i; + vect_a[i] += x * vect_b[i]; + + } + return ret; +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_16.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_16.c new file mode 100644 index 0000000000000000000000000000000000000000..30812d12a39bd94b4b8a3aade6512b162697d659 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_16.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-Ofast" } */ 
+ +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ + +#define N 1024 +unsigned vect_a[N]; +unsigned vect_b[N]; + +unsigned test4(unsigned x) +{ + unsigned ret = 0; + for (int i = 0; i < N; i++) + { + vect_b[i] = x + i; + if (vect_a[i] > x) + return vect_a[i]; + vect_a[i] = x; + ret += vect_a[i] + vect_b[i]; + } + return ret; +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_17.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_17.c new file mode 100644 index 0000000000000000000000000000000000000000..510227a18435a8e47c5a754580180c6d340c0823 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_17.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-Ofast" } */ + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ + +#define N 1024 +unsigned vect_a[N]; +unsigned vect_b[N]; + +unsigned test4(unsigned x) +{ + unsigned ret = 0; + for (int i = 0; i < N; i++) + { + vect_b[i] = x + i; + if (vect_a[i] > x) + return vect_a[i]; + vect_a[i] = x; + ret = vect_a[i] + vect_b[i]; + } + return ret; +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c new file mode 100644 index 0000000000000000000000000000000000000000..7268f6ae2485d0274fd85ea53cc1e44ef4b84d5c --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-Ofast" } */ + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ + +#include + +#define N 1024 +complex double vect_a[N]; +complex double vect_b[N]; + +complex double test4(complex double x) +{ + complex double ret = 0; + for (int i = 0; i < N; i++) + { + vect_b[i] += x + i; + if (vect_a[i] == x) + return i; + vect_a[i] += x * vect_b[i]; + + } + return 
ret; +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_3.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_3.c new file mode 100644 index 0000000000000000000000000000000000000000..3c6d28bd2d6e6e794146baf89e43c3b70293b7d9 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_3.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-Ofast" } */ + +/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */ + +unsigned test4(char x, char *vect, int n) +{ + unsigned ret = 0; + for (int i = 0; i < n; i++) + { + if (vect[i] > x) + return 1; + + vect[i] = x; + } + return ret; +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_4.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_4.c new file mode 100644 index 0000000000000000000000000000000000000000..216c56faf330449bf1969b7e51ff1e94270dc861 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_4.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-Ofast" } */ + +/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */ + +#define N 1024 +unsigned vect[N]; + +unsigned test4(unsigned x) +{ + unsigned ret = 0; + for (int i = 0; i < N; i++) + { + if (i > 16 && vect[i] > x) + break; + + vect[i] = x; + } + return ret; +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_5.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_5.c new file mode 100644 index 0000000000000000000000000000000000000000..4a36d6979db1fd1f97ba2a290f78ac3b84f6de24 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_5.c @@ -0,0 +1,24 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-Ofast" } */ + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" 
"vect" } } */ + +#define N 1024 +unsigned vect_a[N]; +unsigned vect_b[N]; + +unsigned test4(unsigned x) +{ + unsigned ret = 0; + for (int i = 0; i < N; i++) + { + vect_b[i] = x + i; + if (vect_a[i] > x) + return vect_a[i]; + vect_a[i] = x; + } + return ret; +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_6.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_6.c new file mode 100644 index 0000000000000000000000000000000000000000..09632d9afda7e07f1a8417514ef77356f00045bd --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_6.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-Ofast" } */ + +/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */ + +#define N 1024 +unsigned vect_a[N]; +unsigned vect_b[N]; + +unsigned test4(unsigned x) +{ + unsigned ret = 0; + for (int i = 0; i < (N/2); i+=2) + { + vect_b[i] = x + i; + vect_b[i+1] = x + i+1; + if (vect_a[i] > x || vect_a[i+1] > x) + break; + vect_a[i] += x * vect_b[i]; + vect_a[i+1] += x * vect_b[i+1]; + } + return ret; +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c new file mode 100644 index 0000000000000000000000000000000000000000..10fd8b42952c42f3d3a014da103931ca394423d5 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-Ofast" } */ + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ + +#include + +#define N 1024 +complex double vect_a[N]; +complex double vect_b[N]; + +complex double test4(complex double x) +{ + complex double ret = 0; + for (int i = 0; i < N; i++) + { + vect_b[i] += x + i; + if (vect_a[i] == x) + break; + vect_a[i] += x * vect_b[i]; + + } + return ret; +} diff --git 
a/gcc/testsuite/gcc.dg/vect/vect-early-break_8.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_8.c new file mode 100644 index 0000000000000000000000000000000000000000..ae706b2952cfcecf20546a67a735b8d902cbb607 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_8.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-Ofast" } */ + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ + +#include + +#define N 1024 +char vect_a[N]; +char vect_b[N]; + +char test4(char x, char * restrict res) +{ + char ret = 0; + for (int i = 0; i < N; i++) + { + vect_b[i] += x + i; + if (vect_a[i] > x) + break; + vect_a[i] += x * vect_b[i]; + res[i] *= vect_b[i]; + } + return ret; +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_9.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_9.c new file mode 100644 index 0000000000000000000000000000000000000000..350f02f3c7caef457adbe1be802bba51cd818393 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_9.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-additional-options "-Ofast" } */ + +/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */ + +#ifndef N +#define N 803 +#endif +unsigned vect_a[N]; +unsigned vect_b[N]; + +unsigned test4(unsigned x) +{ + unsigned ret = 0; + for (int i = 0; i < N; i++) + { + vect_a[i] = x + i; + if (vect_a[i] > x) + break; + vect_a[i] = x; + + } + return ret; +} diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index 2a058c67c53466fe41b748d37ab660afd4e3403f..49b3099bf5187b1273ff07808692d62f2cfdfa55 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -3659,6 +3659,17 @@ proc check_effective_target_vect_int { } { }}] } +# Return 1 if the target 
supports hardware vectorization of early breaks, +# 0 otherwise. +# +# This won't change for different subtargets so cache the result. + +proc check_effective_target_vect_early_break { } { + return [check_cached_effective_target_indexed vect_early_break { + expr { + [istarget aarch64*-*-*] + }}] +} # Return 1 if the target supports hardware vectorization of complex additions of # byte, 0 otherwise. # diff --git a/gcc/tree-scalar-evolution.h b/gcc/tree-scalar-evolution.h index 0f90207bc733db3cf85979d9b0b962aefa0831d6..5af7d2bba0d62195704a8d41ef6e600327169770 100644 --- a/gcc/tree-scalar-evolution.h +++ b/gcc/tree-scalar-evolution.h @@ -23,6 +23,7 @@ along with GCC; see the file COPYING3. If not see extern tree number_of_latch_executions (class loop *); extern gcond *get_loop_exit_condition (const class loop *); +extern gcond *get_edge_condition (edge); extern void scev_initialize (void); extern bool scev_initialized_p (void); diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc index f75398afb7c9fdf42e69e940e2232942143049f6..105cee39277741535252fea56c17f7c047003ce1 100644 --- a/gcc/tree-scalar-evolution.cc +++ b/gcc/tree-scalar-evolution.cc @@ -884,7 +884,7 @@ scev_dfs::add_to_evolution (tree chrec_before, enum tree_code code, return res; } - + /* Follow the ssa edge into the binary expression RHS0 CODE RHS1. Return true if the strongly connected component has been found. */ @@ -1293,8 +1293,15 @@ scev_dfs::follow_ssa_edge_expr (gimple *at_stmt, tree expr, gcond * get_loop_exit_condition (const class loop *loop) { + return get_edge_condition (normal_exit (loop)); +} + +/* If the statement just before the EXIT_EDGE contains a condition then + return the condition, otherwise NULL. 
*/ + +gcond * +get_edge_condition (edge exit_edge){ gcond *res = NULL; - edge exit_edge = single_exit (loop); if (dump_file && (dump_flags & TDF_SCEV)) fprintf (dump_file, "(get_loop_exit_condition \n "); diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc index 18b0f962670ffc86fe5cc4e633097f3ce45341fe..2a2f3b4acb5905a2ba1f9d3b54a08a55028cd462 100644 --- a/gcc/tree-vect-data-refs.cc +++ b/gcc/tree-vect-data-refs.cc @@ -570,6 +570,144 @@ vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr, return opt_result::success (); } +/* This function tries to validate whether an early break vectorization + is possible for the current instruction sequence. Returns True if + possible, otherwise False. + + Requirements: + - Any memory access must be to a fixed size buffer. + - There must not be any loads and stores to the same object. + - Multiple loads are allowed as long as they don't alias. + + + Arguments: + - LOOP_VINFO: loop information for the current loop. + - CHAIN: Currently detected sequence of instructions that belong + to the current early break. + - LOADS: List of all loads found during traversal. + - BASES: List of all load data references found during traversal. + - GSTMT: Current position to inspect for validity. The sequence + will be moved upwards from this point. */ + +static bool +validate_early_exit_stmts (loop_vec_info loop_vinfo, hash_set *chain, + vec *loads, vec *bases, + gimple_stmt_iterator *gstmt) +{ + if (gsi_end_p (*gstmt)) + return true; + + gimple *stmt = gsi_stmt (*gstmt); + if (gimple_has_ops (stmt)) + { + tree dest = NULL_TREE; + /* Try to find the SSA_NAME being defined. For statements with an LHS + use the LHS, if not, assume that the first argument of a call is the + value being defined. e.g. MASKED_LOAD etc. 
*/ + if (gimple_has_lhs (stmt)) + { + if (is_gimple_assign (stmt)) + dest = gimple_assign_lhs (stmt); + else if (const gcall *call = dyn_cast (stmt)) + dest = gimple_call_lhs (call); + } + else if (const gcall *call = dyn_cast (stmt)) + dest = gimple_arg (call, 0); + + /* Don't move the scalar instructions. */ + bool move + = dest && (VECTOR_TYPE_P (TREE_TYPE (dest)) + || POINTER_TYPE_P (TREE_TYPE (dest))); + + /* If we found the defining statement of something that's part of the + chain then expand the chain with the new SSA_VARs being used. */ + if (chain->contains (dest)) + { + for (unsigned x = 0; x < gimple_num_args (stmt); x++) + if (TREE_CODE (gimple_arg (stmt, x)) == SSA_NAME) + chain->add (gimple_arg (stmt, x)); + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "found chain %G", stmt); + + move = false; + } + + stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (stmt); + if (!stmt_vinfo) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "early breaks only supported. Unknown" + " statement: %G", stmt); + return false; + } + + auto dr_ref = STMT_VINFO_DATA_REF (stmt_vinfo); + if (dr_ref) + { + /* We currently only support statically allocated objects due to + not having first-faulting loads support or peeling for alignment + support. Compute the size of the referenced object (it could be + dynamically allocated). 
*/ + tree obj = DR_BASE_ADDRESS (dr_ref); + if (!obj || TREE_CODE (obj) != ADDR_EXPR) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "early breaks only supported on statically" + " allocated objects.\n"); + return false; + } + + tree refop = TREE_OPERAND (obj, 0); + tree refbase = get_base_address (refop); + if (!refbase || !DECL_P (refbase) || !DECL_SIZE (refbase) + || TREE_CODE (DECL_SIZE (refbase)) != INTEGER_CST) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "early breaks only supported on statically" + " allocated objects.\n"); + return false; + } + + if (!move && DR_IS_READ (dr_ref)) + { + loads->safe_push (dest); + bases->safe_push (dr_ref); + } + else if (DR_IS_WRITE (dr_ref)) + { + for (auto dr : bases) + if (same_data_refs_base_objects (dr, dr_ref)) + return false; + } + } + + if (move) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "analyzing stmt %G", stmt); + + for (tree ref : loads) + if (stmt_may_clobber_ref_p (stmt, ref, true)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "early breaks not supported as memory used" + " may alias.\n"); + return false; + } + } + } + + gsi_prev (gstmt); + return validate_early_exit_stmts (loop_vinfo, chain, loads, bases, gstmt); +} + /* Function vect_analyze_data_ref_dependences. Examine all the data references in the loop, and make sure there do not @@ -614,6 +752,32 @@ vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, return res; } + /* If we have early break statements in the loop, check to see if they + are of a form we can vectorizer. 
*/ + if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) + { + hash_set chain; + auto_vec loads; + auto_vec bases; + + for (gcond *c : LOOP_VINFO_LOOP_CONDS (loop_vinfo)) + { + stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (c); + if (STMT_VINFO_TYPE (loop_cond_info) != loop_exit_ctrl_vec_info_type) + continue; + + gimple *stmt = STMT_VINFO_STMT (loop_cond_info); + gimple_stmt_iterator gsi = gsi_for_stmt (stmt); + + if (!validate_early_exit_stmts (loop_vinfo, &chain, &loads, &bases, + &gsi)) + return opt_result::failure_at (stmt, + "can't safely apply code motion to " + "dependencies of %G to vectorize " + "the early exit.\n", stmt); + } + } + return opt_result::success (); } @@ -2072,7 +2236,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) /* Check if we can possibly peel the loop. */ if (!vect_can_advance_ivs_p (loop_vinfo) - || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)) + || !slpeel_can_duplicate_loop_p (loop_vinfo, normal_exit (loop)) || loop->inner) do_peeling = false; diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index 6ad6372c55eef94a742a8fa35e79d66aa24e2f3b..632fbf38d91c08f54e731d69ab56f2b8c554d456 100644 --- a/gcc/tree-vect-generic.cc +++ b/gcc/tree-vect-generic.cc @@ -500,7 +500,7 @@ expand_vector_comparison (gimple_stmt_iterator *gsi, tree type, tree op0, } t = gimplify_build1 (gsi, VIEW_CONVERT_EXPR, type, t); } - else + else if (TYPE_VECTOR_SUBPARTS (type).is_constant ()) t = expand_vector_piecewise (gsi, do_compare, type, TREE_TYPE (TREE_TYPE (op0)), op0, op1, code, false); diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc index 1d96130c985e2defd141cfdf602224c73b4b41f2..2b121b0c44ea69b0af93f52fd8596f5495867125 100644 --- a/gcc/tree-vect-loop-manip.cc +++ b/gcc/tree-vect-loop-manip.cc @@ -770,7 +770,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop, add_header_seq (loop, header_seq); /* Get a boolean result that tells us whether to iterate. 
*/ - edge exit_edge = single_exit (loop); + edge exit_edge = normal_exit (loop); tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR; tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl)); gcond *cond_stmt = gimple_build_cond (code, test_ctrl, zero_ctrl, @@ -789,7 +789,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop, if (final_iv) { gassign *assign = gimple_build_assign (final_iv, orig_niters); - gsi_insert_on_edge_immediate (single_exit (loop), assign); + gsi_insert_on_edge_immediate (exit_edge, assign); } return cond_stmt; @@ -799,7 +799,8 @@ vect_set_loop_condition_partial_vectors (class loop *loop, loop handles exactly VF scalars per iteration. */ static gcond * -vect_set_loop_condition_normal (class loop *loop, tree niters, tree step, +vect_set_loop_condition_normal (loop_vec_info loop_vinfo, + class loop *loop, tree niters, tree step, tree final_iv, bool niters_maybe_zero, gimple_stmt_iterator loop_cond_gsi) { @@ -807,7 +808,7 @@ vect_set_loop_condition_normal (class loop *loop, tree niters, tree step, gcond *cond_stmt; gcond *orig_cond; edge pe = loop_preheader_edge (loop); - edge exit_edge = single_exit (loop); + edge exit_edge = normal_exit (loop); gimple_stmt_iterator incr_gsi; bool insert_after; enum tree_code code; @@ -872,7 +873,11 @@ vect_set_loop_condition_normal (class loop *loop, tree niters, tree step, In both cases the loop limit is NITERS - STEP. */ gimple_seq seq = NULL; limit = force_gimple_operand (niters, &seq, true, NULL_TREE); - limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step); + /* For VLA leave limit == niters. Though I wonder if maybe I should + force partial loops here and use vect_set_loop_condition_partial_vectors + instead. The problem is that the VL check is useless here. 
*/ + if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo) && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) + limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step); if (seq) { basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); @@ -907,7 +912,8 @@ vect_set_loop_condition_normal (class loop *loop, tree niters, tree step, gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT); /* Record the number of latch iterations. */ - if (limit == niters) + if (limit == niters + || LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) /* Case A: the loop iterates NITERS times. Subtract one to get the latch count. */ loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters, @@ -918,10 +924,17 @@ vect_set_loop_condition_normal (class loop *loop, tree niters, tree step, loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type, limit, step); - if (final_iv) + auto_vec exits = get_loop_exit_edges (loop); + /* For multiple exits we've already maintained LCSSA form and handled + the scalar iteration update in the code that deals with the merge + block and its updated guard. I could move that code here instead + of in vect_update_ivs_after_early_break but I have to still deal + with the updates to the counter `i`. So for now I'll keep them + together. */ + if (final_iv && exits.length () == 1) { gassign *assign; - edge exit = single_exit (loop); + edge exit = normal_exit (loop); gcc_assert (single_pred_p (exit->dest)); tree phi_dest = integer_zerop (init) ? 
final_iv : copy_ssa_name (indx_after_incr); @@ -972,13 +985,15 @@ vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo, gcond *orig_cond = get_loop_exit_condition (loop); gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond); - if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) + if (loop_vinfo + && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_vinfo, niters, final_iv, niters_maybe_zero, loop_cond_gsi); else - cond_stmt = vect_set_loop_condition_normal (loop, niters, step, final_iv, + cond_stmt = vect_set_loop_condition_normal (loop_vinfo, loop, niters, + step, final_iv, niters_maybe_zero, loop_cond_gsi); @@ -1066,7 +1081,7 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge exit, new_exit; bool duplicate_outer_loop = false; - exit = single_exit (loop); + exit = normal_exit (loop); at_exit = (e == exit); if (!at_exit && e != loop_preheader_edge (loop)) return NULL; @@ -1104,11 +1119,11 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, bbs[0] = preheader; new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1); - exit = single_exit (scalar_loop); + exit = normal_exit (scalar_loop); copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs, &exit, 1, &new_exit, NULL, at_exit ? loop->latch : e->src, true); - exit = single_exit (loop); + exit = normal_exit (loop); basic_block new_preheader = new_bbs[0]; /* Before installing PHI arguments make sure that the edges @@ -1176,10 +1191,30 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, add_phi_arg (phi, orig_arg, new_exit, orig_locus); } } + + /* If we have multiple exits, we now need to point the additional exits + from the old loop to the loop pre-header of the new copied loop. + Currently we only support simple early break vectorization so all + additional exits must exit the loop. Additionally we can only place + copies at the end. i.e. we cannot do prologue peeling. 
*/ + auto_vec exits = get_loop_exit_edges (loop); + bool multiple_exits_p = exits.length () > 1; + + for (unsigned i = 1; i < exits.length (); i++) + { + redirect_edge_and_branch (exits[i], new_preheader); + flush_pending_stmts (exits[i]); + } + + /* Main exit must be the last to be rewritten as it's the first phi node + entry. The rest are in array order. */ redirect_edge_and_branch_force (e, new_preheader); flush_pending_stmts (e); + set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src); - if (was_imm_dom || duplicate_outer_loop) + + auto_vec new_exits = get_loop_exit_edges (new_loop); + if ((was_imm_dom || duplicate_outer_loop) && !multiple_exits_p) set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src); /* And remove the non-necessary forwarder again. Keep the other @@ -1189,6 +1224,23 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, delete_basic_block (preheader); set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header, loop_preheader_edge (scalar_loop)->src); + + /* Finally after wiring the new epilogue we need to update its main exit + to the original function exit we recorded. Other exits are already + correct. */ + if (multiple_exits_p) + { + auto_vec doms; + for (edge e : exits) + doms.safe_push (e->dest); + for (edge e : new_exits) + doms.safe_push (e->dest); + doms.safe_push (exit_dest); + /* Likely a fall-through edge, so update if needed. */ + if (single_succ_p (exit_dest)) + doms.safe_push (single_succ (exit_dest)); + iterate_fix_dominators (CDI_DOMINATORS, doms, false); + } } else /* Add the copy at entry. 
*/ { @@ -1310,20 +1362,24 @@ slpeel_add_loop_guard (basic_block guard_bb, tree cond, */ bool -slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e) +slpeel_can_duplicate_loop_p (const loop_vec_info loop_vinfo, const_edge e) { - edge exit_e = single_exit (loop); + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + edge exit_e = normal_exit (loop); edge entry_e = loop_preheader_edge (loop); gcond *orig_cond = get_loop_exit_condition (loop); gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src); unsigned int num_bb = loop->inner? 5 : 2; + if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) + num_bb += 1; + /* All loops have an outer scope; the only case loop->outer is NULL is for the function itself. */ if (!loop_outer (loop) || loop->num_nodes != num_bb || !empty_block_p (loop->latch) - || !single_exit (loop) + || (!single_exit (loop) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) /* Verify that new loop exit condition can be trivially modified. */ || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi)) || (e != exit_e && e != entry_e)) @@ -1528,6 +1584,12 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, gphi_iterator gsi, gsi1; class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); basic_block update_bb = update_e->dest; + + /* For early exits we'll update the IVs in + vect_update_ivs_after_early_break. */ + if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) + return; + basic_block exit_bb = single_exit (loop)->dest; /* Make sure there exists a single-predecessor exit bb: */ @@ -1613,6 +1675,186 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, /* Fix phi expressions in the successor bb. */ adjust_phi_and_debug_stmts (phi1, update_e, ni_name); } + return; +} + +/* Function vect_update_ivs_after_early_break. + + "Advance" the induction variables of LOOP to the value they should take + after the execution of LOOP. This is currently necessary because the + vectorizer does not handle induction variables that are used after the + loop. 
Such a situation occurs when the last iterations of LOOP are + peeled, because of the early exit. With an early exit we always peel the + loop. + + Input: + - LOOP_VINFO - a loop info structure for the loop that is going to be + vectorized. The last few iterations of LOOP were peeled. + - LOOP - a loop that is going to be vectorized. The last few iterations + of LOOP were peeled. + - VF - The loop vectorization factor. + - NITERS_ORIG - the number of iterations that LOOP executes (before it is + vectorized). i.e, the number of times the ivs should be + bumped. + - NITERS_VECTOR - The number of iterations that the vector LOOP executes. + - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path + coming out from LOOP on which there are uses of the LOOP ivs + (this is the path from LOOP->exit to epilog_loop->preheader). + + The new definitions of the ivs are placed in LOOP->exit. + The phi args associated with the edge UPDATE_E in the bb + UPDATE_E->dest are updated accordingly. + + Output: + - If available, the LCSSA phi node for the loop IV temp. + + Assumption 1: Like the rest of the vectorizer, this function assumes + a single loop exit that has a single predecessor. + + Assumption 2: The phi nodes in the LOOP header and in update_bb are + organized in the same order. + + Assumption 3: The access function of the ivs is simple enough (see + vect_can_advance_ivs_p). This assumption will be relaxed in the future. + + Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path + coming out of LOOP on which the ivs of LOOP are used (this is the path + that leads to the epilog loop; other paths skip the epilog loop). This + path starts with the edge UPDATE_E, and its destination (denoted update_bb) + needs to have its phis updated. 
+ */ + +static tree +vect_update_ivs_after_early_break (loop_vec_info loop_vinfo, class loop *, + poly_int64 vf, tree niters_orig, + tree niters_vector, edge update_e) +{ + gphi_iterator gsi, gsi1; + tree ni_name, ivtmp = NULL; + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + basic_block update_bb = update_e->dest; + auto_vec exits = get_loop_exit_edges (loop); + + basic_block exit_bb = exits[0]->dest; + + if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) + return NULL; + + for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb); + !gsi_end_p (gsi) && !gsi_end_p (gsi1); + gsi_next (&gsi), gsi_next (&gsi1)) + { + tree init_expr; + tree step_expr; + tree type; + tree var, ni; + gimple_stmt_iterator last_gsi; + + gphi *phi = gsi1.phi (); + tree phi_ssa = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop)); + gphi *phi1 = as_a (SSA_NAME_DEF_STMT (phi_ssa)); + stmt_vec_info phi_info = loop_vinfo->lookup_stmt (gsi.phi ()); + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "vect_update_ivs_after_early_break: phi: %G", + (gimple *)phi); + + /* Skip reduction and virtual phis. */ + if (!iv_phi_p (phi_info)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "reduc or virtual phi. skip.\n"); + continue; + } + + /* For multiple exits where we handle early exits we need to carry on + with the previous IV as loop iteration was not done because we exited + early. As such just grab the original IV. */ + if (STMT_VINFO_TYPE (phi_info) != undef_vec_info_type) + { + type = TREE_TYPE (gimple_phi_result (phi)); + step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info); + step_expr = unshare_expr (step_expr); + + /* We previously generated the new merged phi in the same BB as the + guard. So use that to perform the scaling on rather than the + normal loop phi which don't take the early breaks into account. 
*/ + init_expr = gimple_phi_result (phi1); //PHI_ARG_DEF_FROM_EDGE (phi1, loop_preheader_edge (loop)); + + ni = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), + fold_convert (TREE_TYPE (step_expr), init_expr), + build_int_cst (TREE_TYPE (step_expr), vf)); + + var = create_tmp_var (type, "tmp"); + + last_gsi = gsi_last_bb (exit_bb); + gimple_seq new_stmts = NULL; + ni_name = force_gimple_operand (ni, &new_stmts, false, var); + /* Exit_bb shouldn't be empty. */ + if (!gsi_end_p (last_gsi)) + gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT); + else + gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT); + + /* Fix phi expressions in the successor bb. */ + adjust_phi_and_debug_stmts (phi, update_e, ni_name); + } + else if (STMT_VINFO_TYPE (phi_info) == undef_vec_info_type) + { + type = TREE_TYPE (gimple_phi_result (phi)); + step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info); + step_expr = unshare_expr (step_expr); + + /* We previously generated the new merged phi in the same BB as the + guard. So use that to perform the scaling on rather than the + normal loop phi which don't take the early breaks into account. */ + init_expr = PHI_ARG_DEF_FROM_EDGE (phi1, loop_preheader_edge (loop)); + + if (vf.is_constant ()) + { + ni = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), + fold_convert (TREE_TYPE (step_expr), + niters_vector), + build_int_cst (TREE_TYPE (step_expr), vf)); + + ni = fold_build2 (MINUS_EXPR, TREE_TYPE (step_expr), + fold_convert (TREE_TYPE (step_expr), + niters_orig), + fold_convert (TREE_TYPE (step_expr), ni)); + } + else + /* If the loop's VF isn't constant then the loop must have been + masked, so at the end of the loop we know we have finished + the entire loop and found nothing. 
*/ + ni = build_zero_cst (TREE_TYPE (step_expr)); + + gcc_assert (TREE_CODE (ni) == INTEGER_CST); + + var = create_tmp_var (type, "tmp"); + + last_gsi = gsi_last_bb (exit_bb); + gimple_seq new_stmts = NULL; + ni_name = force_gimple_operand (ni, &new_stmts, false, var); + /* Exit_bb shouldn't be empty. */ + if (!gsi_end_p (last_gsi)) + gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT); + else + gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT); + + adjust_phi_and_debug_stmts (phi1, update_e, ni_name); + + for (unsigned i = 1; i < exits.length (); i++) + adjust_phi_and_debug_stmts (phi1, exits[i], + build_int_cst (TREE_TYPE (step_expr), + vf)); + ivtmp = gimple_phi_result (phi1); + } + else + continue; + } + + return ivtmp; } /* Return a gimple value containing the misalignment (measured in vector @@ -2096,7 +2338,7 @@ vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo, class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); tree type = TREE_TYPE (niters_vector); tree log_vf = build_int_cst (type, exact_log2 (vf)); - basic_block exit_bb = single_exit (loop)->dest; + basic_block exit_bb = normal_exit (loop)->dest; gcc_assert (niters_vector_mult_vf_ptr != NULL); tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type, @@ -2123,19 +2365,46 @@ find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED, gphi *lcssa_phi) { gphi_iterator gsi; - edge e = single_exit (loop); + edge e = normal_exit (loop); - gcc_assert (single_pred_p (e->dest)); for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi)) { gphi *phi = gsi.phi (); - if (operand_equal_p (PHI_ARG_DEF (phi, 0), - PHI_ARG_DEF (lcssa_phi, 0), 0)) + /* Nested loops with multiple exits can have different no# phi node + arguments between the main loop and epilog as epilog falls to the + second loop. 
*/ + if (gimple_phi_num_args (phi) > e->dest_idx + && operand_equal_p (PHI_ARG_DEF (phi, e->dest_idx), + PHI_ARG_DEF (lcssa_phi, 0), 0)) return PHI_RESULT (phi); } return NULL_TREE; } +/* Starting from the current edge walk all instructions and find the last + VUSE/VDEF in the basic block. */ + +static tree +gimple_find_last_mem_use (edge e) +{ + basic_block bb = e->src; + tree res = NULL; + gimple_stmt_iterator iter = gsi_last_bb (bb); + do + { + gimple *stmt = gsi_stmt (iter); + if ((res = gimple_vdef (stmt))) + return res; + + if ((res = gimple_vuse (stmt))) + return res; + + gsi_prev (&iter); + } while (!gsi_end_p (iter)); + + return NULL; +} + /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND from SECOND/FIRST and puts it at the original loop's preheader/exit edge, the two loops are arranged as below: @@ -2185,6 +2454,7 @@ find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED, static void slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo, class loop *first, class loop *second, + tree *lcssa_ivtmp, bool create_lcssa_for_iv_phis) { gphi_iterator gsi_update, gsi_orig; @@ -2192,10 +2462,18 @@ slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo, edge first_latch_e = EDGE_SUCC (first->latch, 0); edge second_preheader_e = loop_preheader_edge (second); - basic_block between_bb = single_exit (first)->dest; + auto_vec exits = get_loop_exit_edges (first); + basic_block between_bb = exits[0]->dest; + + bool early_exit = LOOP_VINFO_EARLY_BREAKS (loop_vinfo); + /* For early exits when we create the merge BB we must maintain it in + LCSSA form, otherwise the final vectorizer passes will create the + wrong PHI nodes here. 
*/ + create_lcssa_for_iv_phis = create_lcssa_for_iv_phis || early_exit; gcc_assert (between_bb == second_preheader_e->src); - gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb)); + gcc_assert ((single_pred_p (between_bb) && single_succ_p (between_bb)) + || early_exit); /* Either the first loop or the second is the loop to be vectorized. */ gcc_assert (loop == first || loop == second); @@ -2215,10 +2493,40 @@ slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo, { tree new_res = copy_ssa_name (PHI_RESULT (orig_phi)); gphi *lcssa_phi = create_phi_node (new_res, between_bb); - add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION); + + /* The first exit is always the loop latch, so handle that + separately. */ + gcc_assert (arg); + add_phi_arg (lcssa_phi, arg, exits[0], UNKNOWN_LOCATION); + + /* The early exits are processed in order starting from exit 1. */ + for (unsigned i = 1; i < exits.length (); i++) + { + tree phi_arg; + if (gimple_phi_has_vdef (orig_phi)) + phi_arg = gimple_find_last_mem_use (exits[i]); + else + /* For induction values just copy the previous one as the + current iteration did not finish. We'll update as needed + later on. */ + phi_arg = gimple_phi_result (orig_phi); + /* If we didn't find any just copy the existing one and leave + it to the others to fix it up. */ + if (!phi_arg) + phi_arg = gimple_phi_result (orig_phi); + add_phi_arg (lcssa_phi, phi_arg, exits[i], UNKNOWN_LOCATION); + } arg = new_res; } + /* Normally able to distinguish between the iterator counter and the + ivtemps by looking at the STMT_VINFO_TYPE of the phi node. + However, for some reason this isn't consistently set. Is there a + better way??. */ + if (lcssa_ivtmp + && iv_phi_p (vect_phi_info)) + *lcssa_ivtmp = arg; + /* Update PHI node in the second loop by replacing arg on the loop's incoming edge. 
*/ adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg); @@ -2228,7 +2536,8 @@ slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo, for correct vectorization of live stmts. */ if (loop == first) { - basic_block orig_exit = single_exit (second)->dest; + auto_vec new_exits = get_loop_exit_edges (second); + basic_block orig_exit = new_exits[0]->dest; for (gsi_orig = gsi_start_phis (orig_exit); !gsi_end_p (gsi_orig); gsi_next (&gsi_orig)) { @@ -2243,7 +2552,15 @@ slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo, tree new_res = copy_ssa_name (orig_arg); gphi *lcphi = create_phi_node (new_res, between_bb); - add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION); + /* The first exit is always the loop latch, so handle that + separately. */ + add_phi_arg (lcphi, orig_arg, new_exits[0], UNKNOWN_LOCATION); + /* The early exits are processed in order starting from exit 1. */ + for (unsigned i = 1; i < new_exits.length (); i++) + { + tree phi_arg = gimple_phi_result (orig_phi); + add_phi_arg (lcphi, phi_arg, exits[i], UNKNOWN_LOCATION); + } } } } @@ -2393,13 +2710,11 @@ slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog, gcc_assert (single_succ_p (merge_bb)); edge e = single_succ_edge (merge_bb); basic_block exit_bb = e->dest; - gcc_assert (single_pred_p (exit_bb)); - gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest); for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi)) { gphi *update_phi = gsi.phi (); - tree old_arg = PHI_ARG_DEF (update_phi, 0); + tree old_arg = PHI_ARG_DEF (update_phi, e->dest_idx); tree merge_arg = NULL_TREE; @@ -2438,12 +2753,14 @@ static void slpeel_update_phi_nodes_for_lcssa (class loop *epilog) { gphi_iterator gsi; - basic_block exit_bb = single_exit (epilog)->dest; + auto_vec exits = get_loop_exit_edges (epilog); - gcc_assert (single_pred_p (exit_bb)); - edge e = EDGE_PRED (exit_bb, 0); - for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next 
(&gsi)) - rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e)); + for (unsigned i = 0; i < exits.length (); i++) + { + basic_block exit_bb = exits[i]->dest; + for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi)) + rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), exits[i])); + } } /* EPILOGUE_VINFO is an epilogue loop that we now know would need to @@ -2621,6 +2938,14 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, bound_epilog += vf - 1; if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) bound_epilog += 1; + /* For early breaks the scalar loop needs to execute at most VF times + to find the element that caused the break. */ + if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) + { + bound_epilog = vf; + /* Force a scalar epilogue as we can't vectorize the index finding. */ + vect_epilogues = false; + } bool epilog_peeling = maybe_ne (bound_epilog, 0U); poly_uint64 bound_scalar = bound_epilog; @@ -2780,16 +3105,24 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, bound_prolog + bound_epilog) : (!LOOP_REQUIRES_VERSIONING (loop_vinfo) || vect_epilogues)); + + /* We only support early break vectorization on known bounds at this time. + This means that if the vector loop can't be entered then we won't generate + it at all. So for now force skip_vector off because the additional control + flow messes with the BB exits and we've already analyzed them. */ + skip_vector = skip_vector && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo); + /* Epilog loop must be executed if the number of iterations for epilog loop is known at compile time, otherwise we need to add a check at the end of vector loop and skip to the end of epilog loop. */ bool skip_epilog = (prolog_peeling < 0 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || !vf.is_constant ()); - /* PEELING_FOR_GAPS is special because epilog loop must be executed. 
*/ - if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) + /* PEELING_FOR_GAPS and peeling for early breaks are special because epilog + loop must be executed. */ + if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) + || LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) skip_epilog = false; - class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); auto_vec original_counts; basic_block *original_bbs = NULL; @@ -2828,7 +3161,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, if (prolog_peeling) { e = loop_preheader_edge (loop); - if (!slpeel_can_duplicate_loop_p (loop, e)) + if (!slpeel_can_duplicate_loop_p (loop_vinfo, e)) { dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, "loop can't be duplicated to preheader edge.\n"); @@ -2843,7 +3176,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, gcc_unreachable (); } prolog->force_vectorize = false; - slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true); + slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, NULL, true); first_loop = prolog; reset_original_copy_tables (); @@ -2902,11 +3235,13 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, if (epilog_peeling) { - e = single_exit (loop); - if (!slpeel_can_duplicate_loop_p (loop, e)) + auto_vec exits = get_loop_exit_edges (loop); + e = exits[0]; + if (!slpeel_can_duplicate_loop_p (loop_vinfo, e)) { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, - "loop can't be duplicated to exit edge.\n"); + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, + "loop can't be duplicated to exit edge.\n"); gcc_unreachable (); } /* Peel epilog and put it on exit edge of loop. 
If we are vectorizing @@ -2920,12 +3255,16 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e); if (!epilog) { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, - "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n"); + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, + "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n"); gcc_unreachable (); } epilog->force_vectorize = false; - slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false); + + tree early_break_iv_name; + slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, + &early_break_iv_name, false); /* Scalar version loop may be preferred. In this case, add guard and skip to epilog. Note this only happens when the number of @@ -2978,6 +3317,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, vect_gen_vector_loop_niters (loop_vinfo, niters, niters_vector, step_vector, niters_no_overflow); + if (!integer_onep (*step_vector)) { /* On exit from the loop we will have an easy way of calcalating @@ -2987,9 +3327,13 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop (); *niters_vector_mult_vf_var = niters_vector_mult_vf; } + else if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) + vect_gen_vector_loop_niters_mult_vf (loop_vinfo, early_break_iv_name, + &niters_vector_mult_vf); else vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector, &niters_vector_mult_vf); + /* Update IVs of original loop as if they were advanced by niters_vector_mult_vf steps. 
*/ gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo)); @@ -2997,12 +3341,93 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf, update_e); + /* For early breaks we must create a guard to check how many iterations + of the scalar loop are yet to be performed. */ + if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) + { + gcc_assert (early_break_iv_name); + tree ivtmp = + vect_update_ivs_after_early_break (loop_vinfo, epilog, vf, niters, + *niters_vector, update_e); + tree guard_cond = fold_build2 (EQ_EXPR, boolean_type_node, + fold_convert (TREE_TYPE (niters), + ivtmp), + build_zero_cst (TREE_TYPE (niters))); + basic_block guard_bb = normal_exit (loop)->dest; + auto_vec new_exits = get_loop_exit_edges (epilog); + /* If we had a fallthrough edge, the guard will be threaded through + and so we may need to find the actual final edge. */ + edge final_edge = new_exits[0]; + basic_block guard_to; + bool fn_exit_p = false; + if (gsi_end_p (gsi_start_nondebug_bb (final_edge->dest)) + && !gsi_end_p (gsi_start_phis (final_edge->dest)) + && single_succ_p (final_edge->dest)) + { + auto gsi = gsi_start_phis (final_edge->dest); + while (!gsi_end_p (gsi)) + gsi_remove (&gsi, true); + guard_to = final_edge->dest; + fn_exit_p = true; + } + else + guard_to = split_edge (normal_exit (epilog)); + edge guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to, + guard_bb, prob_epilog.invert (), + irred_flag); + basic_block dest = single_succ (guard_to); + /* If we have a single pred then the previous block is the immediate + dominator. This may or may not be the guard bb. However if we + have multiple pred then the guard BB must be the dominator as all + previous exits got rewritten to the guard BB. 
*/ + if (single_pred_p (dest)) + set_immediate_dominator (CDI_DOMINATORS, dest, guard_to); + else + set_immediate_dominator (CDI_DOMINATORS, dest, guard_bb); + + /* We must update all the edges from the new guard_bb. */ + slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e, + final_edge); + + /* If we have an additional functione exit block, then thread the updates + through to the block. Leaving it up to the LCSSA cleanup pass will + get the wrong values here as it can't handle the merge block we just + made correctly. */ + if (fn_exit_p) + { + gphi_iterator gsi_update, gsi_orig, gsi_vect; + for (gsi_orig = gsi_start_phis (epilog->header), + gsi_update = gsi_start_phis (guard_e->dest), + gsi_vect = gsi_start_phis (loop->header); + !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update) + && !gsi_end_p (gsi_vect); + gsi_next (&gsi_orig), gsi_next (&gsi_update), + gsi_next (&gsi_vect)) + { + gphi *orig_phi = gsi_orig.phi (); + gphi *update_phi = gsi_update.phi (); + gphi *vect_phi = gsi_vect.phi (); + stmt_vec_info phi_info = loop_vinfo->lookup_stmt (vect_phi); + + if (iv_phi_p (phi_info)) + continue; + + tree phi_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, update_e); + SET_PHI_ARG_DEF (update_phi, update_e->dest_idx, phi_arg); + + phi_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, guard_e); + SET_PHI_ARG_DEF (update_phi, guard_e->dest_idx, phi_arg); + } + } + flush_pending_stmts (guard_e); + } + if (skip_epilog) { guard_cond = fold_build2 (EQ_EXPR, boolean_type_node, niters, niters_vector_mult_vf); - guard_bb = single_exit (loop)->dest; - guard_to = split_edge (single_exit (epilog)); + guard_bb = normal_exit (loop)->dest; + guard_to = split_edge (normal_exit (epilog)); guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to, skip_vector ? 
anchor : guard_bb, prob_epilog.invert (), @@ -3010,7 +3435,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, if (vect_epilogues) epilogue_vinfo->skip_this_loop_edge = guard_e; slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e, - single_exit (epilog)); + normal_exit (epilog)); /* Only need to handle basic block before epilog loop if it's not the guard_bb, which is the case when skip_vector is true. */ if (guard_bb != bb_before_epilog) @@ -3023,7 +3448,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, } else slpeel_update_phi_nodes_for_lcssa (epilog); - unsigned HOST_WIDE_INT bound; if (bound_scalar.is_constant (&bound)) { @@ -3114,7 +3538,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, adjust_vec.release (); free_original_copy_tables (); - return vect_epilogues ? epilog : NULL; } diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index aacbb12580c89a7a12b11cc096502d669a9e1e21..f37bd10575044c2561f792cb8ba4870becfc4a30 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -844,80 +844,106 @@ vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo) in NUMBER_OF_ITERATIONSM1. Place the condition under which the niter information holds in ASSUMPTIONS. - Return the loop exit condition. */ + Return the loop exit conditions. 
*/ -static gcond * +static vec vect_get_loop_niters (class loop *loop, tree *assumptions, tree *number_of_iterations, tree *number_of_iterationsm1) { - edge exit = single_exit (loop); + auto_vec exits = get_loop_exit_edges (loop); + vec conds; + conds.create (exits.length ()); class tree_niter_desc niter_desc; tree niter_assumptions, niter, may_be_zero; - gcond *cond = get_loop_exit_condition (loop); *assumptions = boolean_true_node; *number_of_iterationsm1 = chrec_dont_know; *number_of_iterations = chrec_dont_know; + DUMP_VECT_SCOPE ("get_loop_niters"); - if (!exit) - return cond; + if (exits.is_empty ()) + return conds; + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n", + exits.length ()); - may_be_zero = NULL_TREE; - if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL) - || chrec_contains_undetermined (niter_desc.niter)) - return cond; + edge exit; + unsigned int i; + FOR_EACH_VEC_ELT (exits, i, exit) + { + gcond *cond = get_edge_condition (exit); + if (cond) + conds.safe_push (cond); - niter_assumptions = niter_desc.assumptions; - may_be_zero = niter_desc.may_be_zero; - niter = niter_desc.niter; + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i); - if (may_be_zero && integer_zerop (may_be_zero)) - may_be_zero = NULL_TREE; + may_be_zero = NULL_TREE; + if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL) + || chrec_contains_undetermined (niter_desc.niter)) + continue; - if (may_be_zero) - { - if (COMPARISON_CLASS_P (may_be_zero)) + niter_assumptions = niter_desc.assumptions; + may_be_zero = niter_desc.may_be_zero; + niter = niter_desc.niter; + + if (may_be_zero && integer_zerop (may_be_zero)) + may_be_zero = NULL_TREE; + + if (may_be_zero) { - /* Try to combine may_be_zero with assumptions, this can simplify - computation of niter expression. 
*/ - if (niter_assumptions && !integer_nonzerop (niter_assumptions)) - niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, - niter_assumptions, - fold_build1 (TRUTH_NOT_EXPR, - boolean_type_node, - may_be_zero)); + if (COMPARISON_CLASS_P (may_be_zero)) + { + /* Try to combine may_be_zero with assumptions, this can simplify + computation of niter expression. */ + if (niter_assumptions && !integer_nonzerop (niter_assumptions)) + niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, + niter_assumptions, + fold_build1 (TRUTH_NOT_EXPR, + boolean_type_node, + may_be_zero)); + else + niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero, + build_int_cst (TREE_TYPE (niter), 0), + rewrite_to_non_trapping_overflow (niter)); + + may_be_zero = NULL_TREE; + } + else if (integer_nonzerop (may_be_zero) && i == 0) + { + *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0); + *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1); + continue; + } else - niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero, - build_int_cst (TREE_TYPE (niter), 0), - rewrite_to_non_trapping_overflow (niter)); + continue; + } - may_be_zero = NULL_TREE; - } - else if (integer_nonzerop (may_be_zero)) + /* Loop assumptions are based off the normal exit. */ + if (i == 0) { - *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0); - *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1); - return cond; + *assumptions = niter_assumptions; + *number_of_iterationsm1 = niter; + + /* We want the number of loop header executions which is the number + of latch executions plus one. + ??? 
For UINT_MAX latch executions this number overflows to zero + for loops like do { n++; } while (n != 0); */ + if (niter && !chrec_contains_undetermined (niter)) + niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), + unshare_expr (niter), + build_int_cst (TREE_TYPE (niter), 1)); + *number_of_iterations = niter; } - else - return cond; } - *assumptions = niter_assumptions; - *number_of_iterationsm1 = niter; - - /* We want the number of loop header executions which is the number - of latch executions plus one. - ??? For UINT_MAX latch executions this number overflows to zero - for loops like do { n++; } while (n != 0); */ - if (niter && !chrec_contains_undetermined (niter)) - niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter), - build_int_cst (TREE_TYPE (niter), 1)); - *number_of_iterations = niter; + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n"); - return cond; + return conds; } /* Function bb_in_loop_p @@ -1455,7 +1481,8 @@ vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo) Verify that certain CFG restrictions hold, including: - the loop has a pre-header - - the loop has a single entry and exit + - the loop has a single entry + - nested loops can have only a single exit. - the loop exit condition is simple enough - the number of iterations can be analyzed, i.e, a countable loop. The niter could be analyzed under some assumptions. 
*/ @@ -1484,11 +1511,6 @@ vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info) | (exit-bb) */ - if (loop->num_nodes != 2) - return opt_result::failure_at (vect_location, - "not vectorized:" - " control flow in loop.\n"); - if (empty_block_p (loop->header)) return opt_result::failure_at (vect_location, "not vectorized: empty loop.\n"); @@ -1559,11 +1581,13 @@ vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info) dump_printf_loc (MSG_NOTE, vect_location, "Considering outer-loop vectorization.\n"); info->inner_loop_cond = inner.loop_cond; + + if (!single_exit (loop)) + return opt_result::failure_at (vect_location, + "not vectorized: multiple exits.\n"); + } - if (!single_exit (loop)) - return opt_result::failure_at (vect_location, - "not vectorized: multiple exits.\n"); if (EDGE_COUNT (loop->header->preds) != 2) return opt_result::failure_at (vect_location, "not vectorized:" @@ -1579,21 +1603,45 @@ vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info) "not vectorized: latch block not empty.\n"); /* Make sure the exit is not abnormal. */ - edge e = single_exit (loop); - if (e->flags & EDGE_ABNORMAL) + auto_vec exits = get_loop_exit_edges (loop); + edge nexit = normal_exit (loop); + for (edge e : exits) + { + if (e->flags & EDGE_ABNORMAL) + return opt_result::failure_at (vect_location, + "not vectorized:" + " abnormal loop exit edge.\n"); + /* Early break BB must be after the main exit BB. In theory we should + be able to vectorize the inverse order, but the current flow in + the vectorizer always assumes you update success PHI nodes, not + preds. */ + if (e != nexit && !dominated_by_p (CDI_DOMINATORS, nexit->src, e->src)) + return opt_result::failure_at (vect_location, + "not vectorized:" + " abnormal loop exit edge order.\n"); + } + + if (exits.length () > 2) return opt_result::failure_at (vect_location, "not vectorized:" - " abnormal loop exit edge.\n"); - - info->loop_cond
Only 1 additional exit" + " supported.\n"); + if (loop->num_nodes != 2 + exits.length () - 1) + return opt_result::failure_at (vect_location, + "not vectorized:" + " unsupported control flow in loop.\n"); + info->conds = vect_get_loop_niters (loop, &info->assumptions, &info->number_of_iterations, &info->number_of_iterationsm1); - if (!info->loop_cond) + + if (info->conds.is_empty ()) return opt_result::failure_at (vect_location, "not vectorized: complicated exit condition.\n"); + info->loop_cond = info->conds[0]; + if (integer_zerop (info->assumptions) || !info->number_of_iterations || chrec_contains_undetermined (info->number_of_iterations)) @@ -1638,8 +1686,22 @@ vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared, if (!integer_onep (info->assumptions) && !main_loop_info) LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions; - stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond); - STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type; + bool loop_iv_cond = true; + for (gcond *cond : info->conds) + { + if (!loop_iv_cond) + { + stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond); + STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type; + LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (cond); + } + loop_iv_cond = false; + } + + /* Check to see if we're vectorizing multiple exits. */ + LOOP_VINFO_EARLY_BREAKS (loop_vinfo) + = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty (); + if (info->inner_loop_cond) { stmt_vec_info inner_loop_cond_info @@ -2270,10 +2332,13 @@ vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo, bool need_peeling_or_partial_vectors_p = vect_need_peeling_or_partial_vectors_p (loop_vinfo); - /* Decide whether to vectorize the loop with partial vectors. */ + /* Decide whether to vectorize the loop with partial vectors. Currently + early break vectorization does not support partial vectors as we have + to peel a scalar loop that we can't vectorize. 
*/ LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false; LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false; if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) + && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo) && need_peeling_or_partial_vectors_p) { /* For partial-vector-usage=1, try to push the handling of partial @@ -2746,13 +2811,14 @@ start_over: /* If an epilogue loop is required make sure we can create one. */ if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) - || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)) + || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) + || LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) { if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n"); if (!vect_can_advance_ivs_p (loop_vinfo) - || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo), - single_exit (LOOP_VINFO_LOOP + || !slpeel_can_duplicate_loop_p (loop_vinfo, + normal_exit (LOOP_VINFO_LOOP (loop_vinfo)))) { ok = opt_result::failure_at (vect_location, @@ -3239,6 +3305,8 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) "***** Choosing vector mode %s\n", GET_MODE_NAME (first_loop_vinfo->vector_mode)); + loop_form_info.conds.release (); + /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is enabled, SIMDUID is not set, it is the innermost loop and we have either already found the loop's SIMDLEN or there was no SIMDLEN to @@ -3350,6 +3418,8 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) (first_loop_vinfo->epilogue_vinfos[0]->vector_mode)); } + loop_form_info.conds.release (); + return first_loop_vinfo; } @@ -5407,7 +5477,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, basic_block exit_bb; tree scalar_dest; tree scalar_type; - gimple *new_phi = NULL, *phi; + gimple *new_phi = NULL, *phi = NULL; gimple_stmt_iterator exit_gsi; tree new_temp = NULL_TREE, new_name, new_scalar_dest; gimple *epilog_stmt = NULL; @@ -5629,7 +5699,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, Store 
them in NEW_PHIS. */ if (double_reduc) loop = outer_loop; - exit_bb = single_exit (loop)->dest; + exit_bb = normal_exit (loop)->dest; exit_gsi = gsi_after_labels (exit_bb); reduc_inputs.create (slp_node ? vec_num : ncopies); for (unsigned i = 0; i < vec_num; i++) @@ -5645,10 +5715,37 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, phi = create_phi_node (new_def, exit_bb); if (j) def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]); - SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def); + SET_PHI_ARG_DEF (phi, normal_exit (loop)->dest_idx, def); new_def = gimple_convert (&stmts, vectype, new_def); reduc_inputs.quick_push (new_def); } + /* Update the other exits. */ + if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) + { + auto_vec exits = get_loop_exit_edges (loop); + gphi_iterator gsi, gsi1; + for (unsigned i = 1; i < exits.length (); i++) + { + /* Find the phi node to propagate into the exit block for each + exit edge. */ + edge exit = exits[i]; + for (gsi = gsi_start_phis (exit_bb), + gsi1 = gsi_start_phis (exit->src); + !gsi_end_p (gsi) && !gsi_end_p (gsi1); + gsi_next (&gsi), gsi_next (&gsi1)) + { + /* There really should be a function to just get the number + of phis inside a bb. */ + if (phi && phi == gsi.phi ()) + { + gphi *phi1 = gsi1.phi (); + SET_PHI_ARG_DEF (phi, exit->dest_idx, + PHI_RESULT (phi1)); + break; + } + } + } + } gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); } @@ -9358,8 +9455,22 @@ vectorizable_induction (loop_vec_info loop_vinfo, dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n"); pe = loop_preheader_edge (iv_loop); - /* Find the first insertion point in the BB. */ + /* Find the first insertion point in the BB. If we're vectorizing a block that + contains an early exit, the IV needs to be materialized in the fall + through block as we shouldn't update the IV if we're exiting. 
*/ basic_block bb = gimple_bb (phi); + if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) + { + for (gcond *cond : LOOP_VINFO_LOOP_CONDS (loop_vinfo)) + { + basic_block cond_bb = gimple_bb (cond); + if (cond_bb == bb) + { + bb = FALLTHRU_EDGE (cond_bb)->dest; + break; + } + } + } si = gsi_after_labels (bb); /* For SLP induction we have to generate several IVs as for example @@ -9999,13 +10110,24 @@ vectorizable_live_operation (vec_info *vinfo, new_tree = lane_extract ; lhs' = new_tree; */ + /* When vectorizing an early break, any live statement that is used + outside of the loop are dead. The loop will never get to them. + We could change the liveness value during analysis instead but since + the below code is invalid anyway just ignore it during codegen. */ + if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) + return true; + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); - basic_block exit_bb = single_exit (loop)->dest; + basic_block exit_bb = normal_exit (loop)->dest; gcc_assert (single_pred_p (exit_bb)); tree vec_lhs_phi = copy_ssa_name (vec_lhs); gimple *phi = create_phi_node (vec_lhs_phi, exit_bb); - SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs); + /* For early exits we need to compute the right exit. The current + approach punts to a scalar loop instead. If we were to vectorize + the exit condition below needs to take into account the difference + between a `break` edge and a `return` edge. 
+ SET_PHI_ARG_DEF (phi, normal_exit (loop)->dest_idx, vec_lhs); gimple_seq stmts = NULL; tree new_tree; @@ -10444,7 +10566,8 @@ scale_profile_for_vect_loop (class loop *loop, unsigned vf) scale_loop_frequencies (loop, p); } - edge exit_e = single_exit (loop); + edge exit_e = normal_exit (loop); + exit_e->probability = profile_probability::always () / (new_est_niter + 1); edge exit_l = single_pred_edge (loop->latch); @@ -10747,6 +10870,115 @@ update_epilogue_loop_vinfo (class loop *epilogue, tree advance) epilogue_vinfo->shared->save_datarefs (); } +/* When vectorizing early break statements, instructions that happen before + the early break in the current BB need to be moved to after the early + break. This function deals with that and assumes that any validity + checks have already been performed. + + While moving the instructions if it encounters a VUSE or VDEF it then + corrects the VUSES as it moves the statements along. CHAINED contains + the list of SSA_NAMES that belong to the dependency chain of the early + break conditional. GDEST is the location in which to insert the new + statements. GSTMT is the iterator to walk up to find statements to + consider moving. REACHING_VUSE contains the dominating VUSE found so far + and CURRENT_VDEF contains the last VDEF we've seen. These are updated in + pre-order and updated in post-order after moving the instruction. */ + +static void +move_early_exit_stmts (hash_set *chained, gimple_stmt_iterator *gdest, + gimple_stmt_iterator *gstmt, tree *reaching_vuse, + tree *current_vdef) +{ + if (gsi_end_p (*gstmt)) + return; + + gimple *stmt = gsi_stmt (*gstmt); + if (gimple_has_ops (stmt)) + { + tree dest = NULL_TREE; + /* Try to find the SSA_NAME being defined. For Statements with an LHS + use the LHS, if not, assume that the first argument of a call is the + value being defined. e.g. MASKED_LOAD etc. 
*/ + if (gimple_has_lhs (stmt)) + { + if (is_gimple_assign (stmt)) + dest = gimple_assign_lhs (stmt); + else if (const gcall *call = dyn_cast (stmt)) + dest = gimple_call_lhs (call); + } + else if (const gcall *call = dyn_cast (stmt)) + dest = gimple_arg (call, 0); + + /* Don't move the scalar instructions. */ + bool move = dest != NULL_TREE; + + /* If we found the defining statement of a something that's part of the + chain then expand the chain with the new SSA_VARs being used. */ + if (chained->contains (dest)) + { + for (unsigned x = 0; x < gimple_num_args (stmt); x++) + if (TREE_CODE (gimple_arg (stmt, x)) == SSA_NAME) + chained->add (gimple_arg (stmt, x)); + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "found chain %G", stmt); + update_stmt (stmt); + move = false; + } + + if (move) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "moving stmt %G", stmt); + gsi_move_before (gstmt, gdest); + gsi_prev (gdest); + tree vdef = gimple_vdef (stmt); + + /* If we've moved a VDEF, extract the defining MEM and update + usages of it. TODO: I think this may need some constraints? 
*/ + if (vdef) + { + *current_vdef = vdef; + *reaching_vuse = gimple_vuse (stmt); + imm_use_iterator imm_iter; + gimple *use_stmt; + FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, vdef) + { + if (!is_a (use_stmt)) + continue; + gphi *phi_stmt = as_a (use_stmt); + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "updating vuse %G", use_stmt); + + for (unsigned i = 0; i < gimple_phi_num_args (phi_stmt); i++) + if (gimple_phi_arg_def (phi_stmt, i) == vdef) + { + SET_USE (PHI_ARG_DEF_PTR (phi_stmt, i), gimple_vuse (stmt)); + break; + } + } + } + update_stmt (stmt); + } + } + + gsi_prev (gstmt); + move_early_exit_stmts (chained, gdest, gstmt, reaching_vuse, current_vdef); + + if (gimple_vuse (stmt) + && reaching_vuse && *reaching_vuse + && gimple_vuse (stmt) == *current_vdef) + { + unlink_stmt_vdef (stmt); + gimple_set_vuse (stmt, *reaching_vuse); + update_stmt (stmt); + } +} + /* Function vect_transform_loop. The analysis phase has determined that the loop is vectorizable. @@ -10793,7 +11025,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) /* Make sure there exists a single-predecessor exit bb. Do this before versioning. */ edge e = single_exit (loop); - if (! single_pred_p (e->dest)) + if (e && ! single_pred_p (e->dest)) { split_loop_exit_edge (e, true); if (dump_enabled_p ()) @@ -10819,7 +11051,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)) { e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)); - if (! single_pred_p (e->dest)) + if (e && ! single_pred_p (e->dest)) { split_loop_exit_edge (e, true); if (dump_enabled_p ()) @@ -10885,6 +11117,26 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo)); } + /* Handle any code motion that we need to for early-break vectorization after + we've done peeling but just before we start vectorizing. 
*/ + if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) + { + hash_set chained; + for (gcond *cond : LOOP_VINFO_LOOP_CONDS (loop_vinfo)) + { + basic_block cond_bb = gimple_bb (cond); + basic_block dest_bb = FALLTHRU_EDGE (cond_bb)->dest; + gimple_stmt_iterator dest_gsi = gsi_start_bb (dest_bb); + + gimple_stmt_iterator gsi2 = gsi_for_stmt (cond); + for (unsigned i = 0; i < gimple_num_ops (cond); i++) + chained.add (gimple_op (cond, i)); + tree vdef; + tree vuse = gimple_vuse (cond); + move_early_exit_stmts (&chained, &dest_gsi, &gsi2, &vuse, &vdef); + } + } + /* FORNOW: the vectorizer supports only loops which body consist of one basic block (header + empty latch). When the vectorizer will support more involved loop forms, the order by which the BBs are @@ -11152,7 +11404,8 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) /* Loops vectorized with a variable factor won't benefit from unrolling/peeling. */ - if (!vf.is_constant ()) + if (!vf.is_constant () + && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) { loop->unroll = 1; if (dump_enabled_p ()) diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 5485da58b38a0db2ea1a357ee8647ae47b563a8f..6a3a6065ce2ee7dd53ebf83c8796d57fc9805750 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -342,9 +342,28 @@ vect_stmt_relevant_p (stmt_vec_info stmt_info, loop_vec_info loop_vinfo, *live_p = false; /* cond stmt other than loop exit cond. */ - if (is_ctrl_stmt (stmt_info->stmt) - && STMT_VINFO_TYPE (stmt_info) != loop_exit_ctrl_vec_info_type) - *relevant = vect_used_in_scope; + if (is_ctrl_stmt (stmt_info->stmt)) + { + /* Ideally EDGE_LOOP_EXIT would have been set on the exit edge, but + it looks like loop_manip doesn't do that.. So we have to do it + the hard way. 
*/ + basic_block bb = gimple_bb (stmt_info->stmt); + basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); + edge exit = BRANCH_EDGE (bb); + unsigned nbbs = loop->num_nodes; + bool exit_bb = true; + for (unsigned i = 0; i < nbbs; i++) + { + if (exit->dest == bbs[i]) + { + exit_bb = false; + break; + } + } + + if (exit_bb) + *relevant = vect_used_in_scope; + } /* changing memory. */ if (gimple_code (stmt_info->stmt) != GIMPLE_PHI) @@ -357,6 +376,11 @@ vect_stmt_relevant_p (stmt_vec_info stmt_info, loop_vec_info loop_vinfo, *relevant = vect_used_in_scope; } + auto_vec exits = get_loop_exit_edges (loop); + auto_bitmap exit_bbs; + for (edge exit : exits) + bitmap_set_bit (exit_bbs, exit->dest->index); + /* uses outside the loop. */ FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF) { @@ -375,7 +399,7 @@ vect_stmt_relevant_p (stmt_vec_info stmt_info, loop_vec_info loop_vinfo, /* We expect all such uses to be in the loop exit phis (because of loop closed form) */ gcc_assert (gimple_code (USE_STMT (use_p)) == GIMPLE_PHI); - gcc_assert (bb == single_exit (loop)->dest); + gcc_assert (bitmap_bit_p (exit_bbs, bb->index)); *live_p = true; } @@ -10790,6 +10814,223 @@ vectorizable_condition (vec_info *vinfo, return true; } +/* Check to see if the current early break given in STMT_INFO is valid for + vectorization. */ + +static bool +vectorizable_early_exit (vec_info *vinfo, + stmt_vec_info stmt_info, slp_tree /* slp_node */, + slp_instance /* slp_node_instance */, + stmt_vector_for_cost * /* cost_vec */) +{ + loop_vec_info loop_vinfo = dyn_cast (vinfo); + + if (!loop_vinfo + || !is_a (STMT_VINFO_STMT (stmt_info))) + return false; + + if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_early_exit_def) + return false; + + /* refactor and merge with vect_transform_early_break. 
*/ + gimple_match_op op; + if (!gimple_extract_op (stmt_info->stmt, &op)) + gcc_unreachable (); + gcc_assert (op.code.is_tree_code ()); + + tree vectype_in = STMT_VINFO_VECTYPE (stmt_info); + gcc_assert (vectype_in); + + stmt_vec_info operand0_info + = loop_vinfo->lookup_stmt (SSA_NAME_DEF_STMT (op.ops[0])); + if (!operand0_info) + return false; + tree vectype_op = STMT_VINFO_VECTYPE (operand0_info); + + //tree vectype = STMT_VINFO_VECTYPE (stmt_info); + tree truth_type = truth_type_for (vectype_op); + machine_mode mode = TYPE_MODE (truth_type); + if (direct_optab_handler (cbranch_optab, mode) == CODE_FOR_nothing) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "can't vectorize early exit because the " + "target doesn't support flag setting vector " + "comparisons.\n"); + return false; + } + + if (!expand_vec_cmp_expr_p (vectype_op, truth_type, NE_EXPR)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "can't vectorize early exit because the " + "target does not support boolean vector " + "comparisons for type %T.\n", truth_type); + return false; + } + + if (direct_optab_handler (ior_optab, mode) == CODE_FOR_nothing) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "can't vectorize early exit because the " + "target does not support boolean vector OR for " + "type %T.\n", truth_type); + return false; + } + + return true; +} + + +/* Transform the definition stmt STMT_INFO of an early exit + value. 
*/ + +static bool +vect_transform_early_break (loop_vec_info loop_vinfo, + stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + gimple **vec_stmt, slp_tree slp_node) +{ + tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); + int i; + int ncopies, nopcopies; + int vec_num; + stmt_vec_info operand0_info; + + if (!STMT_VINFO_RELEVANT_P (stmt_info)) + return false; + + gimple_match_op op; + if (!gimple_extract_op (stmt_info->stmt, &op)) + gcc_unreachable (); + gcc_assert (op.code.is_tree_code ()); + auto code = tree_code (op.code); + + tree vectype_in = STMT_VINFO_VECTYPE (stmt_info); + gcc_assert (vectype_in); + + operand0_info = loop_vinfo->lookup_stmt (SSA_NAME_DEF_STMT (op.ops[0])); + tree vectype_op = STMT_VINFO_VECTYPE (operand0_info); + + if (slp_node) + { + ncopies = 1; + nopcopies = 1; + vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); + } + else + { + /* Because of the difference between types when doing a boolean reduction + we must determine the number of copies based on the type operands of + operands of the comparison and not the comparison itself. */ + ncopies = vect_get_num_copies (loop_vinfo, vectype_in); + nopcopies = vect_get_num_copies (loop_vinfo, vectype_op); + vec_num = 1; + } + + vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); + bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); + + if (ncopies != nopcopies) + vectype_out = vectype_op; + + /* Transform. 
*/ + tree new_temp = NULL_TREE; + auto_vec<tree> vec_oprnds0; + auto_vec<tree> vec_oprnds1; + tree def0; + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, "transform early-exit.\n"); + + gimple *stmt = STMT_VINFO_STMT (stmt_info); + basic_block cond_bb = gimple_bb (stmt); + gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb); + + vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, nopcopies, + op.ops[0], &vec_oprnds0, vectype_out, + op.ops[1], &vec_oprnds1, vectype_out, + NULL, NULL, NULL); + + gimple *new_stmt = NULL; + tree cst_0 = build_zero_cst (truth_type_for (vectype_out)); + tree cst_m1 = build_minus_one_cst (truth_type_for (vectype_out)); + + FOR_EACH_VEC_ELT (vec_oprnds0, i, def0) + { + tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE }; + { + tree cond + = make_temp_ssa_name (truth_type_for (vectype_out), NULL, "mask"); + gimple *vec_cmp = gimple_build_assign (cond, code, vop[0], vop[1]); + vect_finish_stmt_generation (loop_vinfo, stmt_info, vec_cmp, + &cond_gsi); + if (masked_loop_p) + { + tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies, + vectype_op, i); + cond = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask, + cond, &cond_gsi); + } + + new_temp + = make_temp_ssa_name (truth_type_for (vectype_out), NULL, "vexit"); + new_stmt = gimple_build_assign (new_temp, VEC_COND_EXPR, + cond, cst_m1, cst_0); + vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, + &cond_gsi); + } + + if (slp_node) + SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt); + else + STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); + } + + /* Determine if we need to reduce the final value. */ + if (ncopies != nopcopies) + { + vec<gimple *> stmts; + if (slp_node) + stmts = SLP_TREE_VEC_STMTS (slp_node); + else + stmts = STMT_VINFO_VEC_STMTS (stmt_info); + + /* We build the reductions in a way to maintain as much parallelism as + possible.
*/ + auto_vec<gimple *> workset (stmts.length ()); + workset.splice (stmts); + while (workset.length () > 1) + { + new_temp = make_temp_ssa_name (truth_type_for (vectype_out), NULL, + "vexit_reduc"); + gimple *arg0 = workset.pop (); + gimple *arg1 = workset.pop (); + new_stmt = gimple_build_assign (new_temp, BIT_IOR_EXPR, + gimple_assign_lhs (arg0), + gimple_assign_lhs (arg1)); + vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, + &cond_gsi); + workset.quick_insert (0, new_stmt); + } + } + + gcc_assert (new_stmt); + tree lhs = gimple_assign_lhs (new_stmt); + + tree t = fold_build2 (NE_EXPR, boolean_type_node, lhs, + build_zero_cst (truth_type_for (vectype_out))); + t = canonicalize_cond_expr_cond (t); + gimple_cond_set_condition_from_tree ((gcond*)stmt, t); + update_stmt (stmt); + + if (!slp_node) + *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0]; + + return true; +} + /* vectorizable_comparison. Check if STMT_INFO is comparison expression that can be vectorized. @@ -11167,11 +11408,14 @@ vect_analyze_stmt (vec_info *vinfo, node_instance, cost_vec); if (!res) return res; - } + } + else if (is_ctrl_stmt (stmt_info->stmt)) + STMT_VINFO_DEF_TYPE (stmt_info) = vect_early_exit_def; switch (STMT_VINFO_DEF_TYPE (stmt_info)) { case vect_internal_def: + case vect_early_exit_def: break; case vect_reduction_def: @@ -11204,6 +11448,7 @@ vect_analyze_stmt (vec_info *vinfo, { gcall *call = dyn_cast <gcall *> (stmt_info->stmt); gcc_assert (STMT_VINFO_VECTYPE (stmt_info) + || gimple_code (stmt_info->stmt) == GIMPLE_COND || (call && gimple_call_lhs (call) == NULL_TREE)); *need_to_vectorize = true; } @@ -11246,7 +11491,9 @@ vect_analyze_stmt (vec_info *vinfo, || vectorizable_lc_phi (as_a <loop_vec_info> (vinfo), stmt_info, NULL, node) || vectorizable_recurr (as_a <loop_vec_info> (vinfo), - stmt_info, NULL, node, cost_vec)); + stmt_info, NULL, node, cost_vec) + || vectorizable_early_exit (vinfo, stmt_info, + node, node_instance, cost_vec)); else { if (bb_vinfo) @@ -11269,7 +11516,10 @@ vect_analyze_stmt (vec_info *vinfo,
NULL, NULL, node, cost_vec) || vectorizable_comparison (vinfo, stmt_info, NULL, NULL, node, cost_vec) - || vectorizable_phi (vinfo, stmt_info, NULL, node, cost_vec)); + || vectorizable_phi (vinfo, stmt_info, NULL, node, cost_vec) + || vectorizable_early_exit (vinfo, stmt_info, node, + node_instance, cost_vec)); + } if (node) @@ -11427,6 +11677,12 @@ vect_transform_stmt (vec_info *vinfo, gcc_assert (done); break; + case loop_exit_ctrl_vec_info_type: + done = vect_transform_early_break (as_a <loop_vec_info> (vinfo), stmt_info, + gsi, &vec_stmt, slp_node); + gcc_assert (done); + break; + default: if (!STMT_VINFO_LIVE_P (stmt_info)) { @@ -11825,6 +12081,9 @@ vect_is_simple_use (tree operand, vec_info *vinfo, enum vect_def_type *dt, case vect_first_order_recurrence: dump_printf (MSG_NOTE, "first order recurrence\n"); break; + case vect_early_exit_def: + dump_printf (MSG_NOTE, "early exit\n"); + break; case vect_unknown_def_type: dump_printf (MSG_NOTE, "unknown\n"); break; @@ -12504,6 +12763,8 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info, *nunits_vectype_out = NULL_TREE; if (gimple_get_lhs (stmt) == NULL_TREE + /* Allow vector conditionals through here. */ + && !is_ctrl_stmt (stmt) /* MASK_STORE has no lhs, but is ok.
*/ && !gimple_call_internal_p (stmt, IFN_MASK_STORE)) { @@ -12520,7 +12781,7 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info, } return opt_result::failure_at (stmt, - "not vectorized: irregular stmt.%G", stmt); + "not vectorized: irregular stmt: %G", stmt); } tree vectype; @@ -12549,6 +12810,8 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info, scalar_type = TREE_TYPE (DR_REF (dr)); else if (gimple_call_internal_p (stmt, IFN_MASK_STORE)) scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3)); + else if (is_ctrl_stmt (stmt)) + scalar_type = TREE_TYPE (gimple_cond_rhs (stmt)); else scalar_type = TREE_TYPE (gimple_get_lhs (stmt)); diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 016961da8510ca7dd2d07e716cbe35623ed2d9a5..3f4fc90c7ada9713b9ce58f86272648dab0ca6b7 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -63,6 +63,7 @@ enum vect_def_type { vect_internal_def, vect_induction_def, vect_reduction_def, + vect_early_exit_def, vect_double_reduction_def, vect_nested_cycle, vect_first_order_recurrence, @@ -836,6 +837,13 @@ public: we need to peel off iterations at the end to form an epilogue loop. */ bool peeling_for_niter; + /* When the loop has early breaks that we can vectorize we need to peel + the loop for the break finding loop. */ + bool early_breaks; + + /* List of loop IV conditionals found in the loop. */ + auto_vec<gcond *> conds; + /* True if there are no loop carried data dependencies in the loop.
If loop->safelen <= 1, then this is always true, either the loop didn't have any loop carried data dependencies, or the loop is being @@ -921,6 +929,8 @@ public: #define LOOP_VINFO_REDUCTION_CHAINS(L) (L)->reduction_chains #define LOOP_VINFO_PEELING_FOR_GAPS(L) (L)->peeling_for_gaps #define LOOP_VINFO_PEELING_FOR_NITER(L) (L)->peeling_for_niter +#define LOOP_VINFO_EARLY_BREAKS(L) (L)->early_breaks +#define LOOP_VINFO_LOOP_CONDS(L) (L)->conds #define LOOP_VINFO_NO_DATA_DEPENDENCIES(L) (L)->no_data_dependencies #define LOOP_VINFO_SCALAR_LOOP(L) (L)->scalar_loop #define LOOP_VINFO_SCALAR_LOOP_SCALING(L) (L)->scalar_loop_scaling @@ -970,7 +980,7 @@ public: typedef opt_pointer_wrapper <loop_vec_info> opt_loop_vec_info; static inline loop_vec_info -loop_vec_info_for_loop (class loop *loop) +loop_vec_info_for_loop (const class loop *loop) { return (loop_vec_info) loop->aux; } @@ -2107,7 +2117,7 @@ class auto_purge_vect_location in tree-vect-loop-manip.cc. */ extern void vect_set_loop_condition (class loop *, loop_vec_info, tree, tree, tree, bool); -extern bool slpeel_can_duplicate_loop_p (const class loop *, const_edge); +extern bool slpeel_can_duplicate_loop_p (const loop_vec_info, const_edge); class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class loop *, class loop *, edge); class loop *vect_loop_versioning (loop_vec_info, gimple *); @@ -2306,6 +2316,7 @@ struct vect_loop_form_info tree number_of_iterations; tree number_of_iterationsm1; tree assumptions; + vec<gcond *> conds; gcond *loop_cond; gcond *inner_loop_cond; }; diff --git a/gcc/tree-vectorizer.cc b/gcc/tree-vectorizer.cc index 6ec49511d74bd2e0e5dd51823a6c41180f08716c..4aa46c7c0d8235d3b783ce930e5df3480e1b3ef9 100644 --- a/gcc/tree-vectorizer.cc +++ b/gcc/tree-vectorizer.cc @@ -1382,7 +1382,9 @@ pass_vectorize::execute (function *fun) predicates that need to be shared for optimal predicate usage. However reassoc will re-order them and prevent CSE from working as it should. CSE only the loop body, not the entry.
*/ - bitmap_set_bit (exit_bbs, single_exit (loop)->dest->index); + auto_vec<edge> exits = get_loop_exit_edges (loop); + for (edge exit : exits) + bitmap_set_bit (exit_bbs, exit->dest->index); edge entry = EDGE_PRED (loop_preheader_edge (loop)->src, 0); do_rpo_vn (fun, entry, exit_bbs);