* [RFC: PATCH] Extend vectorizer to handle nonlinear induction for neg, mul/lshift/rshift with a constant.
@ 2022-08-04 4:28 liuhongt
2022-08-04 8:18 ` Richard Biener
From: liuhongt @ 2022-08-04 4:28 UTC (permalink / raw)
To: gcc-patches
For neg, the patch creates a vec_init as [ a, -a, a, -a, ... ]; no
vec_step is needed to update the vectorized iv since vf is always a
multiple of 2 (negative * negative is positive).

For shift, the patch creates a vec_init as [ a, a >> c, a >> 2*c, .. ]
and a vec_step as [ c * nunits, c * nunits, c * nunits, ... ]; the
vectorized iv is updated as vec_def = vec_init >>/<< vec_step.

For mul, the patch creates a vec_init as [ a, a * c, a * pow(c, 2), .. ]
and a vec_step as [ pow(c, nunits), pow(c, nunits), ... ]; the iv is
updated as vec_def = vec_init * vec_step.
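
As an illustration (not part of the patch), here is a hand-vectorized
sketch of what this means for foo_mul from the new mul-1.c test,
assuming VF = 4 and written with GCC vector extensions (foo_mul_by_hand
is a made-up name):

  typedef int v4si __attribute__ ((vector_size (16)));
  #define N 10000

  void
  foo_mul_by_hand (int *a, int b)
  {
    /* vec_init = [ b, b*3, b*3^2, b*3^3 ].  */
    v4si vec_iv = (v4si) { b, b * 3, b * 9, b * 27 };
    /* vec_step = [ 3^4, 3^4, 3^4, 3^4 ] = pow (step, vf).  */
    v4si vec_step = (v4si) { 81, 81, 81, 81 };
    for (int i = 0; i != N; i += 4)
      {
        __builtin_memcpy (a + i, &vec_iv, sizeof (vec_iv));
        /* vec_def = vec_iv * vec_step.  */
        vec_iv *= vec_step;
      }
  }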

The patch handles nonlinear ivs only for:
1. Integer types; floating point is not handled.
2. No slp_node.
3. iv_loop must be the same as the vector loop, i.e. not a nested loop.
4. No undefined behavior is introduced: for mul, pow (c, vf) must not
overflow; for shift, the shift count must be less than the type precision.

Bootstrapped and regression tested on x86_64-pc-linux-gnu{-m32,}.
There are some cases observed in SPEC2017, but no big performance impact.
Any comments?
gcc/ChangeLog:
PR tree-optimization/103144
* tree-vect-loop.cc (vect_is_nonlinear_iv_evolution): New function.
(vect_analyze_scalar_cycles_1): Detect nonlinear ivs using the above function.
(vect_create_nonlinear_iv_init): New function.
(vect_create_nonlinear_iv_step): Ditto.
(vect_create_nonlinear_iv_vec_step): Ditto.
(vect_update_nonlinear_iv): Ditto.
(vectorizable_nonlinear_induction): Ditto.
(vectorizable_induction): Call
vectorizable_nonlinear_induction when induction_type is not
vect_step_op_add.
* tree-vectorizer.h (enum vect_induction_op_type): New enum.
(STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE): New Macro.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr103144-mul-1.c: New test.
* gcc.target/i386/pr103144-mul-2.c: New test.
* gcc.target/i386/pr103144-neg-1.c: New test.
* gcc.target/i386/pr103144-neg-2.c: New test.
* gcc.target/i386/pr103144-shift-1.c: New test.
* gcc.target/i386/pr103144-shift-2.c: New test.
---
.../gcc.target/i386/pr103144-mul-1.c | 25 +
.../gcc.target/i386/pr103144-mul-2.c | 43 ++
.../gcc.target/i386/pr103144-neg-1.c | 25 +
.../gcc.target/i386/pr103144-neg-2.c | 36 ++
.../gcc.target/i386/pr103144-shift-1.c | 34 +
.../gcc.target/i386/pr103144-shift-2.c | 61 ++
gcc/tree-vect-loop.cc | 604 +++++++++++++++++-
gcc/tree-vectorizer.h | 11 +
8 files changed, 834 insertions(+), 5 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-neg-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-neg-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-shift-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-shift-2.c
diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
new file mode 100644
index 00000000000..2357541d95d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
+
+#define N 10000
+void
+foo_mul (int* a, int b)
+{
+ for (int i = 0; i != N; i++)
+ {
+ a[i] = b;
+ b *= 3;
+ }
+}
+
+void
+foo_mul_const (int* a)
+{
+ int b = 1;
+ for (int i = 0; i != N; i++)
+ {
+ a[i] = b;
+ b *= 3;
+ }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
new file mode 100644
index 00000000000..4ea53e44658
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
@@ -0,0 +1,43 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx2 } */
+
+#include "avx2-check.h"
+#include <string.h>
+#include "pr103144-mul-1.c"
+
+typedef int v8si __attribute__((vector_size(32)));
+
+void
+avx2_test (void)
+{
+ int* epi32_exp = (int*) malloc (N * sizeof (int));
+ int* epi32_dst = (int*) malloc (N * sizeof (int));
+
+ __builtin_memset (epi32_exp, 0, N * sizeof (int));
+ int b = 8;
+ v8si init = __extension__(v8si) { b, b * 3, b * 9, b * 27, b * 81, b * 243, b * 729, b * 2187 };
+
+ for (int i = 0; i != N / 8; i++)
+ {
+ memcpy (epi32_exp + i * 8, &init, 32);
+ init *= 6561;
+ }
+
+ foo_mul (epi32_dst, b);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ init = __extension__(v8si) { 1, 3, 9, 27, 81, 243, 729, 2187 };
+ for (int i = 0; i != N / 8; i++)
+ {
+ memcpy (epi32_exp + i * 8, &init, 32);
+ init *= 6561;
+ }
+
+ foo_mul_const (epi32_dst);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c b/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c
new file mode 100644
index 00000000000..63a3331563d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
+
+#define N 10000
+void
+foo_neg (int* a, int b)
+{
+ for (int i = 0; i != N; i++)
+ {
+ a[i] = b;
+ b = -b;
+ }
+}
+
+void
+foo_neg_const (int* a)
+{
+ int b = 1;
+ for (int i = 0; i != N; i++)
+ {
+ a[i] = b;
+ b = -b;
+ }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c b/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c
new file mode 100644
index 00000000000..13bc61cdce7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c
@@ -0,0 +1,36 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx2 } */
+
+#include "avx2-check.h"
+#include <string.h>
+#include "pr103144-neg-1.c"
+
+void
+avx2_test (void)
+{
+ int* epi32_exp = (int*) malloc (N * sizeof (int));
+ int* epi32_dst = (int*) malloc (N * sizeof (int));
+ long long* epi64_exp = (long long*) malloc (N * sizeof (int));
+
+ __builtin_memset (epi32_exp, 0, N * sizeof (int));
+ int b = 100;
+
+ for (int i = 0; i != N / 2; i++)
+ epi64_exp[i] = ((long long) b) | (((long long) -b) << 32);
+
+ memcpy (epi32_exp, epi64_exp, N * sizeof (int));
+ foo_neg (epi32_dst, b);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ for (int i = 0; i != N / 2; i++)
+ epi64_exp[i] = ((long long) 1) | (((long long) -1) << 32);
+
+ memcpy (epi32_exp, epi64_exp, N * sizeof (int));
+ foo_neg_const (epi32_dst);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c b/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c
new file mode 100644
index 00000000000..51531495d60
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 3 "vect" } } */
+
+#define N 10000
+void
+foo_shl (int* a, int b)
+{
+ for (int i = 0; i != N; i++)
+ {
+ a[i] = b;
+ b <<= 1;
+ }
+}
+
+void
+foo_ashr (int* a, int b)
+{
+ for (int i = 0; i != N; i++)
+ {
+ a[i] = b;
+ b >>= 1;
+ }
+}
+
+void
+foo_lshr (unsigned int* a, unsigned int b)
+{
+ for (int i = 0; i != N; i++)
+ {
+ a[i] = b;
+ b >>= 1U;
+ }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c b/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c
new file mode 100644
index 00000000000..02deda6d1ab
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c
@@ -0,0 +1,61 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx2 } */
+
+#include "avx2-check.h"
+#include <string.h>
+#include "pr103144-shift-1.c"
+
+typedef int v8si __attribute__((vector_size(32)));
+typedef unsigned int v8usi __attribute__((vector_size(32)));
+
+void
+avx2_test (void)
+{
+ int* epi32_exp = (int*) malloc (N * sizeof (int));
+ int* epi32_dst = (int*) malloc (N * sizeof (int));
+ unsigned int* epu32_exp = (unsigned int*) malloc (N * sizeof (int));
+ unsigned int* epu32_dst = (unsigned int*) malloc (N * sizeof (int));
+
+
+ __builtin_memset (epi32_exp, 0, N * sizeof (int));
+ int b = 8;
+ v8si init = __extension__(v8si) { b, b << 1, b << 2, b << 3, b << 4, b << 5, b << 6, b << 7 };
+
+ for (int i = 0; i != N / 8; i++)
+ {
+ memcpy (epi32_exp + i * 8, &init, 32);
+ init <<= 8;
+ }
+
+ foo_shl (epi32_dst, b);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ b = 11111111;
+ init = __extension__(v8si) { b, b >> 1, b >> 2, b >> 3, b >> 4, b >> 5, b >> 6, b >> 7 };
+ for (int i = 0; i != N / 8; i++)
+ {
+ memcpy (epi32_exp + i * 8, &init, 32);
+ init >>= 8;
+ }
+
+ foo_ashr (epi32_dst, b);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ __builtin_memset (epu32_exp, 0, N * sizeof (int));
+ unsigned int c = 11111111;
+ v8usi initu = __extension__(v8usi) { c, c >> 1U, c >> 2U, c >> 3U, c >> 4U, c >> 5U, c >> 6U, c >> 7U };
+ for (int i = 0; i != N / 8; i++)
+ {
+ memcpy (epu32_exp + i * 8, &initu, 32);
+ initu >>= 8U;
+ }
+
+ foo_lshr (epu32_dst, c);
+ if (__builtin_memcmp (epu32_dst, epu32_exp, N * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ return;
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 2257b29a652..fc2a1584048 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -425,6 +425,98 @@ vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
return true;
}
+static bool
+vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
+ gphi* loop_phi_node, tree *init, tree *step)
+{
+ tree init_expr, ev_expr, result, op1, op2;
+ basic_block bb1, bb2;
+ gimple* def;
+ imm_use_iterator imm_iter;
+ use_operand_p use_p;
+
+
+ gcc_assert (gimple_code (loop_phi_node) == GIMPLE_PHI);
+
+ if (gimple_phi_num_args (loop_phi_node) != 2)
+ return false;
+
+ init_expr = PHI_ARG_DEF (loop_phi_node, 0);
+ ev_expr = PHI_ARG_DEF (loop_phi_node, 1);
+
+ /* Only support nonlinear induction for integer. */
+ if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
+ return false;
+
+ bb1 = gimple_phi_arg_edge (loop_phi_node, 0)->src;
+ bb2 = gimple_phi_arg_edge (loop_phi_node, 1)->src;
+
+ /* One outside the loop, the other inside the loop. */
+ if (!(flow_bb_inside_loop_p (loop, bb1) ^ flow_bb_inside_loop_p (loop, bb2)))
+ return false;
+
+ if (flow_bb_inside_loop_p (loop, bb1))
+ std::swap (init_expr, ev_expr);
+
+ if (TREE_CODE (ev_expr) != SSA_NAME
+ || !has_single_use (ev_expr)
+ || !(def = SSA_NAME_DEF_STMT (ev_expr))
+ || !is_gimple_assign (def))
+ return false;
+
+ *init = init_expr;
+ result = PHI_RESULT (loop_phi_node);
+
+ /* final_value_replacement_loop should handle outside loop use. */
+ FOR_EACH_IMM_USE_FAST (use_p, imm_iter, result)
+ {
+ gimple *use_stmt;
+ use_stmt = USE_STMT (use_p);
+ if (is_gimple_debug (use_stmt))
+ continue;
+ if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
+ return false;
+ }
+
+ enum tree_code t_code = gimple_assign_rhs_code (def);
+ switch (t_code)
+ {
+ case NEGATE_EXPR:
+ if (gimple_assign_rhs1 (def) != result)
+ return false;
+ *step = build_int_cst (TREE_TYPE (init_expr), -1);
+ STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
+ break;
+
+ case RSHIFT_EXPR:
+ case LSHIFT_EXPR:
+ case MULT_EXPR:
+ op1 = gimple_assign_rhs1 (def);
+ op2 = gimple_assign_rhs2 (def);
+ if (TREE_CODE (op2) != INTEGER_CST
+ || op1 != result)
+ return false;
+ *step = op2;
+ break;
+
+ default:
+ return false;
+ }
+
+ if (t_code == LSHIFT_EXPR)
+ STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
+ else if (t_code == RSHIFT_EXPR)
+ STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
+ /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
+ else if (t_code == MULT_EXPR)
+ STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
+
+ STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
+ STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
+
+ return true;
+}
+
/* Return true if PHI, described by STMT_INFO, is the inner PHI in
what we are assuming is a double reduction. For example, given
a structure like this:
@@ -512,11 +604,16 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
= evolution_part_in_loop_num (access_fn, loop->num);
}
- if (!access_fn
- || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
- || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
- || (LOOP_VINFO_LOOP (loop_vinfo) != loop
- && TREE_CODE (step) != INTEGER_CST))
+ if ((!access_fn
+ || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
+ || !vect_is_simple_iv_evolution (loop->num, access_fn,
+ &init, &step)
+ || (LOOP_VINFO_LOOP (loop_vinfo) != loop
+ && TREE_CODE (step) != INTEGER_CST))
+ /* Only handle nonlinear iv for same loop. */
+ && (!vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
+ phi, &init, &step)
+ || LOOP_VINFO_LOOP (loop_vinfo) != loop))
{
worklist.safe_push (stmt_vinfo);
continue;
@@ -8229,6 +8326,496 @@ vect_can_vectorize_without_simd_p (code_helper code)
&& vect_can_vectorize_without_simd_p (tree_code (code)));
}
+/* Create vector init for vectorized iv. */
+static tree
+vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
+ tree step_expr, poly_uint64 nunits,
+ tree vectype,
+ enum vect_induction_op_type induction_type)
+{
+ unsigned HOST_WIDE_INT const_nunits;
+ tree vec_shift, vec_init, new_name;
+ unsigned i;
+
+ /* iv_loop is the loop to be vectorized. Create:
+ vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
+ new_name = init_expr;
+ switch (induction_type)
+ {
+ case vect_step_op_shr:
+ case vect_step_op_shl:
+ /* Build the Initial value from shift_expr. */
+ vec_init = gimple_build_vector_from_val (stmts,
+ vectype,
+ new_name);
+ vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
+ build_zero_cst (TREE_TYPE (step_expr)),
+ step_expr);
+ vec_init = gimple_build (stmts,
+ (induction_type == vect_step_op_shr
+ ? RSHIFT_EXPR : LSHIFT_EXPR),
+ vectype, vec_init, vec_shift);
+ break;
+
+ case vect_step_op_neg:
+ {
+ vec_init = gimple_build_vector_from_val (stmts,
+ vectype,
+ new_name);
+ tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
+ vectype, vec_init);
+ /* The encoding has 2 interleaved stepped patterns. */
+ vec_perm_builder sel (nunits, 2, 3);
+ sel.quick_grow (6);
+ for (i = 0; i < 3; i++)
+ {
+ sel[2 * i] = i;
+ sel[2 * i + 1] = i + nunits;
+ }
+ vec_perm_indices indices (sel, 2, nunits);
+ tree perm_mask_even
+ = vect_gen_perm_mask_checked (vectype, indices);
+ vec_init = gimple_build (stmts, VEC_PERM_EXPR,
+ vectype,
+ vec_init, vec_neg,
+ perm_mask_even);
+ }
+ break;
+
+ case vect_step_op_mul:
+ {
+ gcc_assert (nunits.is_constant (&const_nunits));
+ vec_init = gimple_build_vector_from_val (stmts,
+ vectype,
+ new_name);
+ tree_vector_builder elts (vectype, const_nunits, 1);
+ tree elt_step = build_one_cst (TREE_TYPE (step_expr));
+
+ elts.quick_push (elt_step);
+ for (i = 1; i < const_nunits; i++)
+ {
+ /* Create: elt_step = elt_step * step_expr, i.e. pow (step_expr, i). */
+ elt_step = gimple_build (stmts, MULT_EXPR,
+ TREE_TYPE (step_expr),
+ elt_step, step_expr);
+ elts.quick_push (elt_step);
+ }
+ /* Create a vector from [ 1, step_expr, pow (step_expr, 2), ...,
+ pow (step_expr, nunits-1) ]. */
+ tree vec_mul = gimple_build_vector (stmts, &elts);
+ vec_init = gimple_build (stmts, MULT_EXPR, vectype,
+ vec_init, vec_mul);
+ }
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ return vec_init;
+}
+
+/* Create vector step for vectorized iv. */
+static tree
+vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
+ poly_uint64 vf,
+ enum vect_induction_op_type induction_type)
+{
+ tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
+ tree new_name = NULL;
+ /* Step should be pow (step, vf) for mult induction. */
+ if (induction_type == vect_step_op_mul)
+ {
+ gcc_assert (vf.is_constant ());
+ wide_int begin = wi::to_wide (step_expr);
+
+ for (unsigned i = 0; i != vf.to_constant () - 1; i++)
+ begin = wi::mul (begin, wi::to_wide (step_expr));
+
+ new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
+ }
+ else if (induction_type == vect_step_op_neg)
+ /* Do nothing. */
+ ;
+ else
+ new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
+ expr, step_expr);
+ return new_name;
+}
+
+static tree
+vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
+ stmt_vec_info stmt_info,
+ tree new_name, tree vectype,
+ enum vect_induction_op_type induction_type)
+{
+ /* No step is needed for neg induction. */
+ if (induction_type == vect_step_op_neg)
+ return NULL;
+
+ tree t = unshare_expr (new_name);
+ gcc_assert (CONSTANT_CLASS_P (new_name)
+ || TREE_CODE (new_name) == SSA_NAME);
+ tree new_vec = build_vector_from_val (vectype, t);
+ tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
+ new_vec, vectype, NULL);
+ return vec_step;
+}
+
+/* Update vectorized iv with vect_step, induc_def is init. */
+static tree
+vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
+ tree induc_def, tree vec_step,
+ enum vect_induction_op_type induction_type)
+{
+ tree vec_def = induc_def;
+ switch (induction_type)
+ {
+ case vect_step_op_mul:
+ vec_def = gimple_build (stmts, MULT_EXPR, vectype,
+ vec_def, vec_step);
+ break;
+
+ case vect_step_op_shr:
+ vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
+ vec_def, vec_step);
+ break;
+
+ case vect_step_op_shl:
+ vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
+ vec_def, vec_step);
+ break;
+ case vect_step_op_neg:
+ vec_def = induc_def;
+ /* Do nothing. */
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ return vec_def;
+
+}
+
+/* Function vectorizable_nonlinear_induction
+
+ Check if STMT_INFO performs a nonlinear induction computation that can be
+ vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
+ a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
+ basic block.
+ Return true if STMT_INFO is vectorizable in this way. */
+
+static bool
+vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
+ stmt_vec_info stmt_info,
+ gimple **vec_stmt, slp_tree slp_node,
+ stmt_vector_for_cost *cost_vec)
+{
+ class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+ unsigned ncopies;
+ bool nested_in_vect_loop = false;
+ class loop *iv_loop;
+ tree vec_def;
+ edge pe = loop_preheader_edge (loop);
+ basic_block new_bb;
+ tree vec_init, vec_step;
+ tree new_name;
+ gimple *new_stmt;
+ gphi *induction_phi;
+ tree induc_def, vec_dest;
+ tree init_expr, step_expr;
+ poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+ unsigned i;
+ gimple_stmt_iterator si;
+
+ gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
+
+ tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+ poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
+ enum vect_induction_op_type induction_type
+ = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
+
+ gcc_assert (induction_type > vect_step_op_add);
+
+ if (slp_node)
+ ncopies = 1;
+ else
+ ncopies = vect_get_num_copies (loop_vinfo, vectype);
+ gcc_assert (ncopies >= 1);
+
+ /* FORNOW. Only handle nonlinear induction in the same loop. */
+ if (nested_in_vect_loop_p (loop, stmt_info))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "nonlinear induction in nested loop.\n");
+ return false;
+ }
+
+ iv_loop = loop;
+ gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
+
+ if (slp_node)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "SLP induction not supported for nonlinear"
+ " induction.\n");
+ return false;
+ }
+
+ if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Peel for alignment not supported for nonlinear"
+ " induction.\n");
+ return false;
+ }
+
+ if (FLOAT_TYPE_P (vectype))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "floating point nonlinear induction vectorization"
+ " not supported.\n");
+ return false;
+ }
+
+ step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
+ init_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info);
+ gcc_assert (step_expr != NULL_TREE && init_expr != NULL);
+ /* step_expr should be aligned with init_expr,
+ i.e. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used. */
+ step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
+ gcc_assert (TREE_CODE (init_expr) == INTEGER_CST
+ || (TYPE_CANONICAL (TREE_TYPE (vectype))
+ == TYPE_CANONICAL (TREE_TYPE (init_expr))));
+ gcc_assert (TREE_CODE (step_expr) == INTEGER_CST);
+ init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
+
+ switch (induction_type)
+ {
+ case vect_step_op_neg:
+ if (TREE_CODE (init_expr) != INTEGER_CST
+ && TREE_CODE (init_expr) != REAL_CST)
+ {
+ /* Check for backend support of NEGATE_EXPR and vec_perm. */
+ if (!directly_supported_p (NEGATE_EXPR, vectype))
+ return false;
+
+ /* The encoding has 2 interleaved stepped patterns. */
+ vec_perm_builder sel (nunits, 2, 3);
+ machine_mode mode = TYPE_MODE (vectype);
+ sel.quick_grow (6);
+ for (i = 0; i < 3; i++)
+ {
+ sel[i * 2] = i;
+ sel[i * 2 + 1] = i + nunits;
+ }
+ vec_perm_indices indices (sel, 2, nunits);
+ if (!can_vec_perm_const_p (mode, mode, indices))
+ return false;
+ }
+ break;
+
+ case vect_step_op_mul:
+ {
+ /* Check for backend support of MULT_EXPR. */
+ if (!directly_supported_p (MULT_EXPR, vectype))
+ return false;
+
+ /* ?? How to construct vector step for variable number vector.
+ [ 1, step, pow (step, 2), pow (step, 4), .. ]. */
+ if (!nunits.is_constant ())
+ return false;
+
+ /* Make sure there's no overflow or overflow is defined. */
+ if (!TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (init_expr)))
+ break;
+
+ wi::overflow_type overflow;
+ wide_int begin = wi::to_wide (step_expr);
+
+ for (unsigned i = 0; i != nunits.to_constant () - 1; i++)
+ {
+ begin = wi::mul (begin, wi::to_wide (step_expr),
+ TYPE_SIGN (TREE_TYPE (init_expr)),
+ &overflow);
+ if (overflow)
+ return false;
+ }
+ }
+ break;
+
+ case vect_step_op_shr:
+ /* Check for backend support of RSHIFT_EXPR. */
+ if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
+ return false;
+
+ /* Don't shift more than type precision. */
+ if (!tree_fits_uhwi_p (step_expr)
+ || maybe_ge (nunits * tree_to_uhwi (step_expr),
+ TYPE_PRECISION (TREE_TYPE (init_expr))))
+ return false;
+ break;
+
+ case vect_step_op_shl:
+ /* Check for backend support of LSHIFT_EXPR. */
+ if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
+ return false;
+
+ /* Don't shift more than type precision. */
+ if (!tree_fits_uhwi_p (step_expr)
+ || maybe_ge (nunits * tree_to_uhwi (step_expr),
+ TYPE_PRECISION (TREE_TYPE (init_expr))))
+ return false;
+
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ if (!vec_stmt) /* transformation not required. */
+ {
+ unsigned inside_cost = 0, prologue_cost = 0;
+ /* loop cost for vec_loop. */
+ inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
+ stmt_info, 0, vect_body);
+
+ /* loop cost for vec_loop. Neg induction doesn't have any
+ inside_cost. */
+ if (induction_type == vect_step_op_neg)
+ inside_cost = 0;
+
+ /* prologue cost for vec_init and vec_step. */
+ prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
+ stmt_info, 0, vect_prologue);
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "vect_model_induction_cost: inside_cost = %d, "
+ "prologue_cost = %d. \n", inside_cost,
+ prologue_cost);
+
+ STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
+ DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
+ return true;
+ }
+
+ /* Transform. */
+
+ /* Compute a vector variable, initialized with the first VF values of
+ the induction variable. E.g., for an iv with IV_PHI='X' and
+ evolution S, for a vector of 4 units, we want to compute:
+ [X, X + S, X + 2*S, X + 3*S]. */
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
+
+ pe = loop_preheader_edge (iv_loop);
+ /* Find the first insertion point in the BB. */
+ basic_block bb = gimple_bb (phi);
+ si = gsi_after_labels (bb);
+
+ init_expr = vect_phi_initial_value (phi);
+
+ gimple_seq stmts = NULL;
+ vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
+ step_expr, nunits, vectype,
+ induction_type);
+ if (stmts)
+ {
+ new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
+ gcc_assert (!new_bb);
+ }
+
+ stmts = NULL;
+ new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
+ vf, induction_type);
+ if (stmts)
+ {
+ new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
+ gcc_assert (!new_bb);
+ }
+
+ vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
+ new_name, vectype,
+ induction_type);
+ /* Create the following def-use cycle:
+ loop prolog:
+ vec_init = ...
+ vec_step = ...
+ loop:
+ vec_iv = PHI <vec_init, vec_loop>
+ ...
+ STMT
+ ...
+ vec_loop = vec_iv + vec_step; */
+
+ /* Create the induction-phi that defines the induction-operand. */
+ vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
+ induction_phi = create_phi_node (vec_dest, iv_loop->header);
+ induc_def = PHI_RESULT (induction_phi);
+
+ /* Create the iv update inside the loop. */
+ stmts = NULL;
+ vec_def = vect_update_nonlinear_iv (&stmts, vectype,
+ induc_def, vec_step,
+ induction_type);
+
+ gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
+ new_stmt = SSA_NAME_DEF_STMT (vec_def);
+
+ /* Set the arguments of the phi node: */
+ add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
+ add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
+ UNKNOWN_LOCATION);
+
+ STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
+ *vec_stmt = induction_phi;
+
+ /* In case that vectorization factor (VF) is bigger than the number
+ of elements that we can fit in a vectype (nunits), we have to generate
+ more than one vector stmt - i.e - we need to "unroll" the
+ vector stmt by a factor VF/nunits. For more details see documentation
+ in vectorizable_operation. */
+
+ if (ncopies > 1)
+ {
+ stmts = NULL;
+ /* FORNOW. This restriction should be relaxed. */
+ gcc_assert (!nested_in_vect_loop);
+
+ new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
+ nunits, induction_type);
+
+ vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
+ new_name, vectype,
+ induction_type);
+ vec_def = induc_def;
+ for (i = 1; i < ncopies; i++)
+ {
+ /* vec_i = vec_prev + vec_step. */
+ stmts = NULL;
+ vec_def = vect_update_nonlinear_iv (&stmts, vectype,
+ vec_def, vec_step,
+ induction_type);
+ gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
+ new_stmt = SSA_NAME_DEF_STMT (vec_def);
+ STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+ }
+ }
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "transform induction: created def-use cycle: %G%G",
+ induction_phi, SSA_NAME_DEF_STMT (vec_def));
+
+ return true;
+}
+
/* Function vectorizable_induction
Check if STMT_INFO performs an induction computation that can be vectorized.
@@ -8259,6 +8846,8 @@ vectorizable_induction (loop_vec_info loop_vinfo,
unsigned i;
tree expr;
gimple_stmt_iterator si;
+ enum vect_induction_op_type induction_type
+ = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
if (!phi)
@@ -8271,6 +8860,11 @@ vectorizable_induction (loop_vec_info loop_vinfo,
if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
return false;
+ /* Handle nonlinear induction in a separate place. */
+ if (induction_type != vect_step_op_add)
+ return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
+ vec_stmt, slp_node, cost_vec);
+
tree vectype = STMT_VINFO_VECTYPE (stmt_info);
poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index e5fdc9e0a14..0c7f1a32070 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -68,6 +68,15 @@ enum vect_def_type {
vect_unknown_def_type
};
+/* Define operation type of linear/non-linear induction variable. */
+enum vect_induction_op_type {
+ vect_step_op_add = 0,
+ vect_step_op_neg,
+ vect_step_op_mul,
+ vect_step_op_shl,
+ vect_step_op_shr
+};
+
/* Define type of reduction. */
enum vect_reduction_type {
TREE_CODE_REDUCTION,
@@ -1188,6 +1197,7 @@ public:
the version here. */
tree loop_phi_evolution_base_unchanged;
tree loop_phi_evolution_part;
+ enum vect_induction_op_type loop_phi_evolution_type;
/* Used for various bookkeeping purposes, generally holding a pointer to
some other stmt S that is in some way "related" to this stmt.
@@ -1421,6 +1431,7 @@ struct gather_scatter_info {
((S)->dr_aux.dr && DR_GROUP_FIRST_ELEMENT(S))
#define STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED(S) (S)->loop_phi_evolution_base_unchanged
#define STMT_VINFO_LOOP_PHI_EVOLUTION_PART(S) (S)->loop_phi_evolution_part
+#define STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE(S) (S)->loop_phi_evolution_type
#define STMT_VINFO_MIN_NEG_DIST(S) (S)->min_neg_dist
#define STMT_VINFO_REDUC_TYPE(S) (S)->reduc_type
#define STMT_VINFO_REDUC_CODE(S) (S)->reduc_code
--
2.18.1
* Re: [RFC: PATCH] Extend vectorizer to handle nonlinear induction for neg, mul/lshift/rshift with a constant.
2022-08-04 4:28 [RFC: PATCH] Extend vectorizer to handle nonlinear induction for neg, mul/lshift/rshift with a constant liuhongt
@ 2022-08-04 8:18 ` Richard Biener
2022-08-04 10:09 ` Richard Sandiford
From: Richard Biener @ 2022-08-04 8:18 UTC (permalink / raw)
To: liuhongt, Richard Sandiford; +Cc: GCC Patches
On Thu, Aug 4, 2022 at 6:29 AM liuhongt via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> For neg, the patch creates a vec_init as [ a, -a, a, -a, ... ]; no
> vec_step is needed to update the vectorized iv since vf is always a
> multiple of 2 (negative * negative is positive).
>
> For shift, the patch creates a vec_init as [ a, a >> c, a >> 2*c, .. ]
> and a vec_step as [ c * nunits, c * nunits, c * nunits, ... ]; the
> vectorized iv is updated as vec_def = vec_init >>/<< vec_step.
>
> For mul, the patch creates a vec_init as [ a, a * c, a * pow(c, 2), .. ]
> and a vec_step as [ pow(c, nunits), pow(c, nunits), ... ]; the iv is
> updated as vec_def = vec_init * vec_step.
>
> The patch handles nonlinear ivs only for:
> 1. Integer types; floating point is not handled.
> 2. No slp_node.
> 3. iv_loop must be the same as the vector loop, i.e. not a nested loop.
> 4. No undefined behavior is introduced: for mul, pow (c, vf) must not
> overflow; for shift, the shift count must be less than the type precision.
>
> Bootstrapped and regression tested on x86_64-pc-linux-gnu{-m32,}.
> There are some cases observed in SPEC2017, but no big performance impact.
>
> Any comments?
Looks good overall - a few comments inline. Also can you please add
SLP support?
I've tried hard to fill in gaps where SLP support is missing since my
goal is still to get
rid of non-SLP.
> gcc/ChangeLog:
>
> PR tree-optimization/103144
> * tree-vect-loop.cc (vect_is_nonlinear_iv_evolution): New function.
> (vect_analyze_scalar_cycles_1): Detect nonlinear ivs using the above function.
> (vect_create_nonlinear_iv_init): New function.
> (vect_create_nonlinear_iv_step): Ditto.
> (vect_create_nonlinear_iv_vec_step): Ditto.
> (vect_update_nonlinear_iv): Ditto.
> (vectorizable_nonlinear_induction): Ditto.
> (vectorizable_induction): Call
> vectorizable_nonlinear_induction when induction_type is not
> vect_step_op_add.
> * tree-vectorizer.h (enum vect_induction_op_type): New enum.
> (STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE): New Macro.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/pr103144-mul-1.c: New test.
> * gcc.target/i386/pr103144-mul-2.c: New test.
> * gcc.target/i386/pr103144-neg-1.c: New test.
> * gcc.target/i386/pr103144-neg-2.c: New test.
> * gcc.target/i386/pr103144-shift-1.c: New test.
> * gcc.target/i386/pr103144-shift-2.c: New test.
> ---
> .../gcc.target/i386/pr103144-mul-1.c | 25 +
> .../gcc.target/i386/pr103144-mul-2.c | 43 ++
> .../gcc.target/i386/pr103144-neg-1.c | 25 +
> .../gcc.target/i386/pr103144-neg-2.c | 36 ++
> .../gcc.target/i386/pr103144-shift-1.c | 34 +
> .../gcc.target/i386/pr103144-shift-2.c | 61 ++
> gcc/tree-vect-loop.cc | 604 +++++++++++++++++-
> gcc/tree-vectorizer.h | 11 +
> 8 files changed, 834 insertions(+), 5 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-neg-1.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-neg-2.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-shift-1.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-shift-2.c
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
> new file mode 100644
> index 00000000000..2357541d95d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
> @@ -0,0 +1,25 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
> +
> +#define N 10000
> +void
> +foo_mul (int* a, int b)
> +{
> + for (int i = 0; i != N; i++)
> + {
> + a[i] = b;
> + b *= 3;
> + }
> +}
> +
> +void
> +foo_mul_const (int* a)
> +{
> + int b = 1;
> + for (int i = 0; i != N; i++)
> + {
> + a[i] = b;
> + b *= 3;
> + }
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
> new file mode 100644
> index 00000000000..4ea53e44658
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
> @@ -0,0 +1,43 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx2 } */
> +
> +#include "avx2-check.h"
> +#include <string.h>
> +#include "pr103144-mul-1.c"
> +
> +typedef int v8si __attribute__((vector_size(32)));
> +
> +void
> +avx2_test (void)
> +{
> + int* epi32_exp = (int*) malloc (N * sizeof (int));
> + int* epi32_dst = (int*) malloc (N * sizeof (int));
> +
> + __builtin_memset (epi32_exp, 0, N * sizeof (int));
> + int b = 8;
> + v8si init = __extension__(v8si) { b, b * 3, b * 9, b * 27, b * 81, b * 243, b * 729, b * 2187 };
> +
> + for (int i = 0; i != N / 8; i++)
> + {
> + memcpy (epi32_exp + i * 8, &init, 32);
> + init *= 6561;
> + }
> +
> + foo_mul (epi32_dst, b);
> + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
> + __builtin_abort ();
> +
> + init = __extension__(v8si) { 1, 3, 9, 27, 81, 243, 729, 2187 };
> + for (int i = 0; i != N / 8; i++)
> + {
> + memcpy (epi32_exp + i * 8, &init, 32);
> + init *= 6561;
> + }
> +
> + foo_mul_const (epi32_dst);
> + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
> + __builtin_abort ();
> +
> + return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c b/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c
> new file mode 100644
> index 00000000000..63a3331563d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c
> @@ -0,0 +1,25 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
> +
> +#define N 10000
> +void
> +foo_neg (int* a, int b)
> +{
> + for (int i = 0; i != N; i++)
> + {
> + a[i] = b;
> + b = -b;
> + }
> +}
> +
> +void
> +foo_neg_const (int* a)
> +{
> + int b = 1;
> + for (int i = 0; i != N; i++)
> + {
> + a[i] = b;
> + b = -b;
> + }
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c b/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c
> new file mode 100644
> index 00000000000..13bc61cdce7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c
> @@ -0,0 +1,36 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx2 } */
> +
> +#include "avx2-check.h"
> +#include <string.h>
> +#include "pr103144-neg-1.c"
> +
> +void
> +avx2_test (void)
> +{
> + int* epi32_exp = (int*) malloc (N * sizeof (int));
> + int* epi32_dst = (int*) malloc (N * sizeof (int));
> + long long* epi64_exp = (long long*) malloc (N * sizeof (int));
> +
> + __builtin_memset (epi32_exp, 0, N * sizeof (int));
> + int b = 100;
> +
> + for (int i = 0; i != N / 2; i++)
> + epi64_exp[i] = ((long long) b) | (((long long) -b) << 32);
> +
> + memcpy (epi32_exp, epi64_exp, N * sizeof (int));
> + foo_neg (epi32_dst, b);
> + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
> + __builtin_abort ();
> +
> + for (int i = 0; i != N / 2; i++)
> + epi64_exp[i] = ((long long) 1) | (((long long) -1) << 32);
> +
> + memcpy (epi32_exp, epi64_exp, N * sizeof (int));
> + foo_neg_const (epi32_dst);
> + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
> + __builtin_abort ();
> +
> + return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c b/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c
> new file mode 100644
> index 00000000000..51531495d60
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c
> @@ -0,0 +1,34 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 3 "vect" } } */
> +
> +#define N 10000
> +void
> +foo_shl (int* a, int b)
> +{
> + for (int i = 0; i != N; i++)
> + {
> + a[i] = b;
> + b <<= 1;
> + }
> +}
> +
> +void
> +foo_ashr (int* a, int b)
> +{
> + for (int i = 0; i != N; i++)
> + {
> + a[i] = b;
> + b >>= 1;
> + }
> +}
> +
> +void
> +foo_lshr (unsigned int* a, unsigned int b)
> +{
> + for (int i = 0; i != N; i++)
> + {
> + a[i] = b;
> + b >>= 1U;
> + }
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c b/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c
> new file mode 100644
> index 00000000000..02deda6d1ab
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c
> @@ -0,0 +1,61 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx2 } */
> +
> +#include "avx2-check.h"
> +#include <string.h>
> +#include "pr103144-shift-1.c"
> +
> +typedef int v8si __attribute__((vector_size(32)));
> +typedef unsigned int v8usi __attribute__((vector_size(32)));
> +
> +void
> +avx2_test (void)
> +{
> + int* epi32_exp = (int*) malloc (N * sizeof (int));
> + int* epi32_dst = (int*) malloc (N * sizeof (int));
> + unsigned int* epu32_exp = (unsigned int*) malloc (N * sizeof (int));
> + unsigned int* epu32_dst = (unsigned int*) malloc (N * sizeof (int));
> +
> +
> + __builtin_memset (epi32_exp, 0, N * sizeof (int));
> + int b = 8;
> + v8si init = __extension__(v8si) { b, b << 1, b << 2, b << 3, b << 4, b << 5, b << 6, b << 7 };
> +
> + for (int i = 0; i != N / 8; i++)
> + {
> + memcpy (epi32_exp + i * 8, &init, 32);
> + init <<= 8;
> + }
> +
> + foo_shl (epi32_dst, b);
> + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
> + __builtin_abort ();
> +
> + b = 11111111;
> + init = __extension__(v8si) { b, b >> 1, b >> 2, b >> 3, b >> 4, b >> 5, b >> 6, b >> 7 };
> + for (int i = 0; i != N / 8; i++)
> + {
> + memcpy (epi32_exp + i * 8, &init, 32);
> + init >>= 8;
> + }
> +
> + foo_ashr (epi32_dst, b);
> + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
> + __builtin_abort ();
> +
> + __builtin_memset (epu32_exp, 0, N * sizeof (int));
> + unsigned int c = 11111111;
> + v8usi initu = __extension__(v8usi) { c, c >> 1U, c >> 2U, c >> 3U, c >> 4U, c >> 5U, c >> 6U, c >> 7U };
> + for (int i = 0; i != N / 8; i++)
> + {
> + memcpy (epu32_exp + i * 8, &initu, 32);
> + initu >>= 8U;
> + }
> +
> + foo_lshr (epu32_dst, c);
> + if (__builtin_memcmp (epu32_dst, epu32_exp, N * sizeof (int)) != 0)
> + __builtin_abort ();
> +
> + return;
> +}
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 2257b29a652..fc2a1584048 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -425,6 +425,98 @@ vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
> return true;
> }
>
> +static bool
> +vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
> + gphi* loop_phi_node, tree *init, tree *step)
> +{
> + tree init_expr, ev_expr, result, op1, op2;
> + basic_block bb1, bb2;
> + gimple* def;
> + imm_use_iterator imm_iter;
> + use_operand_p use_p;
> +
> +
> + gcc_assert (gimple_code (loop_phi_node) == GIMPLE_PHI);
gcc_assert (is_a <gphi *> (loop_phi_node));
is shorter
> +
> + if (gimple_phi_num_args (loop_phi_node) != 2)
> + return false;
> +
> + init_expr = PHI_ARG_DEF (loop_phi_node, 0);
> + ev_expr = PHI_ARG_DEF (loop_phi_node, 1);
I think you should use
init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
and loop_latch_edge (loop) for ev_expr; you can't rely on them being arg 0 / 1.
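I.e. something along these lines (untested sketch):

  init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node,
                                     loop_preheader_edge (loop));
  ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node,
                                   loop_latch_edge (loop));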
> +
> + /* Only support nonlinear induction for integer. */
> + if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
> + return false;
> +
> + bb1 = gimple_phi_arg_edge (loop_phi_node, 0)->src;
> + bb2 = gimple_phi_arg_edge (loop_phi_node, 1)->src;
Likewise. Use preheader/latch edge.
> +
> + /* One outside the loop, the other inside the loop. */
> + if (!(flow_bb_inside_loop_p (loop, bb1) ^ flow_bb_inside_loop_p (loop, bb2)))
> + return false;
> +
> + if (flow_bb_inside_loop_p (loop, bb1))
> + std::swap (init_expr, ev_expr);
and those two should then go away.
> +
> + if (TREE_CODE (ev_expr) != SSA_NAME
> + || !has_single_use (ev_expr)
A comment might be nice - this makes sure we only
see post-increment uses. But that also means
for(;;)
{
b = -b;
a[i] = b;
}
is not vectorized? I think it should be possible to relax
this.
> + || !(def = SSA_NAME_DEF_STMT (ev_expr))
def is never NULL so a cheaper way to write this is
|| ((def = SSA_NAME_DEF_STMT (ev_expr)), true)
> + || !is_gimple_assign (def))
> + return false;
> +
> + *init = init_expr;
> + result = PHI_RESULT (loop_phi_node);
> +
> + /* final_value_replacement_loop should handle outside loop use. */
> + FOR_EACH_IMM_USE_FAST (use_p, imm_iter, result)
> + {
> + gimple *use_stmt;
> + use_stmt = USE_STMT (use_p);
> + if (is_gimple_debug (use_stmt))
> + continue;
> + if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
> + return false;
> + }
not sure if we need to bother here - ideally vectorizable_live_operation would
give up instead (but I suppose the regular IV / induction analysis gives up
here as well?)
> +
> + enum tree_code t_code = gimple_assign_rhs_code (def);
> + switch (t_code)
> + {
> + case NEGATE_EXPR:
> + if (gimple_assign_rhs1 (def) != result)
> + return false;
> + *step = build_int_cst (TREE_TYPE (init_expr), -1);
> + STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
> + break;
> +
> + case RSHIFT_EXPR:
> + case LSHIFT_EXPR:
> + case MULT_EXPR:
> + op1 = gimple_assign_rhs1 (def);
> + op2 = gimple_assign_rhs2 (def);
> + if (TREE_CODE (op2) != INTEGER_CST
> + || op1 != result)
> + return false;
> + *step = op2;
> + break;
> +
> + default:
> + return false;
> + }
> +
> + if (t_code == LSHIFT_EXPR)
> + STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
> + else if (t_code == RSHIFT_EXPR)
> + STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
> + /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
> + else if (t_code == MULT_EXPR)
> + STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
the above can at least go into the combined switch case
> + STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
Seeing this - did you check whether you handle prologue peeling correctly? The
same issue might show up with a vectorized epilogue. I think you can force a
peeled prologue with storing unaligned and -fno-vect-cost-model (that IIRC will
simply optimize for the larger number of aligned memory ops)
> + STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
> +
> + return true;
> +}
> +
> /* Return true if PHI, described by STMT_INFO, is the inner PHI in
> what we are assuming is a double reduction. For example, given
> a structure like this:
> @@ -512,11 +604,16 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
> = evolution_part_in_loop_num (access_fn, loop->num);
> }
>
> - if (!access_fn
> - || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
> - || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
> - || (LOOP_VINFO_LOOP (loop_vinfo) != loop
> - && TREE_CODE (step) != INTEGER_CST))
> + if ((!access_fn
> + || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
> + || !vect_is_simple_iv_evolution (loop->num, access_fn,
> + &init, &step)
> + || (LOOP_VINFO_LOOP (loop_vinfo) != loop
> + && TREE_CODE (step) != INTEGER_CST))
> + /* Only handle nonlinear iv for same loop. */
> + && (!vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
> + phi, &init, &step)
> + || LOOP_VINFO_LOOP (loop_vinfo) != loop))
since you only handle inner loop nonlinear IVs you should probably
swap the two checks?
> {
> worklist.safe_push (stmt_vinfo);
> continue;
> @@ -8229,6 +8326,496 @@ vect_can_vectorize_without_simd_p (code_helper code)
> && vect_can_vectorize_without_simd_p (tree_code (code)));
> }
>
> +/* Create vector init for vectorized iv. */
> +static tree
> +vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
> + tree step_expr, poly_uint64 nunits,
> + tree vectype,
> + enum vect_induction_op_type induction_type)
> +{
> + unsigned HOST_WIDE_INT const_nunits;
> + tree vec_shift, vec_init, new_name;
> + unsigned i;
> +
> + /* iv_loop is the loop to be vectorized. Create:
> + vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
> + new_name = init_expr;
> + switch (induction_type)
> + {
> + case vect_step_op_shr:
> + case vect_step_op_shl:
> + /* Build the Initial value from shift_expr. */
> + vec_init = gimple_build_vector_from_val (stmts,
> + vectype,
> + new_name);
> + vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
> + build_zero_cst (TREE_TYPE (step_expr)),
> + step_expr);
There might be a more canonical way to build the series expr - Richard?
> + vec_init = gimple_build (stmts,
> + (induction_type == vect_step_op_shr
> + ? RSHIFT_EXPR : LSHIFT_EXPR),
> + vectype, vec_init, vec_shift);
> + break;
> +
> + case vect_step_op_neg:
> + {
> + vec_init = gimple_build_vector_from_val (stmts,
> + vectype,
> + new_name);
> + tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
> + vectype, vec_init);
> + /* The encoding has 2 interleaved stepped patterns. */
> + vec_perm_builder sel (nunits, 2, 3);
> + sel.quick_grow (6);
> + for (i = 0; i < 3; i++)
> + {
> + sel[2 * i] = i;
> + sel[2 * i + 1] = i + nunits;
> + }
> + vec_perm_indices indices (sel, 2, nunits);
> + tree perm_mask_even
> + = vect_gen_perm_mask_checked (vectype, indices);
> + vec_init = gimple_build (stmts, VEC_PERM_EXPR,
> + vectype,
> + vec_init, vec_neg,
> + perm_mask_even);
> + }
> + break;
> +
> + case vect_step_op_mul:
> + {
> + gcc_assert (nunits.is_constant (&const_nunits));
> + vec_init = gimple_build_vector_from_val (stmts,
> + vectype,
> + new_name);
> + tree_vector_builder elts (vectype, const_nunits, 1);
> + tree elt_step = build_one_cst (TREE_TYPE (step_expr));
> +
> + elts.quick_push (elt_step);
> + for (i = 1; i < const_nunits; i++)
> + {
> + /* Create: elt_step = elt_step * step_expr, i.e. pow (step_expr, i). */
> + elt_step = gimple_build (stmts, MULT_EXPR,
> + TREE_TYPE (step_expr),
> + elt_step, step_expr);
You probably should perform this multiplication in an unsigned type to avoid
overflow.
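Roughly (untested sketch):

  tree utype = unsigned_type_for (TREE_TYPE (step_expr));
  tree uelt = gimple_convert (stmts, utype, elt_step);
  uelt = gimple_build (stmts, MULT_EXPR, utype, uelt,
                       gimple_convert (stmts, utype, step_expr));
  elt_step = gimple_convert (stmts, TREE_TYPE (step_expr), uelt);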
> + elts.quick_push (elt_step);
> + }
> + /* Create a vector from [ 1, step_expr, pow (step_expr, 2), ...,
> + pow (step_expr, nunits-1) ]. */
> + tree vec_mul = gimple_build_vector (stmts, &elts);
> + vec_init = gimple_build (stmts, MULT_EXPR, vectype,
> + vec_init, vec_mul);
> + }
> + break;
> +
> + default:
> + gcc_unreachable ();
> + }
> +
> + return vec_init;
> +}
> +
> +/* Create vector step for vectorized iv. */
> +static tree
> +vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
> + poly_uint64 vf,
> + enum vect_induction_op_type induction_type)
> +{
> + tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
> + tree new_name = NULL;
> + /* Step should be pow (step, vf) for mult induction. */
> + if (induction_type == vect_step_op_mul)
> + {
> + gcc_assert (vf.is_constant ());
> + wide_int begin = wi::to_wide (step_expr);
> +
> + for (unsigned i = 0; i != vf.to_constant () - 1; i++)
> + begin = wi::mul (begin, wi::to_wide (step_expr));
> +
> + new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
> + }
> + else if (induction_type == vect_step_op_neg)
> + /* Do nothing. */
> + ;
> + else
> + new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
> + expr, step_expr);
> + return new_name;
> +}
> +
> +static tree
> +vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
> + stmt_vec_info stmt_info,
> + tree new_name, tree vectype,
> + enum vect_induction_op_type induction_type)
> +{
> + /* No step is needed for neg induction. */
> + if (induction_type == vect_step_op_neg)
> + return NULL;
> +
> + tree t = unshare_expr (new_name);
> + gcc_assert (CONSTANT_CLASS_P (new_name)
> + || TREE_CODE (new_name) == SSA_NAME);
> + tree new_vec = build_vector_from_val (vectype, t);
> + tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
> + new_vec, vectype, NULL);
> + return vec_step;
> +}
> +
> +/* Update vectorized iv with vect_step, induc_def is init. */
> +static tree
> +vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
> + tree induc_def, tree vec_step,
> + enum vect_induction_op_type induction_type)
> +{
> + tree vec_def = induc_def;
> + switch (induction_type)
> + {
> + case vect_step_op_mul:
> + vec_def = gimple_build (stmts, MULT_EXPR, vectype,
> + vec_def, vec_step);
> + break;
> +
> + case vect_step_op_shr:
> + vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
> + vec_def, vec_step);
> + break;
> +
> + case vect_step_op_shl:
> + vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
> + vec_def, vec_step);
> + break;
> + case vect_step_op_neg:
> + vec_def = induc_def;
> + /* Do nothing. */
> + break;
> + default:
> + gcc_unreachable ();
> + }
> +
> + return vec_def;
> +
> +}
> +
> +/* Function vectorizable_nonlinear_induction
> +
> + Check if STMT_INFO performs a nonlinear induction computation that can be
> + vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
> + a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
> + basic block.
> + Return true if STMT_INFO is vectorizable in this way. */
> +
> +static bool
> +vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
> + stmt_vec_info stmt_info,
> + gimple **vec_stmt, slp_tree slp_node,
> + stmt_vector_for_cost *cost_vec)
> +{
> + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> + unsigned ncopies;
> + bool nested_in_vect_loop = false;
> + class loop *iv_loop;
> + tree vec_def;
> + edge pe = loop_preheader_edge (loop);
> + basic_block new_bb;
> + tree vec_init, vec_step;
> + tree new_name;
> + gimple *new_stmt;
> + gphi *induction_phi;
> + tree induc_def, vec_dest;
> + tree init_expr, step_expr;
> + poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> + unsigned i;
> + gimple_stmt_iterator si;
> +
> + gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
> +
> + tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> + poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> + enum vect_induction_op_type induction_type
> + = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
> +
> + gcc_assert (induction_type > vect_step_op_add);
> +
> + if (slp_node)
> + ncopies = 1;
> + else
> + ncopies = vect_get_num_copies (loop_vinfo, vectype);
> + gcc_assert (ncopies >= 1);
> +
> + /* FORNOW. Only handle nonlinear induction in the same loop. */
> + if (nested_in_vect_loop_p (loop, stmt_info))
> + {
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> + "nonlinear induction in nested loop.\n");
> + return false;
> + }
> +
> + iv_loop = loop;
> + gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
> +
> + if (slp_node)
> + {
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> + "SLP induction not supported for nonlinear"
> + " induction.\n");
> + return false;
> + }
> +
> + if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
> + {
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> + "Peel for alignment not supported for nonlinear"
> + " induction.\n");
Note LOOP_VINFO_MASK_SKIP_NITERS doesn't capture all cases of
prologue peeling. I _think_ that you eventually could check
LOOP_VINFO_EPILOGUE_P (is this a vectorized epilogue) and
LOOP_VINFO_PEELING_FOR_ALIGNMENT here instead. But do you
see any difficulties in supporting this?
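I.e. roughly this untested sketch of the suggested bail-out, to be refined:

  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
      || LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
    return false;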
> + return false;
> + }
> +
> + if (FLOAT_TYPE_P (vectype))
It's always better to state what is supported "positively", so please use
if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
> + {
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> + "floating point nonlinear induction vectorization"
> + " not supported.\n");
> + return false;
> + }
> +
> + step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
> + init_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info);
> + gcc_assert (step_expr != NULL_TREE && init_expr != NULL);
> + /* step_expr should be aligned with init_expr,
> + i.e. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used. */
> + step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
> + gcc_assert (TREE_CODE (init_expr) == INTEGER_CST
> + || (TYPE_CANONICAL (TREE_TYPE (vectype))
> + == TYPE_CANONICAL (TREE_TYPE (init_expr))));
use types_compatible_p (...) instead of comparing TYPE_CANONICAL.
A small enhancement would be to support different signedness
(use tree_nop_conversion_p then).
> + gcc_assert (TREE_CODE (step_expr) == INTEGER_CST);
> + init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
above you asserted that the conversion is only necessary for constants
but then fold_convert will also generate a tree NOP_EXPR for
some types_compatible_p types. So maybe only do this for INTEGER_CST
init_expr or use init_expr = gimple_convert (...) and insert required stmts
on the preheader.
> +
> + switch (induction_type)
> + {
> + case vect_step_op_neg:
> + if (TREE_CODE (init_expr) != INTEGER_CST
> + && TREE_CODE (init_expr) != REAL_CST)
> + {
> + /* Check for backend support of NEGATE_EXPR and vec_perm. */
> + if (!directly_supported_p (NEGATE_EXPR, vectype))
> + return false;
> +
> + /* The encoding has 2 interleaved stepped patterns. */
> + vec_perm_builder sel (nunits, 2, 3);
> + machine_mode mode = TYPE_MODE (vectype);
> + sel.quick_grow (6);
> + for (i = 0; i < 3; i++)
> + {
> + sel[i * 2] = i;
> + sel[i * 2 + 1] = i + nunits;
> + }
> + vec_perm_indices indices (sel, 2, nunits);
> + if (!can_vec_perm_const_p (mode, mode, indices))
> + return false;
> + }
> + break;
> +
> + case vect_step_op_mul:
> + {
> + /* Check for backend support of MULT_EXPR. */
> + if (!directly_supported_p (MULT_EXPR, vectype))
> + return false;
> +
> + /* ?? How to construct vector step for variable number vector.
> + [ 1, step, pow (step, 2), pow (step, 4), .. ]. */
> + if (!nunits.is_constant ())
> + return false;
> +
> + /* Make sure there's no overflow or overflow is defined. */
> + if (!TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (init_expr)))
> + break;
Alternatively you could perform the vector IV updates in an unsigned type?
> + wi::overflow_type overflow;
> + wide_int begin = wi::to_wide (step_expr);
> +
> + for (unsigned i = 0; i != nunits.to_constant () - 1; i++)
> + {
> + begin = wi::mul (begin, wi::to_wide (step_expr),
> + TYPE_SIGN (TREE_TYPE (init_expr)),
> + &overflow);
> + if (overflow)
> + return false;
> + }
> + }
> + break;
> +
> + case vect_step_op_shr:
> + /* Check for backend support of RSHIFT_EXPR. */
> + if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
> + return false;
> +
> + /* Don't shift more than type precision. */
> + if (!tree_fits_uhwi_p (step_expr)
> + || maybe_ge (nunits * tree_to_uhwi (step_expr),
> + TYPE_PRECISION (TREE_TYPE (init_expr))))
> + return false;
why's that necessary? can we do a MIN (vector_step, { prec-1, prec-1,
prec-1 ... })
to fix things up? The shift IVs degenerate quickly with a large niter (or VF).
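Something like (completely untested):
  tree prec_m1 = build_int_cst (TREE_TYPE (vectype),
                                TYPE_PRECISION (TREE_TYPE (vectype)) - 1);
  vec_step = gimple_build (stmts, MIN_EXPR, vectype, vec_step,
                           build_vector_from_val (vectype, prec_m1));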
> + break;
> +
> + case vect_step_op_shl:
> + /* Check for backend support of RSHIFT_EXPR. */
> + if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
> + return false;
> +
> + /* Don't shift more than type precision. */
> + if (!tree_fits_uhwi_p (step_expr)
> + || maybe_ge (nunits * tree_to_uhwi (step_expr),
> + TYPE_PRECISION (TREE_TYPE (init_expr))))
> + return false;
Likewise.
> +
> + break;
> +
> + default:
> + gcc_unreachable ();
> + }
> +
> + if (!vec_stmt) /* transformation not required. */
> + {
> + unsigned inside_cost = 0, prologue_cost = 0;
> + /* loop cost for vec_loop. Neg induction doesn't have any
> + inside_cost. */
> + inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
> + stmt_info, 0, vect_body);
> +
> + /* loop cost for vec_loop. Neg induction doesn't have any
> + inside_cost. */
> + if (induction_type == vect_step_op_neg)
> + inside_cost = 0;
> +
> + /* prologue cost for vec_init and vec_step. */
> + prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
> + stmt_info, 0, vect_prologue);
> +
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_NOTE, vect_location,
> + "vect_model_induction_cost: inside_cost = %d, "
> + "prologue_cost = %d. \n", inside_cost,
> + prologue_cost);
> +
> + STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
> + DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
> + return true;
> + }
> +
> + /* Transform. */
> +
> + /* Compute a vector variable, initialized with the first VF values of
> + the induction variable. E.g., for an iv with IV_PHI='X' and
> + evolution S, for a vector of 4 units, we want to compute:
> + [X, X + S, X + 2*S, X + 3*S]. */
> +
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
> +
> + pe = loop_preheader_edge (iv_loop);
> + /* Find the first insertion point in the BB. */
> + basic_block bb = gimple_bb (phi);
> + si = gsi_after_labels (bb);
> +
> + init_expr = vect_phi_initial_value (phi);
> +
> + gimple_seq stmts = NULL;
> + vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
> + step_expr, nunits, vectype,
> + induction_type);
> + if (stmts)
> + {
> + new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
> + gcc_assert (!new_bb);
> + }
> +
> + stmts = NULL;
> + new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
> + vf, induction_type);
> + if (stmts)
> + {
> + new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
> + gcc_assert (!new_bb);
> + }
> +
> + vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
> + new_name, vectype,
> + induction_type);
> + /* Create the following def-use cycle:
> + loop prolog:
> + vec_init = ...
> + vec_step = ...
> + loop:
> + vec_iv = PHI <vec_init, vec_loop>
> + ...
> + STMT
> + ...
> + vec_loop = vec_iv + vec_step; */
> +
> + /* Create the induction-phi that defines the induction-operand. */
> + vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
> + induction_phi = create_phi_node (vec_dest, iv_loop->header);
> + induc_def = PHI_RESULT (induction_phi);
> +
> + /* Create the iv update inside the loop. */
> + stmts = NULL;
> + vec_def = vect_update_nonlinear_iv (&stmts, vectype,
> + induc_def, vec_step,
> + induction_type);
> +
> + gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
> + new_stmt = SSA_NAME_DEF_STMT (vec_def);
> +
> + /* Set the arguments of the phi node: */
> + add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
> + add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
> + UNKNOWN_LOCATION);
> +
> + STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
> + *vec_stmt = induction_phi;
> +
> + /* In case that vectorization factor (VF) is bigger than the number
> + of elements that we can fit in a vectype (nunits), we have to generate
> + more than one vector stmt - i.e - we need to "unroll" the
> + vector stmt by a factor VF/nunits. For more details see documentation
> + in vectorizable_operation. */
> +
> + if (ncopies > 1)
> + {
> + stmts = NULL;
> + /* FORNOW. This restriction should be relaxed. */
> + gcc_assert (!nested_in_vect_loop);
> +
> + new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
> + nunits, induction_type);
> +
> + vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
> + new_name, vectype,
> + induction_type);
are these not the same as created above?
> + vec_def = induc_def;
> + for (i = 1; i < ncopies; i++)
> + {
> + /* vec_i = vec_prev + vec_step. */
> + stmts = NULL;
> + vec_def = vect_update_nonlinear_iv (&stmts, vectype,
> + vec_def, vec_step,
> + induction_type);
I think you need to update the PHI latch definition to the last vec_i?
> + gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
> + new_stmt = SSA_NAME_DEF_STMT (vec_def);
> + STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> + }
> + }
> +
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_NOTE, vect_location,
> + "transform induction: created def-use cycle: %G%G",
> + induction_phi, SSA_NAME_DEF_STMT (vec_def));
> +
> + return true;
> +}
> +
> /* Function vectorizable_induction
>
> Check if STMT_INFO performs an induction computation that can be vectorized.
> @@ -8259,6 +8846,8 @@ vectorizable_induction (loop_vec_info loop_vinfo,
> unsigned i;
> tree expr;
> gimple_stmt_iterator si;
> + enum vect_induction_op_type induction_type
> + = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
>
> gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
> if (!phi)
> @@ -8271,6 +8860,11 @@ vectorizable_induction (loop_vec_info loop_vinfo,
> if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
> return false;
>
> + /* Handle nonlinear induction in a separate place. */
> + if (induction_type != vect_step_op_add)
> + return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
> + vec_stmt, slp_node, cost_vec);
> +
> tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
>
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index e5fdc9e0a14..0c7f1a32070 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -68,6 +68,15 @@ enum vect_def_type {
> vect_unknown_def_type
> };
>
> +/* Define operation type of linear/non-linear induction variable. */
> +enum vect_induction_op_type {
> + vect_step_op_add = 0,
> + vect_step_op_neg,
> + vect_step_op_mul,
> + vect_step_op_shl,
> + vect_step_op_shr
> +};
> +
> /* Define type of reduction. */
> enum vect_reduction_type {
> TREE_CODE_REDUCTION,
> @@ -1188,6 +1197,7 @@ public:
> the version here. */
> tree loop_phi_evolution_base_unchanged;
> tree loop_phi_evolution_part;
> + enum vect_induction_op_type loop_phi_evolution_type;
>
> /* Used for various bookkeeping purposes, generally holding a pointer to
> some other stmt S that is in some way "related" to this stmt.
> @@ -1421,6 +1431,7 @@ struct gather_scatter_info {
> ((S)->dr_aux.dr && DR_GROUP_FIRST_ELEMENT(S))
> #define STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED(S) (S)->loop_phi_evolution_base_unchanged
> #define STMT_VINFO_LOOP_PHI_EVOLUTION_PART(S) (S)->loop_phi_evolution_part
> +#define STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE(S) (S)->loop_phi_evolution_type
> #define STMT_VINFO_MIN_NEG_DIST(S) (S)->min_neg_dist
> #define STMT_VINFO_REDUC_TYPE(S) (S)->reduc_type
> #define STMT_VINFO_REDUC_CODE(S) (S)->reduc_code
> --
> 2.18.1
>
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [RFC: PATCH] Extend vectorizer to handle nonlinear induction for neg, mul/lshift/rshift with a constant.
2022-08-04 8:18 ` Richard Biener
@ 2022-08-04 10:09 ` Richard Sandiford
2022-08-05 5:21 ` Hongtao Liu
2022-08-29 5:24 ` [PATCH V2] " liuhongt
2 siblings, 0 replies; 7+ messages in thread
From: Richard Sandiford @ 2022-08-04 10:09 UTC (permalink / raw)
To: Richard Biener; +Cc: liuhongt, GCC Patches
Richard Biener <richard.guenther@gmail.com> writes:
>> +/* Create vector init for vectorized iv. */
>> +static tree
>> +vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
>> + tree step_expr, poly_uint64 nunits,
>> + tree vectype,
>> + enum vect_induction_op_type induction_type)
>> +{
>> + unsigned HOST_WIDE_INT const_nunits;
>> + tree vec_shift, vec_init, new_name;
>> + unsigned i;
>> +
>> + /* iv_loop is the loop to be vectorized. Create:
>> + vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
>> + new_name = init_expr;
>> + switch (induction_type)
>> + {
>> + case vect_step_op_shr:
>> + case vect_step_op_shl:
>> + /* Build the Initial value from shift_expr. */
>> + vec_init = gimple_build_vector_from_val (stmts,
>> + vectype,
>> + new_name);
>> + vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
>> + build_zero_cst (TREE_TYPE (step_expr)),
>> + step_expr);
>
> There might be a more canonical way to build the series expr - Richard?
build_vec_series is shorter if step_expr is known to be a constant.
The above looks good for the general case.
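I.e., assuming step_expr is a constant, something like (untested):
  vec_shift = build_vec_series (vectype,
                                build_zero_cst (TREE_TYPE (step_expr)),
                                step_expr);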
Thanks,
Richard
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [RFC: PATCH] Extend vectorizer to handle nonlinear induction for neg, mul/lshift/rshift with a constant.
2022-08-04 8:18 ` Richard Biener
2022-08-04 10:09 ` Richard Sandiford
@ 2022-08-05 5:21 ` Hongtao Liu
2022-08-29 5:24 ` [PATCH V2] " liuhongt
2 siblings, 0 replies; 7+ messages in thread
From: Hongtao Liu @ 2022-08-05 5:21 UTC (permalink / raw)
To: Richard Biener; +Cc: liuhongt, Richard Sandiford, GCC Patches
On Thu, Aug 4, 2022 at 4:19 PM Richard Biener via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Thu, Aug 4, 2022 at 6:29 AM liuhongt via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > For neg, the patch create a vec_init as [ a, -a, a, -a, ... ] and no
> > vec_step is needed to update vectorized iv since vf is always multiple
> > of 2(negative * negative is positive).
> >
> > For shift, the patch create a vec_init as [ a, a >> c, a >> 2*c, ..]
> > as vec_step as [ c * nunits, c * nunits, c * nunits, ... ], vectorized iv is
> > updated as vec_def = vec_init >>/<< vec_step.
> >
> > For mul, the patch create a vec_init as [ a, a * c, a * pow(c, 2), ..]
> > as vec_step as [ pow(c,nunits), pow(c,nunits),...] iv is updated as vec_def =
> > vec_init * vec_step.
> >
> > The patch handles nonlinear iv for
> > 1. Integer type only, floating point is not handled.
> > 2. No slp_node.
> > 3. iv_loop should be same as vector loop, not nested loop.
> > 4. No UD is created, for mul, no UD overlow for pow (c, vf), for
> > shift, shift count should be less than type precision.
> >
> > Bootstrapped and regression tested on x86_64-pc-linux-gnu{-m32,}.
> > There's some cases observed in SPEC2017, but no big performance impact.
> >
> > Any comments?
>
> Looks good overall - a few comments inline. Also can you please add
> SLP support?
> I've tried hard to fill in gaps where SLP support is missing since my
> goal is still to get
> rid of non-SLP.
>
> > gcc/ChangeLog:
> >
> > PR tree-optimization/103144
> > * tree-vect-loop.cc (vect_is_nonlinear_iv_evolution): New function.
> > (vect_analyze_scalar_cycles_1): Detect nonlinear iv by upper function.
> > (vect_create_nonlinear_iv_init): New function.
> > (vect_create_nonlinear_iv_step): Ditto
> > (vect_create_nonlinear_iv_vec_step): Ditto
> > (vect_update_nonlinear_iv): Ditto
> > (vectorizable_nonlinear_induction): Ditto.
> > (vectorizable_induction): Call
> > vectorizable_nonlinear_induction when induction_type is not
> > vect_step_op_add.
> > * tree-vectorizer.h (enum vect_induction_op_type): New enum.
> > (STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE): New Macro.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/i386/pr103144-mul-1.c: New test.
> > * gcc.target/i386/pr103144-mul-2.c: New test.
> > * gcc.target/i386/pr103144-neg-1.c: New test.
> > * gcc.target/i386/pr103144-neg-2.c: New test.
> > * gcc.target/i386/pr103144-shift-1.c: New test.
> > * gcc.target/i386/pr103144-shift-2.c: New test.
> > ---
> > .../gcc.target/i386/pr103144-mul-1.c | 25 +
> > .../gcc.target/i386/pr103144-mul-2.c | 43 ++
> > .../gcc.target/i386/pr103144-neg-1.c | 25 +
> > .../gcc.target/i386/pr103144-neg-2.c | 36 ++
> > .../gcc.target/i386/pr103144-shift-1.c | 34 +
> > .../gcc.target/i386/pr103144-shift-2.c | 61 ++
> > gcc/tree-vect-loop.cc | 604 +++++++++++++++++-
> > gcc/tree-vectorizer.h | 11 +
> > 8 files changed, 834 insertions(+), 5 deletions(-)
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-neg-1.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-neg-2.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-shift-1.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-shift-2.c
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
> > new file mode 100644
> > index 00000000000..2357541d95d
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
> > @@ -0,0 +1,25 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
> > +
> > +#define N 10000
> > +void
> > +foo_mul (int* a, int b)
> > +{
> > + for (int i = 0; i != N; i++)
> > + {
> > + a[i] = b;
> > + b *= 3;
> > + }
> > +}
> > +
> > +void
> > +foo_mul_const (int* a)
> > +{
> > + int b = 1;
> > + for (int i = 0; i != N; i++)
> > + {
> > + a[i] = b;
> > + b *= 3;
> > + }
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
> > new file mode 100644
> > index 00000000000..4ea53e44658
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
> > @@ -0,0 +1,43 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx2 } */
> > +
> > +#include "avx2-check.h"
> > +#include <string.h>
> > +#include "pr103144-mul-1.c"
> > +
> > +typedef int v8si __attribute__((vector_size(32)));
> > +
> > +void
> > +avx2_test (void)
> > +{
> > + int* epi32_exp = (int*) malloc (N * sizeof (int));
> > + int* epi32_dst = (int*) malloc (N * sizeof (int));
> > +
> > + __builtin_memset (epi32_exp, 0, N * sizeof (int));
> > + int b = 8;
> > + v8si init = __extension__(v8si) { b, b * 3, b * 9, b * 27, b * 81, b * 243, b * 729, b * 2187 };
> > +
> > + for (int i = 0; i != N / 8; i++)
> > + {
> > + memcpy (epi32_exp + i * 8, &init, 32);
> > + init *= 6561;
> > + }
> > +
> > + foo_mul (epi32_dst, b);
> > + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
> > + __builtin_abort ();
> > +
> > + init = __extension__(v8si) { 1, 3, 9, 27, 81, 243, 729, 2187 };
> > + for (int i = 0; i != N / 8; i++)
> > + {
> > + memcpy (epi32_exp + i * 8, &init, 32);
> > + init *= 6561;
> > + }
> > +
> > + foo_mul_const (epi32_dst);
> > + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
> > + __builtin_abort ();
> > +
> > + return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c b/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c
> > new file mode 100644
> > index 00000000000..63a3331563d
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c
> > @@ -0,0 +1,25 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
> > +
> > +#define N 10000
> > +void
> > +foo_neg (int* a, int b)
> > +{
> > + for (int i = 0; i != N; i++)
> > + {
> > + a[i] = b;
> > + b = -b;
> > + }
> > +}
> > +
> > +void
> > +foo_neg_const (int* a)
> > +{
> > + int b = 1;
> > + for (int i = 0; i != N; i++)
> > + {
> > + a[i] = b;
> > + b = -b;
> > + }
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c b/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c
> > new file mode 100644
> > index 00000000000..13bc61cdce7
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c
> > @@ -0,0 +1,36 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx2 } */
> > +
> > +#include "avx2-check.h"
> > +#include <string.h>
> > +#include "pr103144-neg-1.c"
> > +
> > +void
> > +avx2_test (void)
> > +{
> > + int* epi32_exp = (int*) malloc (N * sizeof (int));
> > + int* epi32_dst = (int*) malloc (N * sizeof (int));
> > + long long* epi64_exp = (long long*) malloc (N * sizeof (int));
> > +
> > + __builtin_memset (epi32_exp, 0, N * sizeof (int));
> > + int b = 100;
> > +
> > + for (int i = 0; i != N / 2; i++)
> > + epi64_exp[i] = ((long long) b) | (((long long) -b) << 32);
> > +
> > + memcpy (epi32_exp, epi64_exp, N * sizeof (int));
> > + foo_neg (epi32_dst, b);
> > + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
> > + __builtin_abort ();
> > +
> > + for (int i = 0; i != N / 2; i++)
> > + epi64_exp[i] = ((long long) 1) | (((long long) -1) << 32);
> > +
> > + memcpy (epi32_exp, epi64_exp, N * sizeof (int));
> > + foo_neg_const (epi32_dst);
> > + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
> > + __builtin_abort ();
> > +
> > + return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c b/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c
> > new file mode 100644
> > index 00000000000..51531495d60
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c
> > @@ -0,0 +1,34 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 3 "vect" } } */
> > +
> > +#define N 10000
> > +void
> > +foo_shl (int* a, int b)
> > +{
> > + for (int i = 0; i != N; i++)
> > + {
> > + a[i] = b;
> > + b <<= 1;
> > + }
> > +}
> > +
> > +void
> > +foo_ashr (int* a, int b)
> > +{
> > + for (int i = 0; i != N; i++)
> > + {
> > + a[i] = b;
> > + b >>= 1;
> > + }
> > +}
> > +
> > +void
> > +foo_lshr (unsigned int* a, unsigned int b)
> > +{
> > + for (int i = 0; i != N; i++)
> > + {
> > + a[i] = b;
> > + b >>= 1U;
> > + }
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c b/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c
> > new file mode 100644
> > index 00000000000..02deda6d1ab
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c
> > @@ -0,0 +1,61 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx2 } */
> > +
> > +#include "avx2-check.h"
> > +#include <string.h>
> > +#include "pr103144-shift-1.c"
> > +
> > +typedef int v8si __attribute__((vector_size(32)));
> > +typedef unsigned int v8usi __attribute__((vector_size(32)));
> > +
> > +void
> > +avx2_test (void)
> > +{
> > + int* epi32_exp = (int*) malloc (N * sizeof (int));
> > + int* epi32_dst = (int*) malloc (N * sizeof (int));
> > + unsigned int* epu32_exp = (unsigned int*) malloc (N * sizeof (int));
> > + unsigned int* epu32_dst = (unsigned int*) malloc (N * sizeof (int));
> > +
> > +
> > + __builtin_memset (epi32_exp, 0, N * sizeof (int));
> > + int b = 8;
> > + v8si init = __extension__(v8si) { b, b << 1, b << 2, b << 3, b << 4, b << 5, b << 6, b << 7 };
> > +
> > + for (int i = 0; i != N / 8; i++)
> > + {
> > + memcpy (epi32_exp + i * 8, &init, 32);
> > + init <<= 8;
> > + }
> > +
> > + foo_shl (epi32_dst, b);
> > + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
> > + __builtin_abort ();
> > +
> > + b = 11111111;
> > + init = __extension__(v8si) { b, b >> 1, b >> 2, b >> 3, b >> 4, b >> 5, b >> 6, b >> 7 };
> > + for (int i = 0; i != N / 8; i++)
> > + {
> > + memcpy (epi32_exp + i * 8, &init, 32);
> > + init >>= 8;
> > + }
> > +
> > + foo_ashr (epi32_dst, b);
> > + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
> > + __builtin_abort ();
> > +
> > + __builtin_memset (epu32_exp, 0, N * sizeof (int));
> > + unsigned int c = 11111111;
> > + v8usi initu = __extension__(v8usi) { c, c >> 1U, c >> 2U, c >> 3U, c >> 4U, c >> 5U, c >> 6U, c >> 7U };
> > + for (int i = 0; i != N / 8; i++)
> > + {
> > + memcpy (epu32_exp + i * 8, &initu, 32);
> > + initu >>= 8U;
> > + }
> > +
> > + foo_lshr (epu32_dst, c);
> > + if (__builtin_memcmp (epu32_dst, epu32_exp, N * sizeof (int)) != 0)
> > + __builtin_abort ();
> > +
> > + return;
> > +}
> > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > index 2257b29a652..fc2a1584048 100644
> > --- a/gcc/tree-vect-loop.cc
> > +++ b/gcc/tree-vect-loop.cc
> > @@ -425,6 +425,98 @@ vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
> > return true;
> > }
> >
> > +static bool
> > +vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
> > + gphi* loop_phi_node, tree *init, tree *step)
> > +{
> > + tree init_expr, ev_expr, result, op1, op2;
> > + basic_block bb1, bb2;
> > + gimple* def;
> > + imm_use_iterator imm_iter;
> > + use_operand_p use_p;
> > +
> > +
> > + gcc_assert (gimple_code (loop_phi_node) == GIMPLE_PHI);
>
> gcc_assert (is_a <gphi *> (loop_phi_node));
>
> is shorter
>
> > +
> > + if (gimple_phi_num_args (loop_phi_node) != 2)
> > + return false;
> > +
> > + init_expr = PHI_ARG_DEF (loop_phi_node, 0);
> > + ev_expr = PHI_ARG_DEF (loop_phi_node, 1);
>
> I think you should use
>
> init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node,loop_preheader_edge (loop));
> and loop_latch_edge (loop) for ev_expr, you can't rely on them being arg 0 / 1.
>
> > +
> > + /* Only support nonlinear induction for integer. */
> > + if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
> > + return false;
> > +
> > + bb1 = gimple_phi_arg_edge (loop_phi_node, 0)->src;
> > + bb2 = gimple_phi_arg_edge (loop_phi_node, 1)->src;
>
> Likewise. Use preheader/latch edge.
>
> > +
> > + /* One outside the loop, the other inside the loop. */
> > + if (!(flow_bb_inside_loop_p (loop, bb1) ^ flow_bb_inside_loop_p (loop, bb2)))
> > + return false;
> > +
> > + if (flow_bb_inside_loop_p (loop, bb1))
> > + std::swap (init_expr, ev_expr);
>
> and those two should then go away.
>
> > +
> > + if (TREE_CODE (ev_expr) != SSA_NAME
> > + || !has_single_use (ev_expr)
>
> A comment might be nice - this makes sure we only
> see post-increment uses. But that also means
>
> for(;;)
> {
> b = -b;
> a[i] = b;
> }
>
> is not vectorized? I think it should be possible to relax
> this.
>
> > + || !(def = SSA_NAME_DEF_STMT (ev_expr))
>
> def is never NULL so a cheaper way to write this is
>
> || ((def = SSA_NAME_DEF_STMT (ev_expr)), true)
>
> > + || !is_gimple_assign (def))
> > + return false;
> > +
> > + *init = init_expr;
> > + result = PHI_RESULT (loop_phi_node);
> > +
> > + /* final_value_replacement_loop should handle outside loop use. */
> > + FOR_EACH_IMM_USE_FAST (use_p, imm_iter, result)
> > + {
> > + gimple *use_stmt;
> > + use_stmt = USE_STMT (use_p);
> > + if (is_gimple_debug (use_stmt))
> > + continue;
> > + if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
> > + return false;
> > + }
>
> not sure if we need to bother here - ideally vectorizable_live_operation would
> give up instead (but I suppose the regular IV / induction analysis gives up
> here as well?)
>
> > +
> > + enum tree_code t_code = gimple_assign_rhs_code (def);
> > + switch (t_code)
> > + {
> > + case NEGATE_EXPR:
> > + if (gimple_assign_rhs1 (def) != result)
> > + return false;
> > + *step = build_int_cst (TREE_TYPE (init_expr), -1);
> > + STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
> > + break;
> > +
> > + case RSHIFT_EXPR:
> > + case LSHIFT_EXPR:
> > + case MULT_EXPR:
> > + op1 = gimple_assign_rhs1 (def);
> > + op2 = gimple_assign_rhs2 (def);
> > + if (TREE_CODE (op2) != INTEGER_CST
> > + || op1 != result)
> > + return false;
> > + *step = op2;
> > + break;
> > +
> > + default:
> > + return false;
> > + }
> > +
> > + if (t_code == LSHIFT_EXPR)
> > + STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
> > + else if (t_code == RSHIFT_EXPR)
> > + STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
> > + /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
> > + else if (t_code == MULT_EXPR)
> > + STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
>
> the above can at least go into the combined switch case
>
> > + STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
>
> Seeing this - did you check whether you handle prologue peeling correctly? The
> same issue might show up with a vectorized epilogue. I think you can force a
> peeled prologue with storing unaligned and -fno-vect-cost-model (that IIRC will
> simply optimize for the larger number of aligned memory ops)
Oh, I forgot to update vect_update_ivs_after_vectorizer.
>
> > + STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
> > +
> > + return true;
> > +}
> > +
> > /* Return true if PHI, described by STMT_INFO, is the inner PHI in
> > what we are assuming is a double reduction. For example, given
> > a structure like this:
> > @@ -512,11 +604,16 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
> > = evolution_part_in_loop_num (access_fn, loop->num);
> > }
> >
> > - if (!access_fn
> > - || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
> > - || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
> > - || (LOOP_VINFO_LOOP (loop_vinfo) != loop
> > - && TREE_CODE (step) != INTEGER_CST))
> > + if ((!access_fn
> > + || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
> > + || !vect_is_simple_iv_evolution (loop->num, access_fn,
> > + &init, &step)
> > + || (LOOP_VINFO_LOOP (loop_vinfo) != loop
> > + && TREE_CODE (step) != INTEGER_CST))
> > + /* Only handle nonlinear iv for same loop. */
> > + && (!vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
> > + phi, &init, &step)
> > + || LOOP_VINFO_LOOP (loop_vinfo) != loop))
>
> since you only handle inner loop nonlinear IVs you should probably
> swap the two checks?
>
> > {
> > worklist.safe_push (stmt_vinfo);
> > continue;
> > @@ -8229,6 +8326,496 @@ vect_can_vectorize_without_simd_p (code_helper code)
> > && vect_can_vectorize_without_simd_p (tree_code (code)));
> > }
> >
> > +/* Create vector init for vectorized iv. */
> > +static tree
> > +vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
> > + tree step_expr, poly_uint64 nunits,
> > + tree vectype,
> > + enum vect_induction_op_type induction_type)
> > +{
> > + unsigned HOST_WIDE_INT const_nunits;
> > + tree vec_shift, vec_init, new_name;
> > + unsigned i;
> > +
> > + /* iv_loop is the loop to be vectorized. Create:
> > + vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
> > + new_name = init_expr;
> > + switch (induction_type)
> > + {
> > + case vect_step_op_shr:
> > + case vect_step_op_shl:
> > + /* Build the Initial value from shift_expr. */
> > + vec_init = gimple_build_vector_from_val (stmts,
> > + vectype,
> > + new_name);
> > + vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
> > + build_zero_cst (TREE_TYPE (step_expr)),
> > + step_expr);
>
> There might be a more canonical way to build the series expr - Richard?
>
> > + vec_init = gimple_build (stmts,
> > + (induction_type == vect_step_op_shr
> > + ? RSHIFT_EXPR : LSHIFT_EXPR),
> > + vectype, vec_init, vec_shift);
> > + break;
> > +
> > + case vect_step_op_neg:
> > + {
> > + vec_init = gimple_build_vector_from_val (stmts,
> > + vectype,
> > + new_name);
> > + tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
> > + vectype, vec_init);
> > + /* The encoding has 2 interleaved stepped patterns. */
> > + vec_perm_builder sel (nunits, 2, 3);
> > + sel.quick_grow (6);
> > + for (i = 0; i < 3; i++)
> > + {
> > + sel[2 * i] = i;
> > + sel[2 * i + 1] = i + nunits;
> > + }
> > + vec_perm_indices indices (sel, 2, nunits);
> > + tree perm_mask_even
> > + = vect_gen_perm_mask_checked (vectype, indices);
> > + vec_init = gimple_build (stmts, VEC_PERM_EXPR,
> > + vectype,
> > + vec_init, vec_neg,
> > + perm_mask_even);
> > + }
> > + break;
> > +
> > + case vect_step_op_mul:
> > + {
> > + gcc_assert (nunits.is_constant (&const_nunits));
> > + vec_init = gimple_build_vector_from_val (stmts,
> > + vectype,
> > + new_name);
> > + tree_vector_builder elts (vectype, const_nunits, 1);
> > + tree elt_step = build_one_cst (TREE_TYPE (step_expr));
> > +
> > + elts.quick_push (elt_step);
> > + for (i = 1; i < const_nunits; i++)
> > + {
> > + /* Create: new_name_i = new_name + step_expr. */
> > + elt_step = gimple_build (stmts, MULT_EXPR,
> > + TREE_TYPE (step_expr),
> > + elt_step, step_expr);
>
> You probably should perform this multiplication in an unsigned type to avoid
> overflow.
>
> > + elts.quick_push (elt_step);
> > + }
> > + /* Create a vector from [new_name_0, new_name_1, ...,
> > + new_name_nunits-1]. */
> > + tree vec_mul = gimple_build_vector (stmts, &elts);
> > + vec_init = gimple_build (stmts, MULT_EXPR, vectype,
> > + vec_init, vec_mul);
> > + }
> > + break;
> > +
> > + default:
> > + gcc_unreachable ();
> > + }
> > +
> > + return vec_init;
> > +}
> > +
> > +/* Create vector step for vectorized iv. */
> > +static tree
> > +vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
> > + poly_uint64 vf,
> > + enum vect_induction_op_type induction_type)
> > +{
> > + tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
> > + tree new_name = NULL;
> > + /* Step should be pow (step, vf) for mult induction. */
> > + if (induction_type == vect_step_op_mul)
> > + {
> > + gcc_assert (vf.is_constant ());
> > + wide_int begin = wi::to_wide (step_expr);
> > +
> > + for (unsigned i = 0; i != vf.to_constant () - 1; i++)
> > + begin = wi::mul (begin, wi::to_wide (step_expr));
> > +
> > + new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
> > + }
> > + else if (induction_type == vect_step_op_neg)
> > + /* Do nothing. */
> > + ;
> > + else
> > + new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
> > + expr, step_expr);
> > + return new_name;
> > +}
> > +
> > +static tree
> > +vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
> > + stmt_vec_info stmt_info,
> > + tree new_name, tree vectype,
> > + enum vect_induction_op_type induction_type)
> > +{
> > + /* No step is needed for neg induction. */
> > + if (induction_type == vect_step_op_neg)
> > + return NULL;
> > +
> > + tree t = unshare_expr (new_name);
> > + gcc_assert (CONSTANT_CLASS_P (new_name)
> > + || TREE_CODE (new_name) == SSA_NAME);
> > + tree new_vec = build_vector_from_val (vectype, t);
> > + tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
> > + new_vec, vectype, NULL);
> > + return vec_step;
> > +}
> > +
> > +/* Update vectorized iv with vect_step, induc_def is init. */
> > +static tree
> > +vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
> > + tree induc_def, tree vec_step,
> > + enum vect_induction_op_type induction_type)
> > +{
> > + tree vec_def = induc_def;
> > + switch (induction_type)
> > + {
> > + case vect_step_op_mul:
> > + vec_def = gimple_build (stmts, MULT_EXPR, vectype,
> > + vec_def, vec_step);
> > + break;
> > +
> > + case vect_step_op_shr:
> > + vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
> > + vec_def, vec_step);
> > + break;
> > +
> > + case vect_step_op_shl:
> > + vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
> > + vec_def, vec_step);
> > + break;
> > + case vect_step_op_neg:
> > + vec_def = induc_def;
> > + /* Do nothing. */
> > + break;
> > + default:
> > + gcc_unreachable ();
> > + }
> > +
> > + return vec_def;
> > +
> > +}
> > +/* Function vectorizable_induction
> > +
> > + Check if STMT_INFO performs an nonlinear induction computation that can be
> > + vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
> > + a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
> > + basic block.
> > + Return true if STMT_INFO is vectorizable in this way. */
> > +
> > +static bool
> > +vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
> > + stmt_vec_info stmt_info,
> > + gimple **vec_stmt, slp_tree slp_node,
> > + stmt_vector_for_cost *cost_vec)
> > +{
> > + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> > + unsigned ncopies;
> > + bool nested_in_vect_loop = false;
> > + class loop *iv_loop;
> > + tree vec_def;
> > + edge pe = loop_preheader_edge (loop);
> > + basic_block new_bb;
> > + tree vec_init, vec_step;
> > + tree new_name;
> > + gimple *new_stmt;
> > + gphi *induction_phi;
> > + tree induc_def, vec_dest;
> > + tree init_expr, step_expr;
> > + poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> > + unsigned i;
> > + gimple_stmt_iterator si;
> > +
> > + gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
> > +
> > + tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> > + poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> > + enum vect_induction_op_type induction_type
> > + = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
> > +
> > + gcc_assert (induction_type > vect_step_op_add);
> > +
> > + if (slp_node)
> > + ncopies = 1;
> > + else
> > + ncopies = vect_get_num_copies (loop_vinfo, vectype);
> > + gcc_assert (ncopies >= 1);
> > +
> > + /* FORNOW. Only handle nonlinear induction in the same loop. */
> > + if (nested_in_vect_loop_p (loop, stmt_info))
> > + {
> > + if (dump_enabled_p ())
> > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > + "nonlinear induction in nested loop.\n");
> > + return false;
> > + }
> > +
> > + iv_loop = loop;
> > + gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
> > +
> > + if (slp_node)
> > + {
> > + if (dump_enabled_p ())
> > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > + "SLP induction not supported for nonlinear"
> > + " induction.\n");
> > + return false;
> > + }
> > +
> > + if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
> > + {
> > + if (dump_enabled_p ())
> > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > + "Peel for alignment not supported for nonlinear"
> > + " induction.\n");
>
> Note LOOP_VINFO_MASK_SKIP_NITERS doesn't capture all cases of
> prologue peeling. I _think_ that you eventually could check
> LOOP_VINFO_EPILOGUE_P (is this a vectorized epilouge) and
> LOOP_VINFO_PEELING_FOR_ALIGNMENT here instead. But do you
> see any difficulties in supporting this?
>
> > + return false;
> > + }
> > +
> > + if (FLOAT_TYPE_P (vectype))
>
> It's always better to state what is supported "positively", so please use
>
> if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
>
> > + {
> > + if (dump_enabled_p ())
> > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > + "floating point nonlinear induction vectorization"
> > + " not supported.\n");
> > + return false;
> > + }
> > +
> > + step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
> > + init_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info);
> > + gcc_assert (step_expr != NULL_TREE && init_expr != NULL);
> > + /* step_expr should be aligned with init_expr,
> > + .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used. */
> > + step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
> > + gcc_assert (TREE_CODE (init_expr) == INTEGER_CST
> > + || (TYPE_CANONICAL (TREE_TYPE (vectype))
> > + == TYPE_CANONICAL (TREE_TYPE (init_expr))));
>
> use types_compatible_p (...) instead of comparing TYPE_CANONICAL.
> A small enhancement would be to support different signedness
> (use tree_nop_conversion_p then).
>
> > + gcc_assert (TREE_CODE (step_expr) == INTEGER_CST);
> > + init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
>
> above you asserted that the conversion is only necessary for constants
> but then fold_convert will also generate a tree NOP_EXPR for
> some types_compatible_p types. So maybe only do this for INTEGER_CST
> init_expr or use init_expr = gimple_convert (...) and insert required stmts
> on the preheader.
>
> > +
> > + switch (induction_type)
> > + {
> > + case vect_step_op_neg:
> > + if (TREE_CODE (init_expr) != INTEGER_CST
> > + && TREE_CODE (init_expr) != REAL_CST)
> > + {
> > + /* Check for backend support of NEGATE_EXPR and vec_perm. */
> > + if (!directly_supported_p (NEGATE_EXPR, vectype))
> > + return false;
> > +
> > + /* The encoding has 2 interleaved stepped patterns. */
> > + vec_perm_builder sel (nunits, 2, 3);
> > + machine_mode mode = TYPE_MODE (vectype);
> > + sel.quick_grow (6);
> > + for (i = 0; i < 3; i++)
> > + {
> > + sel[i * 2] = i;
> > + sel[i * 2 + 1] = i + nunits;
> > + }
> > + vec_perm_indices indices (sel, 2, nunits);
> > + if (!can_vec_perm_const_p (mode, mode, indices))
> > + return false;
> > + }
> > + break;
> > +
> > + case vect_step_op_mul:
> > + {
> > + /* Check for backend support of MULT_EXPR. */
> > + if (!directly_supported_p (MULT_EXPR, vectype))
> > + return false;
> > +
> > + /* ?? How to construct vector step for variable number vector.
> > + [ 1, step, pow (step, 2), pow (step, 4), .. ]. */
> > + if (!nunits.is_constant ())
> > + return false;
> > +
> > + /* Make sure there's no overflow or overflow is defined. */
> > + if (!TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (init_expr)))
> > + break;
>
> Alternatively you could perform the vector IV updates in an unsigned type?
>
> > + wi::overflow_type overflow;
> > + wide_int begin = wi::to_wide (step_expr);
> > +
> > + for (unsigned i = 0; i != nunits.to_constant () - 1; i++)
> > + {
> > + begin = wi::mul (begin, wi::to_wide (step_expr),
> > + TYPE_SIGN (TREE_TYPE (init_expr)),
> > + &overflow);
> > + if (overflow)
> > + return false;
> > + }
> > + }
> > + break;
> > +
> > + case vect_step_op_shr:
> > + /* Check for backend support of RSHIFT_EXPR. */
> > + if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
> > + return false;
> > +
> > + /* Don't shift more than type precision. */
> > + if (!tree_fits_uhwi_p (step_expr)
> > + || maybe_ge (nunits * tree_to_uhwi (step_expr),
> > + TYPE_PRECISION (TREE_TYPE (init_expr))))
> > + return false;
>
> why's that necessary? can we do a MIN (vector_step, { prec-1, prec-1,
> prec-1 ... })
> to fix things up? The shift IVs degenerate quickly with a large niter (or VF).
>
> > + break;
> > +
> > + case vect_step_op_shl:
> > + /* Check for backend support of RSHIFT_EXPR. */
> > + if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
> > + return false;
> > +
> > + /* Don't shift more than type precision. */
> > + if (!tree_fits_uhwi_p (step_expr)
> > + || maybe_ge (nunits * tree_to_uhwi (step_expr),
> > + TYPE_PRECISION (TREE_TYPE (init_expr))))
> > + return false;
>
> Likewise.
>
> > +
> > + break;
> > +
> > + default:
> > + gcc_unreachable ();
> > + }
> > +
> > + if (!vec_stmt) /* transformation not required. */
> > + {
> > + unsigned inside_cost = 0, prologue_cost = 0;
> > + /* loop cost for vec_loop. Neg induction doesn't have any
> > + inside_cost. */
> > + inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
> > + stmt_info, 0, vect_body);
> > +
> > + /* loop cost for vec_loop. Neg induction doesn't have any
> > + inside_cost. */
> > + if (induction_type == vect_step_op_neg)
> > + inside_cost = 0;
> > +
> > + /* prologue cost for vec_init and vec_step. */
> > + prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
> > + stmt_info, 0, vect_prologue);
> > +
> > + if (dump_enabled_p ())
> > + dump_printf_loc (MSG_NOTE, vect_location,
> > + "vect_model_induction_cost: inside_cost = %d, "
> > + "prologue_cost = %d. \n", inside_cost,
> > + prologue_cost);
> > +
> > + STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
> > + DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
> > + return true;
> > + }
> > +
> > + /* Transform. */
> > +
> > + /* Compute a vector variable, initialized with the first VF values of
> > + the induction variable. E.g., for an iv with IV_PHI='X' and
> > + evolution S, for a vector of 4 units, we want to compute:
> > + [X, X + S, X + 2*S, X + 3*S]. */
> > +
> > + if (dump_enabled_p ())
> > + dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
> > +
> > + pe = loop_preheader_edge (iv_loop);
> > + /* Find the first insertion point in the BB. */
> > + basic_block bb = gimple_bb (phi);
> > + si = gsi_after_labels (bb);
> > +
> > + init_expr = vect_phi_initial_value (phi);
> > +
> > + gimple_seq stmts = NULL;
> > + vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
> > + step_expr, nunits, vectype,
> > + induction_type);
> > + if (stmts)
> > + {
> > + new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
> > + gcc_assert (!new_bb);
> > + }
> > +
> > + stmts = NULL;
> > + new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
> > + vf, induction_type);
> > + if (stmts)
> > + {
> > + new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
> > + gcc_assert (!new_bb);
> > + }
> > +
> > + vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
> > + new_name, vectype,
> > + induction_type);
> > + /* Create the following def-use cycle:
> > + loop prolog:
> > + vec_init = ...
> > + vec_step = ...
> > + loop:
> > + vec_iv = PHI <vec_init, vec_loop>
> > + ...
> > + STMT
> > + ...
> > + vec_loop = vec_iv + vec_step; */
> > +
> > + /* Create the induction-phi that defines the induction-operand. */
> > + vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
> > + induction_phi = create_phi_node (vec_dest, iv_loop->header);
> > + induc_def = PHI_RESULT (induction_phi);
> > +
> > + /* Create the iv update inside the loop. */
> > + stmts = NULL;
> > + vec_def = vect_update_nonlinear_iv (&stmts, vectype,
> > + induc_def, vec_step,
> > + induction_type);
> > +
> > + gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
> > + new_stmt = SSA_NAME_DEF_STMT (vec_def);
> > +
> > + /* Set the arguments of the phi node: */
> > + add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
> > + add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
> > + UNKNOWN_LOCATION);
> > +
> > + STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
> > + *vec_stmt = induction_phi;
> > +
> > + /* In case that vectorization factor (VF) is bigger than the number
> > + of elements that we can fit in a vectype (nunits), we have to generate
> > + more than one vector stmt - i.e - we need to "unroll" the
> > + vector stmt by a factor VF/nunits. For more details see documentation
> > + in vectorizable_operation. */
> > +
> > + if (ncopies > 1)
> > + {
> > + stmts = NULL;
> > + /* FORNOW. This restriction should be relaxed. */
> > + gcc_assert (!nested_in_vect_loop);
> > +
> > + new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
> > + nunits, induction_type);
> > +
> > + vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
> > + new_name, vectype,
> > + induction_type);
>
> are these not the same as created above?
>
> > + vec_def = induc_def;
> > + for (i = 1; i < ncopies; i++)
> > + {
> > + /* vec_i = vec_prev + vec_step. */
> > + stmts = NULL;
> > + vec_def = vect_update_nonlinear_iv (&stmts, vectype,
> > + vec_def, vec_step,
> > + induction_type);
>
> I think you need to update the PHI latch definition to the last vec_i?
>
> > + gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
> > + new_stmt = SSA_NAME_DEF_STMT (vec_def);
> > + STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> > + }
> > + }
> > +
> > + if (dump_enabled_p ())
> > + dump_printf_loc (MSG_NOTE, vect_location,
> > + "transform induction: created def-use cycle: %G%G",
> > + induction_phi, SSA_NAME_DEF_STMT (vec_def));
> > +
> > + return true;
> > +}
> > +
> > /* Function vectorizable_induction
> >
> > Check if STMT_INFO performs an induction computation that can be vectorized.
> > @@ -8259,6 +8846,8 @@ vectorizable_induction (loop_vec_info loop_vinfo,
> > unsigned i;
> > tree expr;
> > gimple_stmt_iterator si;
> > + enum vect_induction_op_type induction_type
> > + = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
> >
> > gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
> > if (!phi)
> > @@ -8271,6 +8860,11 @@ vectorizable_induction (loop_vec_info loop_vinfo,
> > if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
> > return false;
> >
> > + /* Handle nonlinear induction in a separate place. */
> > + if (induction_type != vect_step_op_add)
> > + return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
> > + vec_stmt, slp_node, cost_vec);
> > +
> > tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> > poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> >
> > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> > index e5fdc9e0a14..0c7f1a32070 100644
> > --- a/gcc/tree-vectorizer.h
> > +++ b/gcc/tree-vectorizer.h
> > @@ -68,6 +68,15 @@ enum vect_def_type {
> > vect_unknown_def_type
> > };
> >
> > +/* Define operation type of linear/non-linear induction variable. */
> > +enum vect_induction_op_type {
> > + vect_step_op_add = 0,
> > + vect_step_op_neg,
> > + vect_step_op_mul,
> > + vect_step_op_shl,
> > + vect_step_op_shr
> > +};
> > +
> > /* Define type of reduction. */
> > enum vect_reduction_type {
> > TREE_CODE_REDUCTION,
> > @@ -1188,6 +1197,7 @@ public:
> > the version here. */
> > tree loop_phi_evolution_base_unchanged;
> > tree loop_phi_evolution_part;
> > + enum vect_induction_op_type loop_phi_evolution_type;
> >
> > /* Used for various bookkeeping purposes, generally holding a pointer to
> > some other stmt S that is in some way "related" to this stmt.
> > @@ -1421,6 +1431,7 @@ struct gather_scatter_info {
> > ((S)->dr_aux.dr && DR_GROUP_FIRST_ELEMENT(S))
> > #define STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED(S) (S)->loop_phi_evolution_base_unchanged
> > #define STMT_VINFO_LOOP_PHI_EVOLUTION_PART(S) (S)->loop_phi_evolution_part
> > +#define STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE(S) (S)->loop_phi_evolution_type
> > #define STMT_VINFO_MIN_NEG_DIST(S) (S)->min_neg_dist
> > #define STMT_VINFO_REDUC_TYPE(S) (S)->reduc_type
> > #define STMT_VINFO_REDUC_CODE(S) (S)->reduc_code
> > --
> > 2.18.1
> >
--
BR,
Hongtao
^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH V2] Extend vectorizer to handle nonlinear induction for neg, mul/lshift/rshift with a constant.
2022-08-04 8:18 ` Richard Biener
2022-08-04 10:09 ` Richard Sandiford
2022-08-05 5:21 ` Hongtao Liu
@ 2022-08-29 5:24 ` liuhongt
2022-08-29 5:24 ` liuhongt
2022-09-05 10:58 ` Richard Biener
2 siblings, 2 replies; 7+ messages in thread
From: liuhongt @ 2022-08-29 5:24 UTC (permalink / raw)
To: gcc-patches; +Cc: crazylht, hjl.tools
>Looks good overall - a few comments inline. Also can you please add
>SLP support?
>I've tried hard to fill in gaps where SLP support is missing since my
>goal is still to get
>rid of non-SLP.
For SLP with different induction types, the lanes need separate iv updates and
a vector permutation.  If they all have the same induction type, the iv can
be updated with one instruction and without a permutation.
I'll add an incremental patch for that.
>gcc_assert (is_a <gphi *> (loop_phi_node));
>is shorter
It looks like it doesn't accept a gphi*, only a gimple*.  Since it's
already a gphi* here, I've removed the assert.
>I think you should use
>init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node,loop_preheader_edge (loop));
>and loop_latch_edge (loop) for ev_expr, you can't rely on them being arg 0 / 1.
Changed.
>Likewise. Use preheader/latch edge.
Changed.
>and those two should then go away.
Removed.
>is not vectorized? I think it should be possible to relax
>this.
Relaxed.
>def is never NULL so a cheaper way to write this is
> || ((def = SSA_NAME_DEF_STMT (ev_expr)), true)
Changed.
>not sure if we need to bother here - ideally vectorizable_live_operation would
>give up instead (but I suppose the regular IV / induction analysis gives up
>here as well?)
Removed.
>the above can at least go into the combined switch case
Changed.
>Seeing this - did you check whether you handle prologue peeling correctly? The
>same issue might show up with a vectorized epilogue. I think you can force a
>peeled prologue with storing unaligned and -fno-vect-cost-model (that IIRC will
>simply optimize for the larger number of aligned memory ops)
Updated in vect_update_ivs_after_vectorizer; peeling for the unaligned cases is now supported as well.
>since you only handle inner loop nonlinear IVs you should probably
>swap the two checks?
Changed.
>There might be a more canonical way to build the series expr
build_vec_series doesn't add a stmt to the sequence, so I'll still keep VEC_SERIES_EXPR here?
>use types_compatible_p (...) instead of comparing TYPE_CANONICAL.
>A small enhancement would be to support different signedness
>(use tree_nop_conversion_p then).
Support different signedness.
>above you asserted that the conversion is only necessary for constants
>but then fold_convert will also generate a tree NOP_EXPR for
>some types_compatible_p types. So maybe only do this for INTEGER_CST
>init_expr or use init_expr = gimple_convert (...) and insert required stmts
>on the preheader.
Changed.
>Alternatively you could perform the vector IV updates in an unsigned type?
Changed.
>why's that necessary? can we do a MIN (vector_step, { prec-1, prec-1,
>prec-1 ... })
It's true for ashr, but not for ashl or lshr.  For the latter two, when vector_step >= precision
the result should be zero instead of a shift by prec - 1.
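(Concretely, for a 32-bit int: an ashr iv sticks at 0 or -1 once everything
has been shifted out, and a shift by prec - 1 gives that same value, so the
clamp is fine there.  A shl or unsigned lshr iv becomes exactly 0 at that
point, while a clamped shift by prec - 1 is in general nonzero.)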
>> + new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
>> + nunits, induction_type);
>> +
>> + vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
>> + new_name, vectype,
>> + induction_type);
>are these not the same as created above?
They are different: the first one is based on vf, this one on nunits.  vf can be a multiple
of nunits, which is exactly the case this code handles, and the PHI latch is updated at the earlier vf-based step.
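(E.g. for V4SI with ncopies == 2, so vf == 8: the latch update advances the
PHI by pow (c, 8) per vector iteration, while the per-copy step here uses
pow (c, 4) to form the second vector of the pair.)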
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
For neg, the patch creates a vec_init as [ a, -a, a, -a, ... ] and no
vec_step is needed to update the vectorized iv, since vf is always a multiple
of 2 (negative * negative is positive).
For shift, the patch creates a vec_init as [ a, a >> c, a >> 2*c, .. ]
and a vec_step as [ c * nunits, c * nunits, c * nunits, ... ]; the vectorized iv is
updated as vec_def = vec_init >>/<< vec_step.
For mul, the patch creates a vec_init as [ a, a * c, a * pow(c, 2), .. ]
and a vec_step as [ pow(c, nunits), pow(c, nunits), ... ]; the iv is updated as vec_def =
vec_init * vec_step.
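(For example, with vf == 4 and c == 3 this gives vec_init = { a, a*3, a*9, a*27 }
and vec_step = { 81, 81, 81, 81 }, so each vector iteration multiplies the iv
by 81 == pow (3, 4).)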
The patch handles nonlinear ivs for
1. Integer types only; floating point is not handled.
2. No slp_node.
3. iv_loop should be the same as the vector loop, not a nested loop.
4. No UD is created: for mul, unsigned mult is used to avoid UD; for
shift, the shift count should be less than the type precision.
gcc/ChangeLog:
PR tree-optimization/103144
* tree-vect-loop.cc (vect_is_nonlinear_iv_evolution): New function.
(vect_analyze_scalar_cycles_1): Detect nonlinear iv by upper function.
(vect_create_nonlinear_iv_init): New function.
(vect_peel_nonlinear_iv_init): Ditto.
(vect_create_nonlinear_iv_step): Ditto
(vect_create_nonlinear_iv_vec_step): Ditto
(vect_update_nonlinear_iv): Ditto
(vectorizable_nonlinear_induction): Ditto.
(vectorizable_induction): Call
vectorizable_nonlinear_induction when induction_type is not
vect_step_op_add.
* tree-vect-loop-manip.cc (vect_update_ivs_after_vectorizer):
Update nonlinear iv for epilogue loop.
* tree-vectorizer.h (enum vect_induction_op_type): New enum.
(STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE): New Macro.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr103144-mul-1.c: New test.
* gcc.target/i386/pr103144-mul-2.c: New test.
* gcc.target/i386/pr103144-neg-1.c: New test.
* gcc.target/i386/pr103144-neg-2.c: New test.
* gcc.target/i386/pr103144-shift-1.c: New test.
* gcc.target/i386/pr103144-shift-2.c: New test.
---
.../gcc.target/i386/pr103144-mul-1.c | 51 ++
.../gcc.target/i386/pr103144-mul-2.c | 51 ++
.../gcc.target/i386/pr103144-neg-1.c | 51 ++
.../gcc.target/i386/pr103144-neg-2.c | 44 ++
.../gcc.target/i386/pr103144-shift-1.c | 70 ++
.../gcc.target/i386/pr103144-shift-2.c | 79 ++
gcc/tree-vect-loop-manip.cc | 37 +-
gcc/tree-vect-loop.cc | 678 +++++++++++++++++-
gcc/tree-vectorizer.h | 15 +
9 files changed, 1062 insertions(+), 14 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-neg-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-neg-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-shift-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-shift-2.c
diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
new file mode 100644
index 00000000000..640c34fd959
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
@@ -0,0 +1,51 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 4 "vect" } } */
+
+#define N 10000
+
+void
+__attribute__((noipa))
+foo_mul (int* a, int b)
+{
+ for (int i = 0; i != N; i++)
+ {
+ a[i] = b;
+ b *= 3;
+ }
+}
+
+void
+__attribute__((noipa))
+foo_mul_const (int* a)
+{
+ int b = 1;
+ for (int i = 0; i != N; i++)
+ {
+ a[i] = b;
+ b *= 3;
+ }
+}
+
+void
+__attribute__((noipa))
+foo_mul_peel (int* a, int b)
+{
+ for (int i = 0; i != 39; i++)
+ {
+ a[i] = b;
+ b *= 3;
+ }
+}
+
+void
+__attribute__((noipa))
+foo_mul_peel_const (int* a)
+{
+ int b = 1;
+ for (int i = 0; i != 39; i++)
+ {
+ a[i] = b;
+ b *= 3;
+ }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
new file mode 100644
index 00000000000..39fdea3a69d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
@@ -0,0 +1,51 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx2 } */
+
+#include "avx2-check.h"
+#include <string.h>
+#include "pr103144-mul-1.c"
+
+typedef int v8si __attribute__((vector_size(32)));
+
+void
+avx2_test (void)
+{
+ int* epi32_exp = (int*) malloc (N * sizeof (int));
+ int* epi32_dst = (int*) malloc (N * sizeof (int));
+
+ __builtin_memset (epi32_exp, 0, N * sizeof (int));
+ int b = 8;
+ v8si init = __extension__(v8si) { b, b * 3, b * 9, b * 27, b * 81, b * 243, b * 729, b * 2187 };
+
+ for (int i = 0; i != N / 8; i++)
+ {
+ memcpy (epi32_exp + i * 8, &init, 32);
+ init *= 6561;
+ }
+
+ foo_mul (epi32_dst, b);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ foo_mul_peel (epi32_dst, b);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * 4) != 0)
+ __builtin_abort ();
+
+ init = __extension__(v8si) { 1, 3, 9, 27, 81, 243, 729, 2187 };
+ for (int i = 0; i != N / 8; i++)
+ {
+ memcpy (epi32_exp + i * 8, &init, 32);
+ init *= 6561;
+ }
+
+ foo_mul_const (epi32_dst);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ foo_mul_peel_const (epi32_dst);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * 4) != 0)
+ __builtin_abort ();
+
+ return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c b/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c
new file mode 100644
index 00000000000..f87b1d6e529
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c
@@ -0,0 +1,51 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 4 "vect" } } */
+
+#define N 10000
+
+void
+__attribute__((noipa))
+foo_neg (int* a, int b)
+{
+ for (int i = 0; i != N; i++)
+ {
+ a[i] = b;
+ b = -b;
+ }
+}
+
+void
+__attribute__((noipa))
+foo_neg_const (int* a)
+{
+ int b = 1;
+ for (int i = 0; i != N; i++)
+ {
+ a[i] = b;
+ b = -b;
+ }
+}
+
+void
+__attribute__((noipa))
+foo_neg_peel (int* a, int b, int n)
+{
+ for (int i = 0; i != n; i++)
+ {
+ a[i] = b;
+ b = -b;
+ }
+}
+
+void
+__attribute__((noipa))
+foo_neg_const_peel (int* a, int n)
+{
+ int b = 1;
+ for (int i = 0; i != n; i++)
+ {
+ a[i] = b;
+ b = -b;
+ }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c b/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c
new file mode 100644
index 00000000000..bb8c22b9f9e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c
@@ -0,0 +1,44 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx2 } */
+
+#include "avx2-check.h"
+#include <string.h>
+#include "pr103144-neg-1.c"
+
+void
+avx2_test (void)
+{
+ int* epi32_exp = (int*) malloc (N * sizeof (int));
+ int* epi32_dst = (int*) malloc (N * sizeof (int));
+ long long* epi64_exp = (long long*) malloc (N * sizeof (int));
+
+ __builtin_memset (epi32_exp, 0, N * sizeof (int));
+ int b = 100;
+
+ for (int i = 0; i != N / 2; i++)
+ epi64_exp[i] = ((long long) b) | (((long long) -b) << 32);
+
+ memcpy (epi32_exp, epi64_exp, N * sizeof (int));
+ foo_neg (epi32_dst, b);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ foo_neg_peel (epi32_dst, b, 39);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ for (int i = 0; i != N / 2; i++)
+ epi64_exp[i] = ((long long) 1) | (((long long) -1) << 32);
+
+ memcpy (epi32_exp, epi64_exp, N * sizeof (int));
+ foo_neg_const (epi32_dst);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ foo_neg_const_peel (epi32_dst, 39);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c b/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c
new file mode 100644
index 00000000000..2a6920350dd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c
@@ -0,0 +1,70 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
+
+#define N 10000
+void
+__attribute__((noipa))
+foo_shl (int* a, int b)
+{
+ for (int i = 0; i != N; i++)
+ {
+ a[i] = b;
+ b <<= 1;
+ }
+}
+
+void
+__attribute__((noipa))
+foo_ashr (int* a, int b)
+{
+ for (int i = 0; i != N; i++)
+ {
+ a[i] = b;
+ b >>= 1;
+ }
+}
+
+void
+__attribute__((noipa))
+foo_lshr (unsigned int* a, unsigned int b)
+{
+ for (int i = 0; i != N; i++)
+ {
+ a[i] = b;
+ b >>= 1U;
+ }
+}
+
+void
+__attribute__((noipa))
+foo_shl_peel (int* a, int b)
+{
+ for (int i = 0; i != 39; i++)
+ {
+ a[i] = b;
+ b <<= 1;
+ }
+}
+
+void
+__attribute__((noipa))
+foo_ashr_peel (int* a, int b)
+{
+ for (int i = 0; i != 39; i++)
+ {
+ a[i] = b;
+ b >>= 1;
+ }
+}
+
+void
+__attribute__((noipa))
+foo_lshr_peel (unsigned int* a, unsigned int b)
+{
+ for (int i = 0; i != 39; i++)
+ {
+ a[i] = b;
+ b >>= 1U;
+ }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c b/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c
new file mode 100644
index 00000000000..6f477191d96
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c
@@ -0,0 +1,79 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx2 } */
+
+#include "avx2-check.h"
+#include <string.h>
+#include "pr103144-shift-1.c"
+
+typedef int v8si __attribute__((vector_size(32)));
+typedef unsigned int v8usi __attribute__((vector_size(32)));
+
+void
+avx2_test (void)
+{
+ int* epi32_exp = (int*) malloc (N * sizeof (int));
+ int* epi32_dst = (int*) malloc (N * sizeof (int));
+ unsigned int* epu32_exp = (unsigned int*) malloc (N * sizeof (int));
+ unsigned int* epu32_dst = (unsigned int*) malloc (N * sizeof (int));
+
+ __builtin_memset (epi32_exp, 0, N * sizeof (int));
+ int b = 8;
+ v8si init = __extension__(v8si) { b, b << 1, b << 2, b << 3, b << 4, b << 5, b << 6, b << 7 };
+
+ for (int i = 0; i != N / 8; i++)
+ {
+ memcpy (epi32_exp + i * 8, &init, 32);
+ init <<= 8;
+ }
+
+ foo_shl (epi32_dst, b);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ foo_shl_peel (epi32_dst, b);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ b = -11111;
+ init = __extension__(v8si) { b, b >> 1, b >> 2, b >> 3, b >> 4, b >> 5, b >> 6, b >> 7 };
+ for (int i = 0; i != N / 8; i++)
+ {
+ memcpy (epi32_exp + i * 8, &init, 32);
+ init >>= 8;
+ }
+
+ foo_ashr (epi32_dst, b);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ foo_ashr_peel (epi32_dst, b);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * sizeof (int)) != 0)
+ {
+ for (int i = 0; i != 39; i++)
+ {
+ printf ("epi32_dst[%d] is %d ----", i, epi32_dst[i]);
+ printf ("epi32_exp[%d] is %d\n", i, epi32_exp[i]);
+ }
+ __builtin_abort ();
+ }
+
+ __builtin_memset (epu32_exp, 0, N * sizeof (int));
+ unsigned int c = 11111111;
+ v8usi initu = __extension__(v8usi) { c, c >> 1U, c >> 2U, c >> 3U, c >> 4U, c >> 5U, c >> 6U, c >> 7U };
+ for (int i = 0; i != N / 8; i++)
+ {
+ memcpy (epu32_exp + i * 8, &initu, 32);
+ initu >>= 8U;
+ }
+
+ foo_lshr (epu32_dst, c);
+ if (__builtin_memcmp (epu32_dst, epu32_exp, N * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ foo_lshr_peel (epu32_dst, c);
+ if (__builtin_memcmp (epu32_dst, epu32_exp, 39 * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ return;
+}
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 86d2264054a..fc7901d8a8a 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -1559,15 +1559,28 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
gcc_assert (!tree_is_chrec (step_expr));
init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
+ gimple_seq stmts = NULL;
+ enum vect_induction_op_type induction_type
+ = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
- off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
- fold_convert (TREE_TYPE (step_expr), niters),
- step_expr);
- if (POINTER_TYPE_P (type))
- ni = fold_build_pointer_plus (init_expr, off);
+ if (induction_type == vect_step_op_add)
+ {
+ off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
+ fold_convert (TREE_TYPE (step_expr), niters),
+ step_expr);
+ if (POINTER_TYPE_P (type))
+ ni = fold_build_pointer_plus (init_expr, off);
+ else
+ ni = fold_build2 (PLUS_EXPR, type,
+ init_expr, fold_convert (type, off));
+ }
+ /* Don't bother calling vect_peel_nonlinear_iv_init. */
+ else if (induction_type == vect_step_op_neg)
+ ni = init_expr;
else
- ni = fold_build2 (PLUS_EXPR, type,
- init_expr, fold_convert (type, off));
+ ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
+ niters, step_expr,
+ induction_type);
var = create_tmp_var (type, "tmp");
@@ -1576,9 +1589,15 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
ni_name = force_gimple_operand (ni, &new_stmts, false, var);
/* Exit_bb shouldn't be empty. */
if (!gsi_end_p (last_gsi))
- gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
+ {
+ gsi_insert_seq_after (&last_gsi, stmts, GSI_SAME_STMT);
+ gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
+ }
else
- gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
+ {
+ gsi_insert_seq_before (&last_gsi, stmts, GSI_SAME_STMT);
+ gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
+ }
/* Fix phi expressions in the successor bb. */
adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 2257b29a652..c2293466c1c 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -425,6 +425,77 @@ vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
return true;
}
+/* Function vect_is_nonlinear_iv_evolution
+
+ Only support nonlinear induction for integer type
+ 1. neg
+ 2. mul by constant
+ 3. lshift/rshift by constant.
+
+ For neg induction, return a fake step as integer -1. */
+static bool
+vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
+ gphi* loop_phi_node, tree *init, tree *step)
+{
+ tree init_expr, ev_expr, result, op1, op2;
+ gimple* def;
+
+ if (gimple_phi_num_args (loop_phi_node) != 2)
+ return false;
+
+ init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
+ ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
+
+ /* Support nonlinear induction only for integer type. */
+ if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
+ return false;
+
+ *init = init_expr;
+ result = PHI_RESULT (loop_phi_node);
+
+ if (TREE_CODE (ev_expr) != SSA_NAME
+ || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
+ || !is_gimple_assign (def))
+ return false;
+
+ enum tree_code t_code = gimple_assign_rhs_code (def);
+ switch (t_code)
+ {
+ case NEGATE_EXPR:
+ if (gimple_assign_rhs1 (def) != result)
+ return false;
+ *step = build_int_cst (TREE_TYPE (init_expr), -1);
+ STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
+ break;
+
+ case RSHIFT_EXPR:
+ case LSHIFT_EXPR:
+ case MULT_EXPR:
+ op1 = gimple_assign_rhs1 (def);
+ op2 = gimple_assign_rhs2 (def);
+ if (TREE_CODE (op2) != INTEGER_CST
+ || op1 != result)
+ return false;
+ *step = op2;
+ if (t_code == LSHIFT_EXPR)
+ STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
+ else if (t_code == RSHIFT_EXPR)
+ STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
+ /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
+ else
+ STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
+ break;
+
+ default:
+ return false;
+ }
+
+ STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
+ STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
+
+ return true;
+}
+
/* Return true if PHI, described by STMT_INFO, is the inner PHI in
what we are assuming is a double reduction. For example, given
a structure like this:
@@ -512,11 +583,16 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
= evolution_part_in_loop_num (access_fn, loop->num);
}
- if (!access_fn
- || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
- || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
- || (LOOP_VINFO_LOOP (loop_vinfo) != loop
- && TREE_CODE (step) != INTEGER_CST))
+ if ((!access_fn
+ || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
+ || !vect_is_simple_iv_evolution (loop->num, access_fn,
+ &init, &step)
+ || (LOOP_VINFO_LOOP (loop_vinfo) != loop
+ && TREE_CODE (step) != INTEGER_CST))
+ /* Only handle nonlinear iv for same loop. */
+ && (LOOP_VINFO_LOOP (loop_vinfo) != loop
+ || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
+ phi, &init, &step)))
{
worklist.safe_push (stmt_vinfo);
continue;
@@ -8229,6 +8305,591 @@ vect_can_vectorize_without_simd_p (code_helper code)
&& vect_can_vectorize_without_simd_p (tree_code (code)));
}
+/* Create vector init for vectorized iv. */
+static tree
+vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
+ tree step_expr, poly_uint64 nunits,
+ tree vectype,
+ enum vect_induction_op_type induction_type)
+{
+ unsigned HOST_WIDE_INT const_nunits;
+ tree vec_shift, vec_init, new_name;
+ unsigned i;
+ tree itype = TREE_TYPE (vectype);
+
+ /* iv_loop is the loop to be vectorized. Create the first nunits elements
+ of the IV, e.g. [X, X*S, X*pow(S,2), X*pow(S,3)] for mul (S = step_expr, X = init_expr). */
+ new_name = gimple_convert (stmts, itype, init_expr);
+ switch (induction_type)
+ {
+ case vect_step_op_shr:
+ case vect_step_op_shl:
+ /* Build the initial value from shift_expr. */
+ vec_init = gimple_build_vector_from_val (stmts,
+ vectype,
+ new_name);
+ vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
+ build_zero_cst (itype), step_expr);
+ vec_init = gimple_build (stmts,
+ (induction_type == vect_step_op_shr
+ ? RSHIFT_EXPR : LSHIFT_EXPR),
+ vectype, vec_init, vec_shift);
+ break;
+
+ case vect_step_op_neg:
+ {
+ vec_init = gimple_build_vector_from_val (stmts,
+ vectype,
+ new_name);
+ tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
+ vectype, vec_init);
+ /* The encoding has 2 interleaved stepped patterns. */
+ vec_perm_builder sel (nunits, 2, 3);
+ sel.quick_grow (6);
+ for (i = 0; i < 3; i++)
+ {
+ sel[2 * i] = i;
+ sel[2 * i + 1] = i + nunits;
+ }
+ vec_perm_indices indices (sel, 2, nunits);
+ tree perm_mask_even
+ = vect_gen_perm_mask_checked (vectype, indices);
+ vec_init = gimple_build (stmts, VEC_PERM_EXPR,
+ vectype,
+ vec_init, vec_neg,
+ perm_mask_even);
+ }
+ break;
+
+ case vect_step_op_mul:
+ {
+ /* Use unsigned mult to avoid UD integer overflow. */
+ gcc_assert (nunits.is_constant (&const_nunits));
+ tree utype = unsigned_type_for (itype);
+ tree uvectype = build_vector_type (utype,
+ TYPE_VECTOR_SUBPARTS (vectype));
+ new_name = gimple_convert (stmts, utype, new_name);
+ vec_init = gimple_build_vector_from_val (stmts,
+ uvectype,
+ new_name);
+ tree_vector_builder elts (uvectype, const_nunits, 1);
+ tree elt_step = build_one_cst (utype);
+
+ elts.quick_push (elt_step);
+ for (i = 1; i < const_nunits; i++)
+ {
+ /* Create: elt_step_i = elt_step_(i-1) * step_expr. */
+ elt_step = gimple_build (stmts, MULT_EXPR,
+ utype, elt_step, step_expr);
+ elts.quick_push (elt_step);
+ }
+ /* Create a vector from [new_name_0, new_name_1, ...,
+ new_name_nunits-1]. */
+ tree vec_mul = gimple_build_vector (stmts, &elts);
+ vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
+ vec_init, vec_mul);
+ vec_init = gimple_convert (stmts, vectype, vec_init);
+ }
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ return vec_init;
+}
+
+/* Peel init_expr by skip_niter for induction_type. */
+tree
+vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
+ tree skip_niters, tree step_expr,
+ enum vect_induction_op_type induction_type)
+{
+ gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
+ tree type = TREE_TYPE (init_expr);
+ unsigned prec = TYPE_PRECISION (type);
+ switch (induction_type)
+ {
+ case vect_step_op_neg:
+ if (TREE_INT_CST_LOW (skip_niters) % 2)
+ init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
+ /* else no change. */
+ break;
+
+ case vect_step_op_shr:
+ case vect_step_op_shl:
+ skip_niters = gimple_convert (stmts, type, skip_niters);
+ step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
+ /* When shift amount >= precision, need to avoid UD.
+ In the original loop, there's no UD, and according to semantics,
+ init_expr should be 0 for lshr and ashl, and >>= (prec - 1) for ashr. */
+ if (!tree_fits_uhwi_p (step_expr)
+ || tree_to_uhwi (step_expr) >= prec)
+ {
+ if (induction_type == vect_step_op_shl
+ || TYPE_UNSIGNED (type))
+ init_expr = build_zero_cst (type);
+ else
+ init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
+ init_expr,
+ wide_int_to_tree (type, prec - 1));
+ }
+ else
+ init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
+ ? RSHIFT_EXPR : LSHIFT_EXPR),
+ type, init_expr, step_expr);
+ break;
+
+ case vect_step_op_mul:
+ {
+ tree utype = unsigned_type_for (type);
+ init_expr = gimple_convert (stmts, utype, init_expr);
+ unsigned skipn = TREE_INT_CST_LOW (skip_niters);
+ wide_int begin = wi::to_wide (step_expr);
+ for (unsigned i = 0; i != skipn - 1; i++)
+ begin = wi::mul (begin, wi::to_wide (step_expr));
+ tree mult_expr = wide_int_to_tree (utype, begin);
+ init_expr = gimple_build (stmts, MULT_EXPR, utype, init_expr, mult_expr);
+ init_expr = gimple_convert (stmts, type, init_expr);
+ }
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ return init_expr;
+}
+
+/* Create vector step for vectorized iv. */
+static tree
+vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
+ poly_uint64 vf,
+ enum vect_induction_op_type induction_type)
+{
+ tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
+ tree new_name = NULL;
+ /* Step should be pow (step, vf) for mult induction. */
+ if (induction_type == vect_step_op_mul)
+ {
+ gcc_assert (vf.is_constant ());
+ wide_int begin = wi::to_wide (step_expr);
+
+ for (unsigned i = 0; i != vf.to_constant () - 1; i++)
+ begin = wi::mul (begin, wi::to_wide (step_expr));
+
+ new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
+ }
+ else if (induction_type == vect_step_op_neg)
+ /* Do nothing. */
+ ;
+ else
+ new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
+ expr, step_expr);
+ return new_name;
+}
+
+static tree
+vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
+ stmt_vec_info stmt_info,
+ tree new_name, tree vectype,
+ enum vect_induction_op_type induction_type)
+{
+ /* No step is needed for neg induction. */
+ if (induction_type == vect_step_op_neg)
+ return NULL;
+
+ tree t = unshare_expr (new_name);
+ gcc_assert (CONSTANT_CLASS_P (new_name)
+ || TREE_CODE (new_name) == SSA_NAME);
+ tree new_vec = build_vector_from_val (vectype, t);
+ tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
+ new_vec, vectype, NULL);
+ return vec_step;
+}
+
+/* Update vectorized iv with vect_step, induc_def is init. */
+static tree
+vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
+ tree induc_def, tree vec_step,
+ enum vect_induction_op_type induction_type)
+{
+ tree vec_def = induc_def;
+ switch (induction_type)
+ {
+ case vect_step_op_mul:
+ {
+ /* Use unsigned mult to avoid UD integer overflow. */
+ tree uvectype
+ = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
+ TYPE_VECTOR_SUBPARTS (vectype));
+ vec_def = gimple_convert (stmts, uvectype, vec_def);
+ vec_step = gimple_convert (stmts, uvectype, vec_step);
+ vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
+ vec_def, vec_step);
+ vec_def = gimple_convert (stmts, vectype, vec_def);
+ }
+ break;
+
+ case vect_step_op_shr:
+ vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
+ vec_def, vec_step);
+ break;
+
+ case vect_step_op_shl:
+ vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
+ vec_def, vec_step);
+ break;
+ case vect_step_op_neg:
+ vec_def = induc_def;
+ /* Do nothing. */
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ return vec_def;
+}
+
+/* Function vectorizable_nonlinear_induction
+
+ Check if STMT_INFO performs a nonlinear induction computation that can be
+ vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
+ a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
+ basic block.
+ Return true if STMT_INFO is vectorizable in this way. */
+
+static bool
+vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
+ stmt_vec_info stmt_info,
+ gimple **vec_stmt, slp_tree slp_node,
+ stmt_vector_for_cost *cost_vec)
+{
+ class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+ unsigned ncopies;
+ bool nested_in_vect_loop = false;
+ class loop *iv_loop;
+ tree vec_def;
+ edge pe = loop_preheader_edge (loop);
+ basic_block new_bb;
+ tree vec_init, vec_step;
+ tree new_name;
+ gimple *new_stmt;
+ gphi *induction_phi;
+ tree induc_def, vec_dest;
+ tree init_expr, step_expr;
+ tree niters_skip;
+ poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+ unsigned i;
+ gimple_stmt_iterator si;
+
+ gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
+
+ tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+ poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
+ enum vect_induction_op_type induction_type
+ = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
+
+ gcc_assert (induction_type > vect_step_op_add);
+
+ if (slp_node)
+ ncopies = 1;
+ else
+ ncopies = vect_get_num_copies (loop_vinfo, vectype);
+ gcc_assert (ncopies >= 1);
+
+ /* FORNOW. Only handle nonlinear induction in the same loop. */
+ if (nested_in_vect_loop_p (loop, stmt_info))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "nonlinear induction in nested loop.\n");
+ return false;
+ }
+
+ iv_loop = loop;
+ gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
+
+ /* TODO: Support slp for nonlinear iv. There should be separate vector iv
+ update for each iv and a permutation to generate wanted vector iv. */
+ if (slp_node)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "SLP induction not supported for nonlinear"
+ " induction.\n");
+ return false;
+ }
+
+ /* Init_expr will be updated by vect_update_ivs_after_vectorizer,
+ if niters is unknown:
+ For shift, when shift amount >= precision, there would be UD.
+ For mult, we don't know how to generate
+ init_expr * pow (step, niters) for variable niters.
+ For neg, it should be ok, since niters of the vectorized main loop
+ will always be a multiple of 2. */
+ if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && induction_type != vect_step_op_neg)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Peeling for epilogue is not supported"
+ " for nonlinear induction except neg"
+ " when iteration count is unknown.\n");
+ return false;
+ }
+
+ /* Also doesn't support peeling for neg when niters is variable.
+ ??? generate something like niter_expr & 1 ? init_expr : -init_expr? */
+ niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+ if (niters_skip != NULL_TREE
+ && TREE_CODE (niters_skip) != INTEGER_CST)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Peeling for alignement is not supported"
+ " for nonlinear induction when niters_skip"
+ " is not constant.\n");
+ return false;
+ }
+
+ if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && induction_type == vect_step_op_mul)
+ if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "floating point nonlinear induction vectorization"
+ " not supported.\n");
+ return false;
+ }
+
+ step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
+ init_expr = vect_phi_initial_value (phi);
+ gcc_assert (step_expr != NULL_TREE && init_expr != NULL
+ && TREE_CODE (step_expr) == INTEGER_CST);
+ /* step_expr should have the same type as init_expr,
+ e.g. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used. */
+ step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
+
+ if (TREE_CODE (init_expr) == INTEGER_CST)
+ init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
+ else
+ gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
+ TREE_TYPE (init_expr)));
+
+ switch (induction_type)
+ {
+ case vect_step_op_neg:
+ if (TREE_CODE (init_expr) != INTEGER_CST
+ && TREE_CODE (init_expr) != REAL_CST)
+ {
+ /* Check for backend support of NEGATE_EXPR and vec_perm. */
+ if (!directly_supported_p (NEGATE_EXPR, vectype))
+ return false;
+
+ /* The encoding has 2 interleaved stepped patterns. */
+ vec_perm_builder sel (nunits, 2, 3);
+ machine_mode mode = TYPE_MODE (vectype);
+ sel.quick_grow (6);
+ for (i = 0; i < 3; i++)
+ {
+ sel[i * 2] = i;
+ sel[i * 2 + 1] = i + nunits;
+ }
+ vec_perm_indices indices (sel, 2, nunits);
+ if (!can_vec_perm_const_p (mode, mode, indices))
+ return false;
+ }
+ break;
+
+ case vect_step_op_mul:
+ {
+ /* Check for backend support of MULT_EXPR. */
+ if (!directly_supported_p (MULT_EXPR, vectype))
+ return false;
+
+ /* ??? How to construct the vector step for variable-length vectors,
+ [ 1, step, pow (step, 2), pow (step, 3), .. ]. */
+ if (!vf.is_constant ())
+ return false;
+ }
+ break;
+
+ case vect_step_op_shr:
+ /* Check for backend support of RSHIFT_EXPR. */
+ if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
+ return false;
+
+ /* Don't shift more than type precision to avoid UD. */
+ if (!tree_fits_uhwi_p (step_expr)
+ || maybe_ge (nunits * tree_to_uhwi (step_expr),
+ TYPE_PRECISION (TREE_TYPE (init_expr))))
+ return false;
+ break;
+
+ case vect_step_op_shl:
+ /* Check for backend support of LSHIFT_EXPR. */
+ if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
+ return false;
+
+ /* Don't shift more than type precision to avoid UD. */
+ if (!tree_fits_uhwi_p (step_expr)
+ || maybe_ge (nunits * tree_to_uhwi (step_expr),
+ TYPE_PRECISION (TREE_TYPE (init_expr))))
+ return false;
+
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ if (!vec_stmt) /* transformation not required. */
+ {
+ unsigned inside_cost = 0, prologue_cost = 0;
+ /* loop cost for vec_loop, i.e. the cost of the vector iv update
+ inside the loop body. */
+ inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
+ stmt_info, 0, vect_body);
+
+ /* Neg induction doesn't have any inside_cost since no iv update
+ statement is needed inside the loop. */
+ if (induction_type == vect_step_op_neg)
+ inside_cost = 0;
+
+ /* prologue cost for vec_init and vec_step. */
+ prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
+ stmt_info, 0, vect_prologue);
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "vect_model_induction_cost: inside_cost = %d, "
+ "prologue_cost = %d. \n", inside_cost,
+ prologue_cost);
+
+ STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
+ DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
+ return true;
+ }
+
+ /* Transform. */
+
+ /* Compute a vector variable, initialized with the first VF values of
+ the induction variable. E.g., for an iv with IV_PHI='X' and
+ evolution S, for a vector of 4 units, we want to compute:
+ [X, X + S, X + 2*S, X + 3*S]. */
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
+
+ pe = loop_preheader_edge (iv_loop);
+ /* Find the first insertion point in the BB. */
+ basic_block bb = gimple_bb (phi);
+ si = gsi_after_labels (bb);
+
+ gimple_seq stmts = NULL;
+
+ /* If we are using the loop mask to "peel" for alignment then we need
+ to adjust the start value here. */
+ if (niters_skip != NULL_TREE)
+ init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
+ step_expr, induction_type);
+
+ vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
+ step_expr, nunits, vectype,
+ induction_type);
+ if (stmts)
+ {
+ new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
+ gcc_assert (!new_bb);
+ }
+
+ stmts = NULL;
+ new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
+ vf, induction_type);
+ if (stmts)
+ {
+ new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
+ gcc_assert (!new_bb);
+ }
+
+ vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
+ new_name, vectype,
+ induction_type);
+ /* Create the following def-use cycle:
+ loop prolog:
+ vec_init = ...
+ vec_step = ...
+ loop:
+ vec_iv = PHI <vec_init, vec_loop>
+ ...
+ STMT
+ ...
+ vec_loop = vec_iv + vec_step; */
+
+ /* Create the induction-phi that defines the induction-operand. */
+ vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
+ induction_phi = create_phi_node (vec_dest, iv_loop->header);
+ induc_def = PHI_RESULT (induction_phi);
+
+ /* Create the iv update inside the loop. */
+ stmts = NULL;
+ vec_def = vect_update_nonlinear_iv (&stmts, vectype,
+ induc_def, vec_step,
+ induction_type);
+
+ gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
+ new_stmt = SSA_NAME_DEF_STMT (vec_def);
+
+ /* Set the arguments of the phi node: */
+ add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
+ add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
+ UNKNOWN_LOCATION);
+
+ STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
+ *vec_stmt = induction_phi;
+
+ /* In case that vectorization factor (VF) is bigger than the number
+ of elements that we can fit in a vectype (nunits), we have to generate
+ more than one vector stmt - i.e - we need to "unroll" the
+ vector stmt by a factor VF/nunits. For more details see documentation
+ in vectorizable_operation. */
+
+ if (ncopies > 1)
+ {
+ stmts = NULL;
+ /* FORNOW. This restriction should be relaxed. */
+ gcc_assert (!nested_in_vect_loop);
+
+ new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
+ nunits, induction_type);
+
+ vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
+ new_name, vectype,
+ induction_type);
+ vec_def = induc_def;
+ for (i = 1; i < ncopies; i++)
+ {
+ /* vec_i = vec_prev op vec_step (op depends on induction_type). */
+ stmts = NULL;
+ vec_def = vect_update_nonlinear_iv (&stmts, vectype,
+ vec_def, vec_step,
+ induction_type);
+ gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
+ new_stmt = SSA_NAME_DEF_STMT (vec_def);
+ STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+ }
+ }
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "transform induction: created def-use cycle: %G%G",
+ induction_phi, SSA_NAME_DEF_STMT (vec_def));
+
+ return true;
+}
+
/* Function vectorizable_induction
Check if STMT_INFO performs an induction computation that can be vectorized.
@@ -8259,6 +8920,8 @@ vectorizable_induction (loop_vec_info loop_vinfo,
unsigned i;
tree expr;
gimple_stmt_iterator si;
+ enum vect_induction_op_type induction_type
+ = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
if (!phi)
@@ -8271,6 +8934,11 @@ vectorizable_induction (loop_vec_info loop_vinfo,
if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
return false;
+ /* Handle nonlinear induction in a separate place. */
+ if (induction_type != vect_step_op_add)
+ return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
+ vec_stmt, slp_node, cost_vec);
+
tree vectype = STMT_VINFO_VECTYPE (stmt_info);
poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index e5fdc9e0a14..1fa08126ad8 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -68,6 +68,15 @@ enum vect_def_type {
vect_unknown_def_type
};
+/* Define operation type of linear/non-linear induction variable. */
+enum vect_induction_op_type {
+ vect_step_op_add = 0,
+ vect_step_op_neg,
+ vect_step_op_mul,
+ vect_step_op_shl,
+ vect_step_op_shr
+};
+
/* Define type of reduction. */
enum vect_reduction_type {
TREE_CODE_REDUCTION,
@@ -1188,6 +1197,7 @@ public:
the version here. */
tree loop_phi_evolution_base_unchanged;
tree loop_phi_evolution_part;
+ enum vect_induction_op_type loop_phi_evolution_type;
/* Used for various bookkeeping purposes, generally holding a pointer to
some other stmt S that is in some way "related" to this stmt.
@@ -1421,6 +1431,7 @@ struct gather_scatter_info {
((S)->dr_aux.dr && DR_GROUP_FIRST_ELEMENT(S))
#define STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED(S) (S)->loop_phi_evolution_base_unchanged
#define STMT_VINFO_LOOP_PHI_EVOLUTION_PART(S) (S)->loop_phi_evolution_part
+#define STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE(S) (S)->loop_phi_evolution_type
#define STMT_VINFO_MIN_NEG_DIST(S) (S)->min_neg_dist
#define STMT_VINFO_REDUC_TYPE(S) (S)->reduc_type
#define STMT_VINFO_REDUC_CODE(S) (S)->reduc_code
@@ -2327,6 +2338,10 @@ extern int vect_get_known_peeling_cost (loop_vec_info, int, int *,
stmt_vector_for_cost *);
extern tree cse_and_gimplify_to_preheader (loop_vec_info, tree);
+/* Nonlinear induction. */
+extern tree vect_peel_nonlinear_iv_init (gimple_seq*, tree, tree,
+ tree, enum vect_induction_op_type);
+
/* In tree-vect-slp.cc. */
extern void vect_slp_init (void);
extern void vect_slp_fini (void);
--
2.18.1
^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH V2] Extend vectorizer to handle nonlinear induction for neg, mul/lshift/rshift with a constant.
2022-08-29 5:24 ` [PATCH V2] " liuhongt
@ 2022-08-29 5:24 ` liuhongt
2022-09-05 10:58 ` Richard Biener
1 sibling, 0 replies; 7+ messages in thread
From: liuhongt @ 2022-08-29 5:24 UTC (permalink / raw)
To: gcc-patches
>Looks good overall - a few comments inline. Also can you please add
>SLP support?
>I've tried hard to fill in gaps where SLP support is missing since my
>goal is still to get
>rid of non-SLP.
For SLP with different induction types, each IV needs a separate vector IV
update plus a vector permutation. If they all have the same induction type,
the IV can be updated with a single instruction and no permutation.
I'll add an incremental patch for that.
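A rough sketch of such a mixed SLP group (a hypothetical example of mine, not
a test from this patch): two interleaved stores driven by nonlinear IVs of
different induction types, where each IV would need its own vector update and
a permutation to interleave the lanes.

/* Hypothetical SLP case, not part of this patch.  */
void
foo_slp_mixed (int *a, int b, int c)
{
  for (int i = 0; i != 1024; i += 2)
    {
      a[i] = b;       /* IV with vect_step_op_mul.  */
      a[i + 1] = c;   /* IV with vect_step_op_shr.  */
      b *= 3;
      c >>= 1;
    }
}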
>gcc_assert (is_a <gphi *> (loop_phi_node));
>is shorter
It looks like it doesn't accept a gphi*, only a gimple*. Since it's
already a gphi* here, I've removed the assert.
>I think you should use
>init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node,loop_preheader_edge (loop));
>and loop_latch_edge (loop) for ev_expr, you can't rely on them being arg 0 / 1.
Changed.
>Likewise. Use preheader/latch edge.
Changed.
>and those two should then go away.
Removed.
>is not vectorized? I think it should be possible to relax
>this.
Relaxed.
>def is never NULL so a cheaper way to write this is
> || ((def = SSA_NAME_DEF_STMT (ev_expr)), true)
Changed.
>not sure if we need to bother here - ideally vectorizable_live_operation would
>give up instead (but I suppose the regular IV / induction analysis gives up
>here as well?)
Removed.
>the above can at least go into the combined switch case
Changed.
>Seeing this - did you check whether you handle prologue peeling correctly? The
>same issue might show up with a vectorized epilogue. I think you can force a
>peeled prologue with storing unaligned and -fno-vect-cost-model (that IIRC will
>simply optimize for the larger number of aligned memory ops)
Updated in vect_update_ivs_after_vectorizer; peeling for the unaligned cases is now supported as well.
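For reference, a loop along these lines (an illustrative sketch of mine, not
one of the new tests) can force a peeled prologue for alignment, so the
nonlinear IV's start value has to be advanced over the peeled iterations:

/* Illustrative only: the store to a + 1 is typically misaligned, so with
   -fno-vect-cost-model the vectorizer may peel a few scalar iterations
   for alignment before entering the vectorized loop.  */
void
foo_mul_unaligned (int *a, int b)
{
  for (int i = 0; i != 10000; i++)
    {
      a[i + 1] = b;
      b *= 3;
    }
}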
>since you only handle inner loop nonlinear IVs you should probably
>swap the two checks?
Changed.
>There might be a more canonical way to build the series expr
build_vec_series doesn't add the stmt to a sequence, so I'll still keep VEC_SERIES_EXPR here?
>use types_compatible_p (...) instead of comparing TYPE_CANONICAL.
>A small enhancement would be to support different signedness
>(use tree_nop_conversion_p then).
Support different signedness.
>above you asserted that the conversion is only necessary for constants
>but then fold_convert will also generate a tree NOP_EXPR for
>some types_compatible_p types. So maybe only do this for INTEGER_CST
>init_expr or use init_expr = gimple_convert (...) and insert required stmts
>on the preheader.
Changed.
>Alternatively you could perform the vector IV updates in an unsigned type?
Changed.
>why's that necessary? can we do a MIN (vector_step, { prec-1, prec-1,
>prec-1 ... })
It's true for ashr, but not for ashl/lshr. For the latter two, when vector_step >= precision
the result should be zero rather than a shift by prec - 1.
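A small standalone illustration (my own, with 32-bit ints and made-up
constants) of why a clamp to prec - 1 is only equivalent for ashr:

#include <stdio.h>

int
main (void)
{
  int s = -8;
  unsigned int u = 0x80000000u;
  unsigned int v = 1u;

  /* ashr: once the magnitude bits are shifted out the value sticks at -1,
     so the clamped s >> 31 matches the conceptual s >> 40.  */
  printf ("ashr: %d\n", s >> 31);   /* -1, same as the conceptual result.  */

  /* lshr: the conceptual u >> 40 is 0, but the clamped u >> 31 is 1.  */
  printf ("lshr: %u\n", u >> 31);   /* 1, should be 0.  */

  /* shl: the conceptual v << 40 (mod 2^32) is 0, but v << 31 is not.  */
  printf ("shl:  %u\n", v << 31);   /* 0x80000000, should be 0.  */

  return 0;
}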
>> + new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
>> + nunits, induction_type);
>> +
>> + vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
>> + new_name, vectype,
>> + induction_type);
>are these not the same as created above?are these not the same as created above?
They are different: the first one is vf, this one is nunits. vf can be a multiple of
nunits, which is exactly the case this code handles; the phi latch is updated in the former place using the vf-based step.
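To make the difference concrete (numbers of my own choosing): for a mul
induction with step 3, V8SI vectors and VF = 16 (so ncopies = 2), the
vec_step multiplied in on the latch edge is pow (3, 16) per lane, while the
vec_step applied between copy 0 and copy 1 inside one vector iteration is
pow (3, 8) = 6561 per lane.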
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
For neg, the patch creates a vec_init of [ a, -a, a, -a, ... ]; no
vec_step is needed to update the vectorized iv since vf is always a multiple
of 2 (negative * negative is positive).
For shift, the patch creates a vec_init of [ a, a >> c, a >> 2*c, .. ]
and a vec_step of [ c * nunits, c * nunits, c * nunits, ... ]; the vectorized iv
is updated as vec_def = vec_init >>/<< vec_step.
For mul, the patch creates a vec_init of [ a, a * c, a * pow(c, 2), .. ]
and a vec_step of [ pow(c, nunits), pow(c, nunits), ... ]; the iv is updated as
vec_def = vec_init * vec_step.
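As a concrete sketch of the mul case (using the same constants as
pr103144-mul-2.c below: b = 8, c = 3, 8 lanes), this is roughly what the
generated vectors compute, written as plain scalar C:

/* Standalone sketch of the mul-induction scheme for an 8-lane vector;
   vec_init holds the first nunits IV values and vec_step holds
   pow (c, nunits) in every lane, so one vector multiply advances all
   lanes by nunits scalar iterations.  */
#include <stdio.h>

#define NUNITS 8

int
main (void)
{
  unsigned int b = 8, c = 3;
  unsigned int vec_init[NUNITS], vec_step[NUNITS];

  unsigned int pow_c = 1;
  for (int i = 0; i < NUNITS; i++)
    {
      vec_init[i] = b * pow_c;	/* b * pow (c, i): 8, 24, 72, ...  */
      pow_c *= c;
    }
  for (int i = 0; i < NUNITS; i++)
    vec_step[i] = pow_c;	/* pow (c, nunits) == 6561.  */

  /* One vector iteration: vec_def = vec_def * vec_step.  */
  for (int i = 0; i < NUNITS; i++)
    vec_init[i] *= vec_step[i];

  for (int i = 0; i < NUNITS; i++)
    printf ("%u ", vec_init[i]);
  printf ("\n");
  return 0;
}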
The patch handles nonlinear ivs only for:
1. Integer types; floating point is not handled.
2. No slp_node.
3. iv_loop must be the vector loop itself, not a nested loop.
4. No UD is introduced: for mul, unsigned mult is used to avoid UD; for
shift, the shift count must be less than the type precision.
gcc/ChangeLog:
PR tree-optimization/103144
* tree-vect-loop.cc (vect_is_nonlinear_iv_evolution): New function.
(vect_analyze_scalar_cycles_1): Detect nonlinear iv by upper function.
(vect_create_nonlinear_iv_init): New function.
(vect_peel_nonlinear_iv_init): Ditto.
(vect_create_nonlinear_iv_step): Ditto.
(vect_create_nonlinear_iv_vec_step): Ditto.
(vect_update_nonlinear_iv): Ditto.
(vectorizable_nonlinear_induction): Ditto.
(vectorizable_induction): Call
vectorizable_nonlinear_induction when induction_type is not
vect_step_op_add.
* tree-vect-loop-manip.cc (vect_update_ivs_after_vectorizer):
Update nonlinear iv for epilogue loop.
* tree-vectorizer.h (enum vect_induction_op_type): New enum.
(STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE): New Macro.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr103144-mul-1.c: New test.
* gcc.target/i386/pr103144-mul-2.c: New test.
* gcc.target/i386/pr103144-neg-1.c: New test.
* gcc.target/i386/pr103144-neg-2.c: New test.
* gcc.target/i386/pr103144-shift-1.c: New test.
* gcc.target/i386/pr103144-shift-2.c: New test.
---
.../gcc.target/i386/pr103144-mul-1.c | 51 ++
.../gcc.target/i386/pr103144-mul-2.c | 51 ++
.../gcc.target/i386/pr103144-neg-1.c | 51 ++
.../gcc.target/i386/pr103144-neg-2.c | 44 ++
.../gcc.target/i386/pr103144-shift-1.c | 70 ++
.../gcc.target/i386/pr103144-shift-2.c | 79 ++
gcc/tree-vect-loop-manip.cc | 37 +-
gcc/tree-vect-loop.cc | 678 +++++++++++++++++-
gcc/tree-vectorizer.h | 15 +
9 files changed, 1062 insertions(+), 14 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-neg-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-neg-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-shift-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-shift-2.c
diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
new file mode 100644
index 00000000000..640c34fd959
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
@@ -0,0 +1,51 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 4 "vect" } } */
+
+#define N 10000
+
+void
+__attribute__((noipa))
+foo_mul (int* a, int b)
+{
+ for (int i = 0; i != N; i++)
+ {
+ a[i] = b;
+ b *= 3;
+ }
+}
+
+void
+__attribute__((noipa))
+foo_mul_const (int* a)
+{
+ int b = 1;
+ for (int i = 0; i != N; i++)
+ {
+ a[i] = b;
+ b *= 3;
+ }
+}
+
+void
+__attribute__((noipa))
+foo_mul_peel (int* a, int b)
+{
+ for (int i = 0; i != 39; i++)
+ {
+ a[i] = b;
+ b *= 3;
+ }
+}
+
+void
+__attribute__((noipa))
+foo_mul_peel_const (int* a)
+{
+ int b = 1;
+ for (int i = 0; i != 39; i++)
+ {
+ a[i] = b;
+ b *= 3;
+ }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
new file mode 100644
index 00000000000..39fdea3a69d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
@@ -0,0 +1,51 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx2 } */
+
+#include "avx2-check.h"
+#include <string.h>
+#include "pr103144-mul-1.c"
+
+typedef int v8si __attribute__((vector_size(32)));
+
+void
+avx2_test (void)
+{
+ int* epi32_exp = (int*) malloc (N * sizeof (int));
+ int* epi32_dst = (int*) malloc (N * sizeof (int));
+
+ __builtin_memset (epi32_exp, 0, N * sizeof (int));
+ int b = 8;
+ v8si init = __extension__(v8si) { b, b * 3, b * 9, b * 27, b * 81, b * 243, b * 729, b * 2187 };
+
+ for (int i = 0; i != N / 8; i++)
+ {
+ memcpy (epi32_exp + i * 8, &init, 32);
+ init *= 6561;
+ }
+
+ foo_mul (epi32_dst, b);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ foo_mul_peel (epi32_dst, b);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * 4) != 0)
+ __builtin_abort ();
+
+ init = __extension__(v8si) { 1, 3, 9, 27, 81, 243, 729, 2187 };
+ for (int i = 0; i != N / 8; i++)
+ {
+ memcpy (epi32_exp + i * 8, &init, 32);
+ init *= 6561;
+ }
+
+ foo_mul_const (epi32_dst);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ foo_mul_peel_const (epi32_dst);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * 4) != 0)
+ __builtin_abort ();
+
+ return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c b/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c
new file mode 100644
index 00000000000..f87b1d6e529
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c
@@ -0,0 +1,51 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 4 "vect" } } */
+
+#define N 10000
+
+void
+__attribute__((noipa))
+foo_neg (int* a, int b)
+{
+ for (int i = 0; i != N; i++)
+ {
+ a[i] = b;
+ b = -b;
+ }
+}
+
+void
+__attribute__((noipa))
+foo_neg_const (int* a)
+{
+ int b = 1;
+ for (int i = 0; i != N; i++)
+ {
+ a[i] = b;
+ b = -b;
+ }
+}
+
+void
+__attribute__((noipa))
+foo_neg_peel (int* a, int b, int n)
+{
+ for (int i = 0; i != n; i++)
+ {
+ a[i] = b;
+ b = -b;
+ }
+}
+
+void
+__attribute__((noipa))
+foo_neg_const_peel (int* a, int n)
+{
+ int b = 1;
+ for (int i = 0; i != n; i++)
+ {
+ a[i] = b;
+ b = -b;
+ }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c b/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c
new file mode 100644
index 00000000000..bb8c22b9f9e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c
@@ -0,0 +1,44 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx2 } */
+
+#include "avx2-check.h"
+#include <string.h>
+#include "pr103144-neg-1.c"
+
+void
+avx2_test (void)
+{
+ int* epi32_exp = (int*) malloc (N * sizeof (int));
+ int* epi32_dst = (int*) malloc (N * sizeof (int));
+ long long* epi64_exp = (long long*) malloc (N * sizeof (int));
+
+ __builtin_memset (epi32_exp, 0, N * sizeof (int));
+ int b = 100;
+
+ for (int i = 0; i != N / 2; i++)
+ epi64_exp[i] = ((long long) b) | (((long long) -b) << 32);
+
+ memcpy (epi32_exp, epi64_exp, N * sizeof (int));
+ foo_neg (epi32_dst, b);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ foo_neg_peel (epi32_dst, b, 39);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ for (int i = 0; i != N / 2; i++)
+ epi64_exp[i] = ((long long) 1) | (((long long) -1) << 32);
+
+ memcpy (epi32_exp, epi64_exp, N * sizeof (int));
+ foo_neg_const (epi32_dst);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ foo_neg_const_peel (epi32_dst, 39);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c b/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c
new file mode 100644
index 00000000000..2a6920350dd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c
@@ -0,0 +1,70 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
+
+#define N 10000
+void
+__attribute__((noipa))
+foo_shl (int* a, int b)
+{
+ for (int i = 0; i != N; i++)
+ {
+ a[i] = b;
+ b <<= 1;
+ }
+}
+
+void
+__attribute__((noipa))
+foo_ashr (int* a, int b)
+{
+ for (int i = 0; i != N; i++)
+ {
+ a[i] = b;
+ b >>= 1;
+ }
+}
+
+void
+__attribute__((noipa))
+foo_lshr (unsigned int* a, unsigned int b)
+{
+ for (int i = 0; i != N; i++)
+ {
+ a[i] = b;
+ b >>= 1U;
+ }
+}
+
+void
+__attribute__((noipa))
+foo_shl_peel (int* a, int b)
+{
+ for (int i = 0; i != 39; i++)
+ {
+ a[i] = b;
+ b <<= 1;
+ }
+}
+
+void
+__attribute__((noipa))
+foo_ashr_peel (int* a, int b)
+{
+ for (int i = 0; i != 39; i++)
+ {
+ a[i] = b;
+ b >>= 1;
+ }
+}
+
+void
+__attribute__((noipa))
+foo_lshr_peel (unsigned int* a, unsigned int b)
+{
+ for (int i = 0; i != 39; i++)
+ {
+ a[i] = b;
+ b >>= 1U;
+ }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c b/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c
new file mode 100644
index 00000000000..6f477191d96
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c
@@ -0,0 +1,79 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx2 } */
+
+#include "avx2-check.h"
+#include <string.h>
+#include "pr103144-shift-1.c"
+
+typedef int v8si __attribute__((vector_size(32)));
+typedef unsigned int v8usi __attribute__((vector_size(32)));
+
+void
+avx2_test (void)
+{
+ int* epi32_exp = (int*) malloc (N * sizeof (int));
+ int* epi32_dst = (int*) malloc (N * sizeof (int));
+ unsigned int* epu32_exp = (unsigned int*) malloc (N * sizeof (int));
+ unsigned int* epu32_dst = (unsigned int*) malloc (N * sizeof (int));
+
+ __builtin_memset (epi32_exp, 0, N * sizeof (int));
+ int b = 8;
+ v8si init = __extension__(v8si) { b, b << 1, b << 2, b << 3, b << 4, b << 5, b << 6, b << 7 };
+
+ for (int i = 0; i != N / 8; i++)
+ {
+ memcpy (epi32_exp + i * 8, &init, 32);
+ init <<= 8;
+ }
+
+ foo_shl (epi32_dst, b);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ foo_shl_peel (epi32_dst, b);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ b = -11111;
+ init = __extension__(v8si) { b, b >> 1, b >> 2, b >> 3, b >> 4, b >> 5, b >> 6, b >> 7 };
+ for (int i = 0; i != N / 8; i++)
+ {
+ memcpy (epi32_exp + i * 8, &init, 32);
+ init >>= 8;
+ }
+
+ foo_ashr (epi32_dst, b);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ foo_ashr_peel (epi32_dst, b);
+ if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * sizeof (int)) != 0)
+ {
+ for (int i = 0; i != 39; i++)
+ {
+ printf ("epi32_dst[%d] is %d ----", i, epi32_dst[i]);
+ printf ("epi32_exp[%d] is %d\n", i, epi32_exp[i]);
+ }
+ __builtin_abort ();
+ }
+
+ __builtin_memset (epu32_exp, 0, N * sizeof (int));
+ unsigned int c = 11111111;
+ v8usi initu = __extension__(v8usi) { c, c >> 1U, c >> 2U, c >> 3U, c >> 4U, c >> 5U, c >> 6U, c >> 7U };
+ for (int i = 0; i != N / 8; i++)
+ {
+ memcpy (epu32_exp + i * 8, &initu, 32);
+ initu >>= 8U;
+ }
+
+ foo_lshr (epu32_dst, c);
+ if (__builtin_memcmp (epu32_dst, epu32_exp, N * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ foo_lshr_peel (epu32_dst, c);
+ if (__builtin_memcmp (epu32_dst, epu32_exp, 39 * sizeof (int)) != 0)
+ __builtin_abort ();
+
+ return;
+}
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 86d2264054a..fc7901d8a8a 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -1559,15 +1559,28 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
gcc_assert (!tree_is_chrec (step_expr));
init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
+ gimple_seq stmts = NULL;
+ enum vect_induction_op_type induction_type
+ = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
- off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
- fold_convert (TREE_TYPE (step_expr), niters),
- step_expr);
- if (POINTER_TYPE_P (type))
- ni = fold_build_pointer_plus (init_expr, off);
+ if (induction_type == vect_step_op_add)
+ {
+ off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
+ fold_convert (TREE_TYPE (step_expr), niters),
+ step_expr);
+ if (POINTER_TYPE_P (type))
+ ni = fold_build_pointer_plus (init_expr, off);
+ else
+ ni = fold_build2 (PLUS_EXPR, type,
+ init_expr, fold_convert (type, off));
+ }
+ /* Don't bother calling vect_peel_nonlinear_iv_init. */
+ else if (induction_type == vect_step_op_neg)
+ ni = init_expr;
else
- ni = fold_build2 (PLUS_EXPR, type,
- init_expr, fold_convert (type, off));
+ ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
+ niters, step_expr,
+ induction_type);
var = create_tmp_var (type, "tmp");
@@ -1576,9 +1589,15 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
ni_name = force_gimple_operand (ni, &new_stmts, false, var);
/* Exit_bb shouldn't be empty. */
if (!gsi_end_p (last_gsi))
- gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
+ {
+ gsi_insert_seq_after (&last_gsi, stmts, GSI_SAME_STMT);
+ gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
+ }
else
- gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
+ {
+ gsi_insert_seq_before (&last_gsi, stmts, GSI_SAME_STMT);
+ gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
+ }
/* Fix phi expressions in the successor bb. */
adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 2257b29a652..c2293466c1c 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -425,6 +425,77 @@ vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
return true;
}
+/* Function vect_is_nonlinear_iv_evolution
+
+ Only support nonlinear induction for integer type
+ 1. neg
+ 2. mul by constant
+ 3. lshift/rshift by constant.
+
+ For neg induction, return a fake step as integer -1. */
+static bool
+vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
+ gphi* loop_phi_node, tree *init, tree *step)
+{
+ tree init_expr, ev_expr, result, op1, op2;
+ gimple* def;
+
+ if (gimple_phi_num_args (loop_phi_node) != 2)
+ return false;
+
+ init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
+ ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
+
+ /* Support nonlinear induction only for integer type. */
+ if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
+ return false;
+
+ *init = init_expr;
+ result = PHI_RESULT (loop_phi_node);
+
+ if (TREE_CODE (ev_expr) != SSA_NAME
+ || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
+ || !is_gimple_assign (def))
+ return false;
+
+ enum tree_code t_code = gimple_assign_rhs_code (def);
+ switch (t_code)
+ {
+ case NEGATE_EXPR:
+ if (gimple_assign_rhs1 (def) != result)
+ return false;
+ *step = build_int_cst (TREE_TYPE (init_expr), -1);
+ STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
+ break;
+
+ case RSHIFT_EXPR:
+ case LSHIFT_EXPR:
+ case MULT_EXPR:
+ op1 = gimple_assign_rhs1 (def);
+ op2 = gimple_assign_rhs2 (def);
+ if (TREE_CODE (op2) != INTEGER_CST
+ || op1 != result)
+ return false;
+ *step = op2;
+ if (t_code == LSHIFT_EXPR)
+ STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
+ else if (t_code == RSHIFT_EXPR)
+ STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
+ /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
+ else
+ STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
+ break;
+
+ default:
+ return false;
+ }
+
+ STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
+ STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
+
+ return true;
+}
+
/* Return true if PHI, described by STMT_INFO, is the inner PHI in
what we are assuming is a double reduction. For example, given
a structure like this:
@@ -512,11 +583,16 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
= evolution_part_in_loop_num (access_fn, loop->num);
}
- if (!access_fn
- || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
- || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
- || (LOOP_VINFO_LOOP (loop_vinfo) != loop
- && TREE_CODE (step) != INTEGER_CST))
+ if ((!access_fn
+ || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
+ || !vect_is_simple_iv_evolution (loop->num, access_fn,
+ &init, &step)
+ || (LOOP_VINFO_LOOP (loop_vinfo) != loop
+ && TREE_CODE (step) != INTEGER_CST))
+ /* Only handle nonlinear iv for same loop. */
+ && (LOOP_VINFO_LOOP (loop_vinfo) != loop
+ || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
+ phi, &init, &step)))
{
worklist.safe_push (stmt_vinfo);
continue;
@@ -8229,6 +8305,591 @@ vect_can_vectorize_without_simd_p (code_helper code)
&& vect_can_vectorize_without_simd_p (tree_code (code)));
}
+/* Create vector init for vectorized iv. */
+static tree
+vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
+ tree step_expr, poly_uint64 nunits,
+ tree vectype,
+ enum vect_induction_op_type induction_type)
+{
+ unsigned HOST_WIDE_INT const_nunits;
+ tree vec_shift, vec_init, new_name;
+ unsigned i;
+ tree itype = TREE_TYPE (vectype);
+
+ /* iv_loop is the loop to be vectorized. Create the first nunits elements
+ of the IV, e.g. [X, X*S, X*pow(S,2), X*pow(S,3)] for mul (S = step_expr, X = init_expr). */
+ new_name = gimple_convert (stmts, itype, init_expr);
+ switch (induction_type)
+ {
+ case vect_step_op_shr:
+ case vect_step_op_shl:
+ /* Build the initial value from shift_expr. */
+ vec_init = gimple_build_vector_from_val (stmts,
+ vectype,
+ new_name);
+ vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
+ build_zero_cst (itype), step_expr);
+ vec_init = gimple_build (stmts,
+ (induction_type == vect_step_op_shr
+ ? RSHIFT_EXPR : LSHIFT_EXPR),
+ vectype, vec_init, vec_shift);
+ break;
+
+ case vect_step_op_neg:
+ {
+ vec_init = gimple_build_vector_from_val (stmts,
+ vectype,
+ new_name);
+ tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
+ vectype, vec_init);
+ /* The encoding has 2 interleaved stepped patterns. */
+ vec_perm_builder sel (nunits, 2, 3);
+ sel.quick_grow (6);
+ for (i = 0; i < 3; i++)
+ {
+ sel[2 * i] = i;
+ sel[2 * i + 1] = i + nunits;
+ }
+ vec_perm_indices indices (sel, 2, nunits);
+ tree perm_mask_even
+ = vect_gen_perm_mask_checked (vectype, indices);
+ vec_init = gimple_build (stmts, VEC_PERM_EXPR,
+ vectype,
+ vec_init, vec_neg,
+ perm_mask_even);
+ }
+ break;
+
+ case vect_step_op_mul:
+ {
+ /* Use unsigned mult to avoid UD integer overflow. */
+ gcc_assert (nunits.is_constant (&const_nunits));
+ tree utype = unsigned_type_for (itype);
+ tree uvectype = build_vector_type (utype,
+ TYPE_VECTOR_SUBPARTS (vectype));
+ new_name = gimple_convert (stmts, utype, new_name);
+ vec_init = gimple_build_vector_from_val (stmts,
+ uvectype,
+ new_name);
+ tree_vector_builder elts (uvectype, const_nunits, 1);
+ tree elt_step = build_one_cst (utype);
+
+ elts.quick_push (elt_step);
+ for (i = 1; i < const_nunits; i++)
+ {
+ /* Create: elt_step_i = elt_step_(i-1) * step_expr. */
+ elt_step = gimple_build (stmts, MULT_EXPR,
+ utype, elt_step, step_expr);
+ elts.quick_push (elt_step);
+ }
+ /* Create a vector from [new_name_0, new_name_1, ...,
+ new_name_nunits-1]. */
+ tree vec_mul = gimple_build_vector (stmts, &elts);
+ vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
+ vec_init, vec_mul);
+ vec_init = gimple_convert (stmts, vectype, vec_init);
+ }
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ return vec_init;
+}
+
+/* Peel init_expr by skip_niter for induction_type. */
+tree
+vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
+ tree skip_niters, tree step_expr,
+ enum vect_induction_op_type induction_type)
+{
+ gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
+ tree type = TREE_TYPE (init_expr);
+ unsigned prec = TYPE_PRECISION (type);
+ switch (induction_type)
+ {
+ case vect_step_op_neg:
+ if (TREE_INT_CST_LOW (skip_niters) % 2)
+ init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
+ /* else no change. */
+ break;
+
+ case vect_step_op_shr:
+ case vect_step_op_shl:
+ skip_niters = gimple_convert (stmts, type, skip_niters);
+ step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
+ /* When shift amount >= precision, need to avoid UD.
+ In the original loop, there's no UD, and according to semantics,
+ init_expr should be 0 for lshr and ashl, and >>= (prec - 1) for ashr. */
+ if (!tree_fits_uhwi_p (step_expr)
+ || tree_to_uhwi (step_expr) >= prec)
+ {
+ if (induction_type == vect_step_op_shl
+ || TYPE_UNSIGNED (type))
+ init_expr = build_zero_cst (type);
+ else
+ init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
+ init_expr,
+ wide_int_to_tree (type, prec - 1));
+ }
+ else
+ init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
+ ? RSHIFT_EXPR : LSHIFT_EXPR),
+ type, init_expr, step_expr);
+ break;
+
+ case vect_step_op_mul:
+ {
+ tree utype = unsigned_type_for (type);
+ init_expr = gimple_convert (stmts, utype, init_expr);
+ unsigned skipn = TREE_INT_CST_LOW (skip_niters);
+ wide_int begin = wi::to_wide (step_expr);
+ for (unsigned i = 0; i != skipn - 1; i++)
+ begin = wi::mul (begin, wi::to_wide (step_expr));
+ tree mult_expr = wide_int_to_tree (utype, begin);
+ init_expr = gimple_build (stmts, MULT_EXPR, utype, init_expr, mult_expr);
+ init_expr = gimple_convert (stmts, type, init_expr);
+ }
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ return init_expr;
+}
+
+/* Create vector step for vectorized iv. */
+static tree
+vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
+ poly_uint64 vf,
+ enum vect_induction_op_type induction_type)
+{
+ tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
+ tree new_name = NULL;
+ /* Step should be pow (step, vf) for mult induction. */
+ if (induction_type == vect_step_op_mul)
+ {
+ gcc_assert (vf.is_constant ());
+ wide_int begin = wi::to_wide (step_expr);
+
+ for (unsigned i = 0; i != vf.to_constant () - 1; i++)
+ begin = wi::mul (begin, wi::to_wide (step_expr));
+
+ new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
+ }
+ else if (induction_type == vect_step_op_neg)
+ /* Do nothing. */
+ ;
+ else
+ new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
+ expr, step_expr);
+ return new_name;
+}
+
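+/* As an illustration (example values): with vf == 8 the scalar step becomes
+   pow (3, 8) == 6561 for a mul induction with step 3 and 8 * C for a shift
+   induction with step C; neg needs no step at all, since vf is always a
+   multiple of 2 and two negations cancel.  */
+
+/* Build a vector from the scalar step NEW_NAME to be used as the step of
+   the vectorized nonlinear iv; no step is needed for neg induction.  */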
+static tree
+vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
+ stmt_vec_info stmt_info,
+ tree new_name, tree vectype,
+ enum vect_induction_op_type induction_type)
+{
+ /* No step is needed for neg induction. */
+ if (induction_type == vect_step_op_neg)
+ return NULL;
+
+ tree t = unshare_expr (new_name);
+ gcc_assert (CONSTANT_CLASS_P (new_name)
+ || TREE_CODE (new_name) == SSA_NAME);
+ tree new_vec = build_vector_from_val (vectype, t);
+ tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
+ new_vec, vectype, NULL);
+ return vec_step;
+}
+
+/* Update the vectorized iv with vec_step; induc_def is the initial value. */
+static tree
+vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
+ tree induc_def, tree vec_step,
+ enum vect_induction_op_type induction_type)
+{
+ tree vec_def = induc_def;
+ switch (induction_type)
+ {
+ case vect_step_op_mul:
+ {
+ /* Use unsigned mult to avoid UB on signed integer overflow. */
+ tree uvectype
+ = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
+ TYPE_VECTOR_SUBPARTS (vectype));
+ vec_def = gimple_convert (stmts, uvectype, vec_def);
+ vec_step = gimple_convert (stmts, uvectype, vec_step);
+ vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
+ vec_def, vec_step);
+ vec_def = gimple_convert (stmts, vectype, vec_def);
+ }
+ break;
+
+ case vect_step_op_shr:
+ vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
+ vec_def, vec_step);
+ break;
+
+ case vect_step_op_shl:
+ vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
+ vec_def, vec_step);
+ break;
+ case vect_step_op_neg:
+ vec_def = induc_def;
+ /* Do nothing. */
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ return vec_def;
+}
+
+/* Function vectorizable_nonlinear_induction
+
+ Check if STMT_INFO performs a nonlinear induction computation that can be
+ vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
+ a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
+ basic block.
+ Return true if STMT_INFO is vectorizable in this way. */
+
+static bool
+vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
+ stmt_vec_info stmt_info,
+ gimple **vec_stmt, slp_tree slp_node,
+ stmt_vector_for_cost *cost_vec)
+{
+ class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+ unsigned ncopies;
+ bool nested_in_vect_loop = false;
+ class loop *iv_loop;
+ tree vec_def;
+ edge pe = loop_preheader_edge (loop);
+ basic_block new_bb;
+ tree vec_init, vec_step;
+ tree new_name;
+ gimple *new_stmt;
+ gphi *induction_phi;
+ tree induc_def, vec_dest;
+ tree init_expr, step_expr;
+ tree niters_skip;
+ poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+ unsigned i;
+ gimple_stmt_iterator si;
+
+ gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
+
+ tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+ poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
+ enum vect_induction_op_type induction_type
+ = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
+
+ gcc_assert (induction_type > vect_step_op_add);
+
+ if (slp_node)
+ ncopies = 1;
+ else
+ ncopies = vect_get_num_copies (loop_vinfo, vectype);
+ gcc_assert (ncopies >= 1);
+
+ /* FORNOW. Only handle nonlinear induction in the same loop. */
+ if (nested_in_vect_loop_p (loop, stmt_info))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "nonlinear induction in nested loop.\n");
+ return false;
+ }
+
+ iv_loop = loop;
+ gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
+
+ /* TODO: Support slp for nonlinear iv. There should be separate vector iv
+ update for each iv and a permutation to generate wanted vector iv. */
+ if (slp_node)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "SLP induction not supported for nonlinear"
+ " induction.\n");
+ return false;
+ }
+
+ /* Init_expr will be updated by vect_update_ivs_after_vectorizer
+ if niters is unknown:
+ For shift, when the shift amount >= precision, there would be UB.
+ For mult, we don't know how to generate
+ init_expr * pow (step, niters) for variable niters.
+ For neg, it should be OK, since niters of the vectorized main loop
+ will always be a multiple of 2. */
+ if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && induction_type != vect_step_op_neg)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Peeling for epilogue is not supported"
+ " for nonlinear induction except neg"
+ " when iteration count is unknown.\n");
+ return false;
+ }
+
+ /* Peeling for alignment is also not supported for neg when niters_skip
+ is variable.
+ ??? Generate something like niter_expr & 1 ? init_expr : -init_expr? */
+ niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+ if (niters_skip != NULL_TREE
+ && TREE_CODE (niters_skip) != INTEGER_CST)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Peeling for alignement is not supported"
+ " for nonlinear induction when niters_skip"
+ " is not constant.\n");
+ return false;
+ }
+
+ /* Only integral types are handled (vect_is_nonlinear_iv_evolution already
+ rejects anything else), so reject floating point here. */
+ if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "floating point nonlinear induction vectorization"
+ " not supported.\n");
+ return false;
+ }
+
+ step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
+ init_expr = vect_phi_initial_value (phi);
+ gcc_assert (step_expr != NULL_TREE && init_expr != NULL
+ && TREE_CODE (step_expr) == INTEGER_CST);
+ /* step_expr should have the same type as init_expr,
+ e.g. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used. */
+ step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
+
+ if (TREE_CODE (init_expr) == INTEGER_CST)
+ init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
+ else
+ gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
+ TREE_TYPE (init_expr)));
+
+ switch (induction_type)
+ {
+ case vect_step_op_neg:
+ if (TREE_CODE (init_expr) != INTEGER_CST
+ && TREE_CODE (init_expr) != REAL_CST)
+ {
+ /* Check for backend support of NEGATE_EXPR and vec_perm. */
+ if (!directly_supported_p (NEGATE_EXPR, vectype))
+ return false;
+
+ /* The encoding has 2 interleaved stepped patterns. */
+ vec_perm_builder sel (nunits, 2, 3);
+ machine_mode mode = TYPE_MODE (vectype);
+ sel.quick_grow (6);
+ for (i = 0; i < 3; i++)
+ {
+ sel[i * 2] = i;
+ sel[i * 2 + 1] = i + nunits;
+ }
+ vec_perm_indices indices (sel, 2, nunits);
+ if (!can_vec_perm_const_p (mode, mode, indices))
+ return false;
+ }
+ break;
+
+ case vect_step_op_mul:
+ {
+ /* Check for backend support of MULT_EXPR. */
+ if (!directly_supported_p (MULT_EXPR, vectype))
+ return false;
+
+ /* ??? How to construct the vector step for variable-length vectors:
+ [ 1, step, pow (step, 2), pow (step, 3), .. ]. */
+ if (!vf.is_constant ())
+ return false;
+ }
+ break;
+
+ case vect_step_op_shr:
+ /* Check for backend support of RSHIFT_EXPR. */
+ if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
+ return false;
+
+ /* Don't shift more than the type precision to avoid UB. */
+ if (!tree_fits_uhwi_p (step_expr)
+ || maybe_ge (nunits * tree_to_uhwi (step_expr),
+ TYPE_PRECISION (TREE_TYPE (init_expr))))
+ return false;
+ break;
+
+ case vect_step_op_shl:
+ /* Check for backend support of LSHIFT_EXPR. */
+ if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
+ return false;
+
+ /* Don't shift more than the type precision to avoid UB. */
+ if (!tree_fits_uhwi_p (step_expr)
+ || maybe_ge (nunits * tree_to_uhwi (step_expr),
+ TYPE_PRECISION (TREE_TYPE (init_expr))))
+ return false;
+
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ if (!vec_stmt) /* transformation not required. */
+ {
+ unsigned inside_cost = 0, prologue_cost = 0;
+ /* loop cost for vec_loop. */
+ inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
+ stmt_info, 0, vect_body);
+
+ /* Neg induction doesn't have any inside_cost. */
+ if (induction_type == vect_step_op_neg)
+ inside_cost = 0;
+
+ /* prologue cost for vec_init and vec_step. */
+ prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
+ stmt_info, 0, vect_prologue);
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "vect_model_induction_cost: inside_cost = %d, "
+ "prologue_cost = %d. \n", inside_cost,
+ prologue_cost);
+
+ STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
+ DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
+ return true;
+ }
+
+ /* Transform. */
+
+ /* Compute a vector variable, initialized with the first VF values of
+ the induction variable. E.g., for an iv with IV_PHI='X' and
+ evolution S, for a vector of 4 units, we want to compute:
+ [X, X + S, X + 2*S, X + 3*S]. */
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
+
+ pe = loop_preheader_edge (iv_loop);
+ /* Find the first insertion point in the BB. */
+ basic_block bb = gimple_bb (phi);
+ si = gsi_after_labels (bb);
+
+ gimple_seq stmts = NULL;
+
+ /* If we are using the loop mask to "peel" for alignment then we need
+ to adjust the start value here. */
+ if (niters_skip != NULL_TREE)
+ init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
+ step_expr, induction_type);
+
+ vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
+ step_expr, nunits, vectype,
+ induction_type);
+ if (stmts)
+ {
+ new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
+ gcc_assert (!new_bb);
+ }
+
+ stmts = NULL;
+ new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
+ vf, induction_type);
+ if (stmts)
+ {
+ new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
+ gcc_assert (!new_bb);
+ }
+
+ vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
+ new_name, vectype,
+ induction_type);
+ /* Create the following def-use cycle:
+ loop prolog:
+ vec_init = ...
+ vec_step = ...
+ loop:
+ vec_iv = PHI <vec_init, vec_loop>
+ ...
+ STMT
+ ...
+ vec_loop = vec_iv + vec_step; */
+
+ /* Create the induction-phi that defines the induction-operand. */
+ vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
+ induction_phi = create_phi_node (vec_dest, iv_loop->header);
+ induc_def = PHI_RESULT (induction_phi);
+
+ /* Create the iv update inside the loop. */
+ stmts = NULL;
+ vec_def = vect_update_nonlinear_iv (&stmts, vectype,
+ induc_def, vec_step,
+ induction_type);
+
+ gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
+ new_stmt = SSA_NAME_DEF_STMT (vec_def);
+
+ /* Set the arguments of the phi node: */
+ add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
+ add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
+ UNKNOWN_LOCATION);
+
+ STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
+ *vec_stmt = induction_phi;
+
+ /* In case that vectorization factor (VF) is bigger than the number
+ of elements that we can fit in a vectype (nunits), we have to generate
+ more than one vector stmt - i.e - we need to "unroll" the
+ vector stmt by a factor VF/nunits. For more details see documentation
+ in vectorizable_operation. */
+
+ if (ncopies > 1)
+ {
+ stmts = NULL;
+ /* FORNOW. This restriction should be relaxed. */
+ gcc_assert (!nested_in_vect_loop);
+
+ new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
+ nunits, induction_type);
+
+ vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
+ new_name, vectype,
+ induction_type);
+ vec_def = induc_def;
+ for (i = 1; i < ncopies; i++)
+ {
+ /* vec_i = vec_prev OP vec_step, with OP given by induction_type. */
+ stmts = NULL;
+ vec_def = vect_update_nonlinear_iv (&stmts, vectype,
+ vec_def, vec_step,
+ induction_type);
+ gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
+ new_stmt = SSA_NAME_DEF_STMT (vec_def);
+ STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+ }
+ }
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "transform induction: created def-use cycle: %G%G",
+ induction_phi, SSA_NAME_DEF_STMT (vec_def));
+
+ return true;
+}
+
/* Function vectorizable_induction
Check if STMT_INFO performs an induction computation that can be vectorized.
@@ -8259,6 +8920,8 @@ vectorizable_induction (loop_vec_info loop_vinfo,
unsigned i;
tree expr;
gimple_stmt_iterator si;
+ enum vect_induction_op_type induction_type
+ = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
if (!phi)
@@ -8271,6 +8934,11 @@ vectorizable_induction (loop_vec_info loop_vinfo,
if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
return false;
+ /* Handle nonlinear induction in a separate place. */
+ if (induction_type != vect_step_op_add)
+ return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
+ vec_stmt, slp_node, cost_vec);
+
tree vectype = STMT_VINFO_VECTYPE (stmt_info);
poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index e5fdc9e0a14..1fa08126ad8 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -68,6 +68,15 @@ enum vect_def_type {
vect_unknown_def_type
};
+/* Define operation type of linear/non-linear induction variable. */
+enum vect_induction_op_type {
+ vect_step_op_add = 0,
+ vect_step_op_neg,
+ vect_step_op_mul,
+ vect_step_op_shl,
+ vect_step_op_shr
+};
+
/* Define type of reduction. */
enum vect_reduction_type {
TREE_CODE_REDUCTION,
@@ -1188,6 +1197,7 @@ public:
the version here. */
tree loop_phi_evolution_base_unchanged;
tree loop_phi_evolution_part;
+ enum vect_induction_op_type loop_phi_evolution_type;
/* Used for various bookkeeping purposes, generally holding a pointer to
some other stmt S that is in some way "related" to this stmt.
@@ -1421,6 +1431,7 @@ struct gather_scatter_info {
((S)->dr_aux.dr && DR_GROUP_FIRST_ELEMENT(S))
#define STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED(S) (S)->loop_phi_evolution_base_unchanged
#define STMT_VINFO_LOOP_PHI_EVOLUTION_PART(S) (S)->loop_phi_evolution_part
+#define STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE(S) (S)->loop_phi_evolution_type
#define STMT_VINFO_MIN_NEG_DIST(S) (S)->min_neg_dist
#define STMT_VINFO_REDUC_TYPE(S) (S)->reduc_type
#define STMT_VINFO_REDUC_CODE(S) (S)->reduc_code
@@ -2327,6 +2338,10 @@ extern int vect_get_known_peeling_cost (loop_vec_info, int, int *,
stmt_vector_for_cost *);
extern tree cse_and_gimplify_to_preheader (loop_vec_info, tree);
+/* Nonlinear induction. */
+extern tree vect_peel_nonlinear_iv_init (gimple_seq*, tree, tree,
+ tree, enum vect_induction_op_type);
+
/* In tree-vect-slp.cc. */
extern void vect_slp_init (void);
extern void vect_slp_fini (void);
--
2.18.1
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH V2] Extend vectorizer to handle nonlinear induction for neg, mul/lshift/rshift with a constant.
2022-08-29 5:24 ` [PATCH V2] " liuhongt
2022-08-29 5:24 ` liuhongt
@ 2022-09-05 10:58 ` Richard Biener
1 sibling, 0 replies; 7+ messages in thread
From: Richard Biener @ 2022-09-05 10:58 UTC (permalink / raw)
To: liuhongt; +Cc: GCC Patches
On Mon, Aug 29, 2022 at 7:29 AM liuhongt via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> >Looks good overall - a few comments inline. Also can you please add
> >SLP support?
> >I've tried hard to fill in gaps where SLP support is missing since my
> >goal is still to get
> >rid of non-SLP.
> For SLP with different induction types, each iv needs a separate update and
> a vector permutation. If they are all the same induction type, the iv can
> be updated with one instruction w/o permutation.
> I'll add an incremental patch for that.
>
> >gcc_assert (is_a <gphi *> (loop_phi_node));
> >is shorter
> It looks like it doesn't support gphi*, only gimple*. Since it's
> already a gphi* here, I've removed the assert.
>
> >I think you should use
> >init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node,loop_preheader_edge (loop));
> >and loop_latch_edge (loop) for ev_expr, you can't rely on them being arg 0 / 1.
> Changed.
>
> >Likewise. Use preheader/latch edge.
> Changed.
>
> >and those two should then go away.
> Removed.
>
> >is not vectorized? I think it should be possible to relax
> >this.
> Relaxed.
>
> >def is never NULL so a cheaper way to write this is
> > || ((def = SSA_NAME_DEF_STMT (ev_expr)), true)
> Changed.
>
> >not sure if we need to bother here - ideally vectorizable_live_operation would
> >give up instead (but I suppose the regular IV / induction analysis gives up
> >here as well?)
> Removed.
>
> >the above can at least go into the combined switch case
> Changed.
>
> >Seeing this - did you check whether you handle prologue peeling correctly? The
> >same issue might show up with a vectorized epilogue. I think you can force a
> >peeled prologue with storing unaligned and -fno-vect-cost-model (that IIRC will
> >simply optimize for the larger number of aligned memory ops)
> Updated in vect_update_ivs_after_vectorizer; peeling for the unaligned case is also supported now.
>
> >since you only handle inner loop nonlinear IVs you should probably
> >swap the two checks?
> Changed.
>
> >There might be a more canonical way to build the series expr
> build_vec_series doesn't add a stmt to the sequence, so I'll still keep VEC_SERIES_EXPR here?
>
> >use types_compatible_p (...) instead of comparing TYPE_CANONICAL.
> >A small enhancement would be to support different signedness
> >(use tree_nop_conversion_p then).
> Support different signedness.
>
> >above you asserted that the conversion is only necessary for constants
> >but then fold_convert will also generate a tree NOP_EXPR for
> >some types_compatible_p types. So maybe only do this for INTEGER_CST
> >init_expr or use init_expr = gimple_convert (...) and insert required stmts
> >on the preheader.
> Changed.
>
> >Alternatively you could perform the vector IV updates in an unsigned type?
> Changed.
>
> >why's that necessary? can we do a MIN (vector_step, { prec-1, prec-1,
> >prec-1 ... })
> It's true for ashr, but not for ashl and lshr. For the latter two, when vector_step >= precision
> the result should be zero instead of a shift by prec - 1.
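>
> A minimal hypothetical sketch of that difference (the function name and
> values are illustrative, not from the patch):
>
> /* Returns 0; clamping the shift amount to prec - 1 == 7 would instead give
>    0x80 >> 7 == 1, so the clamp is only safe for ashr.  */
> unsigned char
> lshr_clamp_demo (void)
> {
>   unsigned char b = 0x80;
>   for (int i = 0; i < 9; i++)	/* nine single-bit logical shifts.  */
>     b >>= 1;
>   return b;			/* 0, not 0x80 >> 7 == 1.  */
> }
>
> Only ashr keeps its result under such a clamp (with GCC's arithmetic shift,
> (signed char) -128 >> 7 == -1).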
>
> >> + new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
> >> + nunits, induction_type);
> >> +
> >> + vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
> >> + new_name, vectype,
> >> + induction_type);
>
> >are these not the same as created above?
>
> They are different: the first one uses vf, this one uses nunits. vf can be
> a multiple of nunits (e.g. two copies of V8SI when vf == 16), which is exactly
> the case this code handles; the latch phi is updated by the earlier, vf-based step.
>
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
OK, and sorry for the delay.
Thanks.
Richard.
>
> For neg, the patch creates a vec_init of [ a, -a, a, -a, ... ] and no
> vec_step is needed to update the vectorized iv, since vf is always a multiple
> of 2 (negating twice gives back the original value).
>
> For shift, the patch creates a vec_init of [ a, a >> c, a >> 2*c, .. ]
> and a vec_step of [ c * nunits, c * nunits, c * nunits, ... ]; the vectorized iv
> is updated as vec_def = vec_init >>/<< vec_step.
>
> For mul, the patch creates a vec_init of [ a, a * c, a * pow(c, 2), .. ]
> and a vec_step of [ pow(c,nunits), pow(c,nunits),...]; the iv is updated as
> vec_def = vec_init * vec_step.
>
> The patch handles nonlinear ivs under these restrictions:
> 1. Integer types only; floating point is not handled.
> 2. No slp_node.
> 3. iv_loop must be the vector loop itself, not a nested loop.
> 4. No UB is introduced: for mul, unsigned mult is used to avoid UB; for
> shift, the shift count must be less than the type precision.
>
> gcc/ChangeLog:
>
> PR tree-optimization/103144
> * tree-vect-loop.cc (vect_is_nonlinear_iv_evolution): New function.
> (vect_analyze_scalar_cycles_1): Detect nonlinear iv by upper function.
> (vect_create_nonlinear_iv_init): New function.
> (vect_peel_nonlinear_iv_init): Ditto.
> (vect_create_nonlinear_iv_step): Ditto
> (vect_create_nonlinear_iv_vec_step): Ditto
> (vect_update_nonlinear_iv): Ditto
> (vectorizable_nonlinear_induction): Ditto.
> (vectorizable_induction): Call
> vectorizable_nonlinear_induction when induction_type is not
> vect_step_op_add.
> * tree-vect-loop-manip.cc (vect_update_ivs_after_vectorizer):
> Update nonlinear iv for epilogue loop.
> * tree-vectorizer.h (enum vect_induction_op_type): New enum.
> (STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE): New Macro.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/pr103144-mul-1.c: New test.
> * gcc.target/i386/pr103144-mul-2.c: New test.
> * gcc.target/i386/pr103144-neg-1.c: New test.
> * gcc.target/i386/pr103144-neg-2.c: New test.
> * gcc.target/i386/pr103144-shift-1.c: New test.
> * gcc.target/i386/pr103144-shift-2.c: New test.
> ---
> .../gcc.target/i386/pr103144-mul-1.c | 51 ++
> .../gcc.target/i386/pr103144-mul-2.c | 51 ++
> .../gcc.target/i386/pr103144-neg-1.c | 51 ++
> .../gcc.target/i386/pr103144-neg-2.c | 44 ++
> .../gcc.target/i386/pr103144-shift-1.c | 70 ++
> .../gcc.target/i386/pr103144-shift-2.c | 79 ++
> gcc/tree-vect-loop-manip.cc | 37 +-
> gcc/tree-vect-loop.cc | 678 +++++++++++++++++-
> gcc/tree-vectorizer.h | 15 +
> 9 files changed, 1062 insertions(+), 14 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-neg-1.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-neg-2.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-shift-1.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-shift-2.c
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
> new file mode 100644
> index 00000000000..640c34fd959
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
> @@ -0,0 +1,51 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 4 "vect" } } */
> +
> +#define N 10000
> +
> +void
> +__attribute__((noipa))
> +foo_mul (int* a, int b)
> +{
> + for (int i = 0; i != N; i++)
> + {
> + a[i] = b;
> + b *= 3;
> + }
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_mul_const (int* a)
> +{
> + int b = 1;
> + for (int i = 0; i != N; i++)
> + {
> + a[i] = b;
> + b *= 3;
> + }
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_mul_peel (int* a, int b)
> +{
> + for (int i = 0; i != 39; i++)
> + {
> + a[i] = b;
> + b *= 3;
> + }
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_mul_peel_const (int* a)
> +{
> + int b = 1;
> + for (int i = 0; i != 39; i++)
> + {
> + a[i] = b;
> + b *= 3;
> + }
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
> new file mode 100644
> index 00000000000..39fdea3a69d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
> @@ -0,0 +1,51 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx2 } */
> +
> +#include "avx2-check.h"
> +#include <string.h>
> +#include "pr103144-mul-1.c"
> +
> +typedef int v8si __attribute__((vector_size(32)));
> +
> +void
> +avx2_test (void)
> +{
> + int* epi32_exp = (int*) malloc (N * sizeof (int));
> + int* epi32_dst = (int*) malloc (N * sizeof (int));
> +
> + __builtin_memset (epi32_exp, 0, N * sizeof (int));
> + int b = 8;
> + v8si init = __extension__(v8si) { b, b * 3, b * 9, b * 27, b * 81, b * 243, b * 729, b * 2187 };
> +
> + for (int i = 0; i != N / 8; i++)
> + {
> + memcpy (epi32_exp + i * 8, &init, 32);
> + init *= 6561;
> + }
> +
> + foo_mul (epi32_dst, b);
> + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
> + __builtin_abort ();
> +
> + foo_mul_peel (epi32_dst, b);
> + if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * 4) != 0)
> + __builtin_abort ();
> +
> + init = __extension__(v8si) { 1, 3, 9, 27, 81, 243, 729, 2187 };
> + for (int i = 0; i != N / 8; i++)
> + {
> + memcpy (epi32_exp + i * 8, &init, 32);
> + init *= 6561;
> + }
> +
> + foo_mul_const (epi32_dst);
> + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
> + __builtin_abort ();
> +
> + foo_mul_peel_const (epi32_dst);
> + if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * 4) != 0)
> + __builtin_abort ();
> +
> + return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c b/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c
> new file mode 100644
> index 00000000000..f87b1d6e529
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c
> @@ -0,0 +1,51 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 4 "vect" } } */
> +
> +#define N 10000
> +
> +void
> +__attribute__((noipa))
> +foo_neg (int* a, int b)
> +{
> + for (int i = 0; i != N; i++)
> + {
> + a[i] = b;
> + b = -b;
> + }
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_neg_const (int* a)
> +{
> + int b = 1;
> + for (int i = 0; i != N; i++)
> + {
> + a[i] = b;
> + b = -b;
> + }
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_neg_peel (int* a, int b, int n)
> +{
> + for (int i = 0; i != n; i++)
> + {
> + a[i] = b;
> + b = -b;
> + }
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_neg_const_peel (int* a, int n)
> +{
> + int b = 1;
> + for (int i = 0; i != n; i++)
> + {
> + a[i] = b;
> + b = -b;
> + }
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c b/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c
> new file mode 100644
> index 00000000000..bb8c22b9f9e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c
> @@ -0,0 +1,44 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx2 } */
> +
> +#include "avx2-check.h"
> +#include <string.h>
> +#include "pr103144-neg-1.c"
> +
> +void
> +avx2_test (void)
> +{
> + int* epi32_exp = (int*) malloc (N * sizeof (int));
> + int* epi32_dst = (int*) malloc (N * sizeof (int));
> + long long* epi64_exp = (long long*) malloc (N * sizeof (int));
> +
> + __builtin_memset (epi32_exp, 0, N * sizeof (int));
> + int b = 100;
> +
> + for (int i = 0; i != N / 2; i++)
> + epi64_exp[i] = ((long long) b) | (((long long) -b) << 32);
> +
> + memcpy (epi32_exp, epi64_exp, N * sizeof (int));
> + foo_neg (epi32_dst, b);
> + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
> + __builtin_abort ();
> +
> + foo_neg_peel (epi32_dst, b, 39);
> + if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * sizeof (int)) != 0)
> + __builtin_abort ();
> +
> + for (int i = 0; i != N / 2; i++)
> + epi64_exp[i] = ((long long) 1) | (((long long) -1) << 32);
> +
> + memcpy (epi32_exp, epi64_exp, N * sizeof (int));
> + foo_neg_const (epi32_dst);
> + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
> + __builtin_abort ();
> +
> + foo_neg_const_peel (epi32_dst, 39);
> + if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * sizeof (int)) != 0)
> + __builtin_abort ();
> +
> + return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c b/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c
> new file mode 100644
> index 00000000000..2a6920350dd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c
> @@ -0,0 +1,70 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> +
> +#define N 10000
> +void
> +__attribute__((noipa))
> +foo_shl (int* a, int b)
> +{
> + for (int i = 0; i != N; i++)
> + {
> + a[i] = b;
> + b <<= 1;
> + }
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_ashr (int* a, int b)
> +{
> + for (int i = 0; i != N; i++)
> + {
> + a[i] = b;
> + b >>= 1;
> + }
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_lshr (unsigned int* a, unsigned int b)
> +{
> + for (int i = 0; i != N; i++)
> + {
> + a[i] = b;
> + b >>= 1U;
> + }
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_shl_peel (int* a, int b)
> +{
> + for (int i = 0; i != 39; i++)
> + {
> + a[i] = b;
> + b <<= 1;
> + }
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_ashr_peel (int* a, int b)
> +{
> + for (int i = 0; i != 39; i++)
> + {
> + a[i] = b;
> + b >>= 1;
> + }
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_lshr_peel (unsigned int* a, unsigned int b)
> +{
> + for (int i = 0; i != 39; i++)
> + {
> + a[i] = b;
> + b >>= 1U;
> + }
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c b/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c
> new file mode 100644
> index 00000000000..6f477191d96
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c
> @@ -0,0 +1,79 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx2 } */
> +
> +#include "avx2-check.h"
> +#include <string.h>
> +#include "pr103144-shift-1.c"
> +
> +typedef int v8si __attribute__((vector_size(32)));
> +typedef unsigned int v8usi __attribute__((vector_size(32)));
> +
> +void
> +avx2_test (void)
> +{
> + int* epi32_exp = (int*) malloc (N * sizeof (int));
> + int* epi32_dst = (int*) malloc (N * sizeof (int));
> + unsigned int* epu32_exp = (unsigned int*) malloc (N * sizeof (int));
> + unsigned int* epu32_dst = (unsigned int*) malloc (N * sizeof (int));
> +
> + __builtin_memset (epi32_exp, 0, N * sizeof (int));
> + int b = 8;
> + v8si init = __extension__(v8si) { b, b << 1, b << 2, b << 3, b << 4, b << 5, b << 6, b << 7 };
> +
> + for (int i = 0; i != N / 8; i++)
> + {
> + memcpy (epi32_exp + i * 8, &init, 32);
> + init <<= 8;
> + }
> +
> + foo_shl (epi32_dst, b);
> + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
> + __builtin_abort ();
> +
> + foo_shl_peel (epi32_dst, b);
> + if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * sizeof (int)) != 0)
> + __builtin_abort ();
> +
> + b = -11111;
> + init = __extension__(v8si) { b, b >> 1, b >> 2, b >> 3, b >> 4, b >> 5, b >> 6, b >> 7 };
> + for (int i = 0; i != N / 8; i++)
> + {
> + memcpy (epi32_exp + i * 8, &init, 32);
> + init >>= 8;
> + }
> +
> + foo_ashr (epi32_dst, b);
> + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
> + __builtin_abort ();
> +
> + foo_ashr_peel (epi32_dst, b);
> + if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * sizeof (int)) != 0)
> + {
> + for (int i = 0; i != 39; i++)
> + {
> + printf ("epi32_dst[%d] is %d ----", i, epi32_dst[i]);
> + printf ("epi32_exp[%d] is %d\n", i, epi32_exp[i]);
> + }
> + __builtin_abort ();
> + }
> +
> + __builtin_memset (epu32_exp, 0, N * sizeof (int));
> + unsigned int c = 11111111;
> + v8usi initu = __extension__(v8usi) { c, c >> 1U, c >> 2U, c >> 3U, c >> 4U, c >> 5U, c >> 6U, c >> 7U };
> + for (int i = 0; i != N / 8; i++)
> + {
> + memcpy (epu32_exp + i * 8, &initu, 32);
> + initu >>= 8U;
> + }
> +
> + foo_lshr (epu32_dst, c);
> + if (__builtin_memcmp (epu32_dst, epu32_exp, N * sizeof (int)) != 0)
> + __builtin_abort ();
> +
> + foo_lshr_peel (epu32_dst, c);
> + if (__builtin_memcmp (epu32_dst, epu32_exp, 39 * sizeof (int)) != 0)
> + __builtin_abort ();
> +
> + return;
> +}
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index 86d2264054a..fc7901d8a8a 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -1559,15 +1559,28 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
> gcc_assert (!tree_is_chrec (step_expr));
>
> init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
> + gimple_seq stmts = NULL;
> + enum vect_induction_op_type induction_type
> + = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
>
> - off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
> - fold_convert (TREE_TYPE (step_expr), niters),
> - step_expr);
> - if (POINTER_TYPE_P (type))
> - ni = fold_build_pointer_plus (init_expr, off);
> + if (induction_type == vect_step_op_add)
> + {
> + off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
> + fold_convert (TREE_TYPE (step_expr), niters),
> + step_expr);
> + if (POINTER_TYPE_P (type))
> + ni = fold_build_pointer_plus (init_expr, off);
> + else
> + ni = fold_build2 (PLUS_EXPR, type,
> + init_expr, fold_convert (type, off));
> + }
> + /* Don't bother call vect_peel_nonlinear_iv_init. */
> + else if (induction_type == vect_step_op_neg)
> + ni = init_expr;
> else
> - ni = fold_build2 (PLUS_EXPR, type,
> - init_expr, fold_convert (type, off));
> + ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
> + niters, step_expr,
> + induction_type);
>
> var = create_tmp_var (type, "tmp");
>
> @@ -1576,9 +1589,15 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
> ni_name = force_gimple_operand (ni, &new_stmts, false, var);
> /* Exit_bb shouldn't be empty. */
> if (!gsi_end_p (last_gsi))
> - gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
> + {
> + gsi_insert_seq_after (&last_gsi, stmts, GSI_SAME_STMT);
> + gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
> + }
> else
> - gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
> + {
> + gsi_insert_seq_before (&last_gsi, stmts, GSI_SAME_STMT);
> + gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
> + }
>
> /* Fix phi expressions in the successor bb. */
> adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 2257b29a652..c2293466c1c 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -425,6 +425,77 @@ vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
> return true;
> }
>
> +/* Function vect_is_nonlinear_iv_evolution
> +
> + Only support nonlinear induction for integer type
> + 1. neg
> + 2. mul by constant
> + 3. lshift/rshift by constant.
> +
> + For neg induction, return a fake step as integer -1. */
> +static bool
> +vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
> + gphi* loop_phi_node, tree *init, tree *step)
> +{
> + tree init_expr, ev_expr, result, op1, op2;
> + gimple* def;
> +
> + if (gimple_phi_num_args (loop_phi_node) != 2)
> + return false;
> +
> + init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
> + ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
> +
> + /* Support nonlinear induction only for integer type. */
> + if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
> + return false;
> +
> + *init = init_expr;
> + result = PHI_RESULT (loop_phi_node);
> +
> + if (TREE_CODE (ev_expr) != SSA_NAME
> + || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
> + || !is_gimple_assign (def))
> + return false;
> +
> + enum tree_code t_code = gimple_assign_rhs_code (def);
> + switch (t_code)
> + {
> + case NEGATE_EXPR:
> + if (gimple_assign_rhs1 (def) != result)
> + return false;
> + *step = build_int_cst (TREE_TYPE (init_expr), -1);
> + STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
> + break;
> +
> + case RSHIFT_EXPR:
> + case LSHIFT_EXPR:
> + case MULT_EXPR:
> + op1 = gimple_assign_rhs1 (def);
> + op2 = gimple_assign_rhs2 (def);
> + if (TREE_CODE (op2) != INTEGER_CST
> + || op1 != result)
> + return false;
> + *step = op2;
> + if (t_code == LSHIFT_EXPR)
> + STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
> + else if (t_code == RSHIFT_EXPR)
> + STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
> + /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
> + else
> + STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
> + break;
> +
> + default:
> + return false;
> + }
> +
> + STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
> + STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
> +
> + return true;
> +}
> +
> /* Return true if PHI, described by STMT_INFO, is the inner PHI in
> what we are assuming is a double reduction. For example, given
> a structure like this:
> @@ -512,11 +583,16 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
> = evolution_part_in_loop_num (access_fn, loop->num);
> }
>
> - if (!access_fn
> - || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
> - || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
> - || (LOOP_VINFO_LOOP (loop_vinfo) != loop
> - && TREE_CODE (step) != INTEGER_CST))
> + if ((!access_fn
> + || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
> + || !vect_is_simple_iv_evolution (loop->num, access_fn,
> + &init, &step)
> + || (LOOP_VINFO_LOOP (loop_vinfo) != loop
> + && TREE_CODE (step) != INTEGER_CST))
> + /* Only handle nonlinear iv for same loop. */
> + && (LOOP_VINFO_LOOP (loop_vinfo) != loop
> + || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
> + phi, &init, &step)))
> {
> worklist.safe_push (stmt_vinfo);
> continue;
> @@ -8229,6 +8305,591 @@ vect_can_vectorize_without_simd_p (code_helper code)
> && vect_can_vectorize_without_simd_p (tree_code (code)));
> }
>
> +/* Create vector init for vectorized iv. */
> +static tree
> +vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
> + tree step_expr, poly_uint64 nunits,
> + tree vectype,
> + enum vect_induction_op_type induction_type)
> +{
> + unsigned HOST_WIDE_INT const_nunits;
> + tree vec_shift, vec_init, new_name;
> + unsigned i;
> + tree itype = TREE_TYPE (vectype);
> +
> + /* iv_loop is the loop to be vectorized. Create:
> + vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
> + new_name = gimple_convert (stmts, itype, init_expr);
> + switch (induction_type)
> + {
> + case vect_step_op_shr:
> + case vect_step_op_shl:
> + /* Build the Initial value from shift_expr. */
> + vec_init = gimple_build_vector_from_val (stmts,
> + vectype,
> + new_name);
> + vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
> + build_zero_cst (itype), step_expr);
> + vec_init = gimple_build (stmts,
> + (induction_type == vect_step_op_shr
> + ? RSHIFT_EXPR : LSHIFT_EXPR),
> + vectype, vec_init, vec_shift);
> + break;
> +
> + case vect_step_op_neg:
> + {
> + vec_init = gimple_build_vector_from_val (stmts,
> + vectype,
> + new_name);
> + tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
> + vectype, vec_init);
> + /* The encoding has 2 interleaved stepped patterns. */
> + vec_perm_builder sel (nunits, 2, 3);
> + sel.quick_grow (6);
> + for (i = 0; i < 3; i++)
> + {
> + sel[2 * i] = i;
> + sel[2 * i + 1] = i + nunits;
> + }
> + vec_perm_indices indices (sel, 2, nunits);
> + tree perm_mask_even
> + = vect_gen_perm_mask_checked (vectype, indices);
> + vec_init = gimple_build (stmts, VEC_PERM_EXPR,
> + vectype,
> + vec_init, vec_neg,
> + perm_mask_even);
> + }
> + break;
> +
> + case vect_step_op_mul:
> + {
> + /* Use unsigned mult to avoid UD integer overflow. */
> + gcc_assert (nunits.is_constant (&const_nunits));
> + tree utype = unsigned_type_for (itype);
> + tree uvectype = build_vector_type (utype,
> + TYPE_VECTOR_SUBPARTS (vectype));
> + new_name = gimple_convert (stmts, utype, new_name);
> + vec_init = gimple_build_vector_from_val (stmts,
> + uvectype,
> + new_name);
> + tree_vector_builder elts (uvectype, const_nunits, 1);
> + tree elt_step = build_one_cst (utype);
> +
> + elts.quick_push (elt_step);
> + for (i = 1; i < const_nunits; i++)
> + {
> > + /* Create: elt_step = pow (step_expr, i). */
> + elt_step = gimple_build (stmts, MULT_EXPR,
> + utype, elt_step, step_expr);
> + elts.quick_push (elt_step);
> + }
> > + /* Create the vector [1, step, step^2, ..., step^(nunits-1)] and
> > + multiply it by the splatted initial value. */
> + tree vec_mul = gimple_build_vector (stmts, &elts);
> + vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
> + vec_init, vec_mul);
> + vec_init = gimple_convert (stmts, vectype, vec_init);
> + }
> + break;
> +
> + default:
> + gcc_unreachable ();
> + }
> +
> + return vec_init;
> +}
> +
> +/* Peel init_expr by skip_niter for induction_type. */
> +tree
> +vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
> + tree skip_niters, tree step_expr,
> + enum vect_induction_op_type induction_type)
> +{
> + gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
> + tree type = TREE_TYPE (init_expr);
> + unsigned prec = TYPE_PRECISION (type);
> + switch (induction_type)
> + {
> + case vect_step_op_neg:
> + if (TREE_INT_CST_LOW (skip_niters) % 2)
> + init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
> + /* else no change. */
> + break;
> +
> + case vect_step_op_shr:
> + case vect_step_op_shl:
> + skip_niters = gimple_convert (stmts, type, skip_niters);
> + step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
> + /* When shift mount >= precision, need to avoid UD.
> + In the original loop, there's no UD, and according to semantic,
> + init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr. */
> + if (!tree_fits_uhwi_p (step_expr)
> + || tree_to_uhwi (step_expr) >= prec)
> + {
> + if (induction_type == vect_step_op_shl
> + || TYPE_UNSIGNED (type))
> + init_expr = build_zero_cst (type);
> + else
> + init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
> + init_expr,
> + wide_int_to_tree (type, prec - 1));
> + }
> + else
> + init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
> + ? RSHIFT_EXPR : LSHIFT_EXPR),
> + type, init_expr, step_expr);
> + break;
> +
> + case vect_step_op_mul:
> + {
> + tree utype = unsigned_type_for (type);
> + init_expr = gimple_convert (stmts, utype, init_expr);
> + unsigned skipn = TREE_INT_CST_LOW (skip_niters);
> + wide_int begin = wi::to_wide (step_expr);
> + for (unsigned i = 0; i != skipn - 1; i++)
> + begin = wi::mul (begin, wi::to_wide (step_expr));
> + tree mult_expr = wide_int_to_tree (utype, begin);
> + init_expr = gimple_build (stmts, MULT_EXPR, utype, init_expr, mult_expr);
> + init_expr = gimple_convert (stmts, type, init_expr);
> + }
> + break;
> +
> + default:
> + gcc_unreachable ();
> + }
> +
> + return init_expr;
> +}
> +
> +/* Create vector step for vectorized iv. */
> +static tree
> +vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
> + poly_uint64 vf,
> + enum vect_induction_op_type induction_type)
> +{
> + tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
> + tree new_name = NULL;
> + /* Step should be pow (step, vf) for mult induction. */
> + if (induction_type == vect_step_op_mul)
> + {
> + gcc_assert (vf.is_constant ());
> + wide_int begin = wi::to_wide (step_expr);
> +
> + for (unsigned i = 0; i != vf.to_constant () - 1; i++)
> + begin = wi::mul (begin, wi::to_wide (step_expr));
> +
> + new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
> + }
> + else if (induction_type == vect_step_op_neg)
> + /* Do nothing. */
> + ;
> + else
> + new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
> + expr, step_expr);
> + return new_name;
> +}
> +
> +static tree
> +vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
> + stmt_vec_info stmt_info,
> + tree new_name, tree vectype,
> + enum vect_induction_op_type induction_type)
> +{
> + /* No step is needed for neg induction. */
> + if (induction_type == vect_step_op_neg)
> + return NULL;
> +
> + tree t = unshare_expr (new_name);
> + gcc_assert (CONSTANT_CLASS_P (new_name)
> + || TREE_CODE (new_name) == SSA_NAME);
> + tree new_vec = build_vector_from_val (vectype, t);
> + tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
> + new_vec, vectype, NULL);
> + return vec_step;
> +}
> +
> +/* Update vectorized iv with vect_step, induc_def is init. */
> +static tree
> +vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
> + tree induc_def, tree vec_step,
> + enum vect_induction_op_type induction_type)
> +{
> + tree vec_def = induc_def;
> + switch (induction_type)
> + {
> + case vect_step_op_mul:
> + {
> + /* Use unsigned mult to avoid UD integer overflow. */
> + tree uvectype
> + = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
> + TYPE_VECTOR_SUBPARTS (vectype));
> + vec_def = gimple_convert (stmts, uvectype, vec_def);
> + vec_step = gimple_convert (stmts, uvectype, vec_step);
> + vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
> + vec_def, vec_step);
> + vec_def = gimple_convert (stmts, vectype, vec_def);
> + }
> + break;
> +
> + case vect_step_op_shr:
> + vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
> + vec_def, vec_step);
> + break;
> +
> + case vect_step_op_shl:
> + vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
> + vec_def, vec_step);
> + break;
> + case vect_step_op_neg:
> + vec_def = induc_def;
> + /* Do nothing. */
> + break;
> + default:
> + gcc_unreachable ();
> + }
> +
> + return vec_def;
> +
> +}
> > +/* Function vectorizable_nonlinear_induction
> > +
> > + Check if STMT_INFO performs a nonlinear induction computation that can be
> + vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
> + a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
> + basic block.
> + Return true if STMT_INFO is vectorizable in this way. */
> +
> +static bool
> +vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
> + stmt_vec_info stmt_info,
> + gimple **vec_stmt, slp_tree slp_node,
> + stmt_vector_for_cost *cost_vec)
> +{
> + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> + unsigned ncopies;
> + bool nested_in_vect_loop = false;
> + class loop *iv_loop;
> + tree vec_def;
> + edge pe = loop_preheader_edge (loop);
> + basic_block new_bb;
> + tree vec_init, vec_step;
> + tree new_name;
> + gimple *new_stmt;
> + gphi *induction_phi;
> + tree induc_def, vec_dest;
> + tree init_expr, step_expr;
> + tree niters_skip;
> + poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> + unsigned i;
> + gimple_stmt_iterator si;
> +
> + gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
> +
> + tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> + poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> + enum vect_induction_op_type induction_type
> + = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
> +
> + gcc_assert (induction_type > vect_step_op_add);
> +
> + if (slp_node)
> + ncopies = 1;
> + else
> + ncopies = vect_get_num_copies (loop_vinfo, vectype);
> + gcc_assert (ncopies >= 1);
> +
> + /* FORNOW. Only handle nonlinear induction in the same loop. */
> + if (nested_in_vect_loop_p (loop, stmt_info))
> + {
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> + "nonlinear induction in nested loop.\n");
> + return false;
> + }
> +
> + iv_loop = loop;
> + gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
> +
> + /* TODO: Support slp for nonlinear iv. There should be separate vector iv
> + update for each iv and a permutation to generate wanted vector iv. */
> + if (slp_node)
> + {
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> + "SLP induction not supported for nonlinear"
> + " induction.\n");
> + return false;
> + }
> +
> + /* Init_expr will be update by vect_update_ivs_after_vectorizer,
> + if niters is unkown:
> + For shift, when shift mount >= precision, there would be UD.
> + For mult, don't known how to generate
> + init_expr * pow (step, niters) for variable niters.
> + For neg, it should be ok, since niters of vectorized main loop
> + will always be multiple of 2. */
> + if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> + && induction_type != vect_step_op_neg)
> + {
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> + "Peeling for epilogue is not supported"
> + " for nonlinear induction except neg"
> + " when iteration count is unknown.\n");
> + return false;
> + }
> +
> + /* Also doens't support peel for neg when niter is variable.
> + ??? generate something like niter_expr & 1 ? init_expr : -init_expr? */
> + niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
> + if (niters_skip != NULL_TREE
> + && TREE_CODE (niters_skip) != INTEGER_CST)
> + {
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> + "Peeling for alignement is not supported"
> + " for nonlinear induction when niters_skip"
> + " is not constant.\n");
> + return false;
> + }
> +
> + if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> + && induction_type == vect_step_op_mul)
> + if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
> + {
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> + "floating point nonlinear induction vectorization"
> + " not supported.\n");
> + return false;
> + }
> +
> + step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
> + init_expr = vect_phi_initial_value (phi);
> + gcc_assert (step_expr != NULL_TREE && init_expr != NULL
> + && TREE_CODE (step_expr) == INTEGER_CST);
> + /* step_expr should be aligned with init_expr,
> + .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used. */
> + step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
> +
> + if (TREE_CODE (init_expr) == INTEGER_CST)
> + init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
> + else
> + gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
> + TREE_TYPE (init_expr)));
> +
> + switch (induction_type)
> + {
> + case vect_step_op_neg:
> + if (TREE_CODE (init_expr) != INTEGER_CST
> + && TREE_CODE (init_expr) != REAL_CST)
> + {
> + /* Check for backend support of NEGATE_EXPR and vec_perm. */
> + if (!directly_supported_p (NEGATE_EXPR, vectype))
> + return false;
> +
> + /* The encoding has 2 interleaved stepped patterns. */
> + vec_perm_builder sel (nunits, 2, 3);
> + machine_mode mode = TYPE_MODE (vectype);
> + sel.quick_grow (6);
> + for (i = 0; i < 3; i++)
> + {
> + sel[i * 2] = i;
> + sel[i * 2 + 1] = i + nunits;
> + }
> + vec_perm_indices indices (sel, 2, nunits);
> + if (!can_vec_perm_const_p (mode, mode, indices))
> + return false;
> + }
> + break;
> +
> + case vect_step_op_mul:
> + {
> + /* Check for backend support of MULT_EXPR. */
> + if (!directly_supported_p (MULT_EXPR, vectype))
> + return false;
> +
> > + /* ??? How to construct the vector step for variable-length vectors:
> > + [ 1, step, pow (step, 2), pow (step, 3), .. ]. */
> + if (!vf.is_constant ())
> + return false;
> + }
> + break;
> +
> + case vect_step_op_shr:
> + /* Check for backend support of RSHIFT_EXPR. */
> + if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
> + return false;
> +
> + /* Don't shift more than type precision to avoid UD. */
> + if (!tree_fits_uhwi_p (step_expr)
> + || maybe_ge (nunits * tree_to_uhwi (step_expr),
> + TYPE_PRECISION (TREE_TYPE (init_expr))))
> + return false;
> + break;
> +
> + case vect_step_op_shl:
> > + /* Check for backend support of LSHIFT_EXPR. */
> + if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
> + return false;
> +
> + /* Don't shift more than type precision to avoid UD. */
> + if (!tree_fits_uhwi_p (step_expr)
> + || maybe_ge (nunits * tree_to_uhwi (step_expr),
> + TYPE_PRECISION (TREE_TYPE (init_expr))))
> + return false;
> +
> + break;
> +
> + default:
> + gcc_unreachable ();
> + }
> +
> + if (!vec_stmt) /* transformation not required. */
> + {
> + unsigned inside_cost = 0, prologue_cost = 0;
> + /* loop cost for vec_loop. Neg induction doesn't have any
> + inside_cost. */
> + inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
> + stmt_info, 0, vect_body);
> +
> + /* loop cost for vec_loop. Neg induction doesn't have any
> + inside_cost. */
> + if (induction_type == vect_step_op_neg)
> + inside_cost = 0;
> +
> + /* prologue cost for vec_init and vec_step. */
> + prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
> + stmt_info, 0, vect_prologue);
> +
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_NOTE, vect_location,
> + "vect_model_induction_cost: inside_cost = %d, "
> + "prologue_cost = %d. \n", inside_cost,
> + prologue_cost);
> +
> + STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
> + DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
> + return true;
> + }
> +
> + /* Transform. */
> +
> + /* Compute a vector variable, initialized with the first VF values of
> + the induction variable. E.g., for an iv with IV_PHI='X' and
> + evolution S, for a vector of 4 units, we want to compute:
> + [X, X + S, X + 2*S, X + 3*S]. */
> +
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
> +
> + pe = loop_preheader_edge (iv_loop);
> + /* Find the first insertion point in the BB. */
> + basic_block bb = gimple_bb (phi);
> + si = gsi_after_labels (bb);
> +
> + gimple_seq stmts = NULL;
> +
> + /* If we are using the loop mask to "peel" for alignment then we need
> + to adjust the start value here. */
> + if (niters_skip != NULL_TREE)
> + init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
> + step_expr, induction_type);
> +
> + vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
> + step_expr, nunits, vectype,
> + induction_type);
> + if (stmts)
> + {
> + new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
> + gcc_assert (!new_bb);
> + }
> +
> + stmts = NULL;
> + new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
> + vf, induction_type);
> + if (stmts)
> + {
> + new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
> + gcc_assert (!new_bb);
> + }
> +
> + vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
> + new_name, vectype,
> + induction_type);
> + /* Create the following def-use cycle:
> + loop prolog:
> + vec_init = ...
> + vec_step = ...
> + loop:
> + vec_iv = PHI <vec_init, vec_loop>
> + ...
> + STMT
> + ...
> + vec_loop = vec_iv + vec_step; */
> +
> + /* Create the induction-phi that defines the induction-operand. */
> + vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
> + induction_phi = create_phi_node (vec_dest, iv_loop->header);
> + induc_def = PHI_RESULT (induction_phi);
> +
> + /* Create the iv update inside the loop. */
> + stmts = NULL;
> + vec_def = vect_update_nonlinear_iv (&stmts, vectype,
> + induc_def, vec_step,
> + induction_type);
> +
> + gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
> + new_stmt = SSA_NAME_DEF_STMT (vec_def);
> +
> + /* Set the arguments of the phi node: */
> + add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
> + add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
> + UNKNOWN_LOCATION);
> +
> + STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
> + *vec_stmt = induction_phi;
> +
> + /* In case that vectorization factor (VF) is bigger than the number
> + of elements that we can fit in a vectype (nunits), we have to generate
> + more than one vector stmt - i.e - we need to "unroll" the
> + vector stmt by a factor VF/nunits. For more details see documentation
> + in vectorizable_operation. */
> +
> + if (ncopies > 1)
> + {
> + stmts = NULL;
> + /* FORNOW. This restriction should be relaxed. */
> + gcc_assert (!nested_in_vect_loop);
> +
> + new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
> + nunits, induction_type);
> +
> + vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
> + new_name, vectype,
> + induction_type);
> + vec_def = induc_def;
> + for (i = 1; i < ncopies; i++)
> + {
> + /* vec_i = vec_prev + vec_step. */
> + stmts = NULL;
> + vec_def = vect_update_nonlinear_iv (&stmts, vectype,
> + vec_def, vec_step,
> + induction_type);
> + gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
> + new_stmt = SSA_NAME_DEF_STMT (vec_def);
> + STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> + }
> + }
> +
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_NOTE, vect_location,
> + "transform induction: created def-use cycle: %G%G",
> + induction_phi, SSA_NAME_DEF_STMT (vec_def));
> +
> + return true;
> +}
> +
> /* Function vectorizable_induction
>
> Check if STMT_INFO performs an induction computation that can be vectorized.
> @@ -8259,6 +8920,8 @@ vectorizable_induction (loop_vec_info loop_vinfo,
> unsigned i;
> tree expr;
> gimple_stmt_iterator si;
> + enum vect_induction_op_type induction_type
> + = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
>
> gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
> if (!phi)
> @@ -8271,6 +8934,11 @@ vectorizable_induction (loop_vec_info loop_vinfo,
> if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
> return false;
>
> + /* Handle nonlinear induction in a separate place. */
> + if (induction_type != vect_step_op_add)
> + return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
> + vec_stmt, slp_node, cost_vec);
> +
> tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
>
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index e5fdc9e0a14..1fa08126ad8 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -68,6 +68,15 @@ enum vect_def_type {
> vect_unknown_def_type
> };
>
> +/* Define operation type of linear/non-linear induction variable. */
> +enum vect_induction_op_type {
> + vect_step_op_add = 0,
> + vect_step_op_neg,
> + vect_step_op_mul,
> + vect_step_op_shl,
> + vect_step_op_shr
> +};
> +
> /* Define type of reduction. */
> enum vect_reduction_type {
> TREE_CODE_REDUCTION,
> @@ -1188,6 +1197,7 @@ public:
> the version here. */
> tree loop_phi_evolution_base_unchanged;
> tree loop_phi_evolution_part;
> + enum vect_induction_op_type loop_phi_evolution_type;
>
> /* Used for various bookkeeping purposes, generally holding a pointer to
> some other stmt S that is in some way "related" to this stmt.
> @@ -1421,6 +1431,7 @@ struct gather_scatter_info {
> ((S)->dr_aux.dr && DR_GROUP_FIRST_ELEMENT(S))
> #define STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED(S) (S)->loop_phi_evolution_base_unchanged
> #define STMT_VINFO_LOOP_PHI_EVOLUTION_PART(S) (S)->loop_phi_evolution_part
> +#define STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE(S) (S)->loop_phi_evolution_type
> #define STMT_VINFO_MIN_NEG_DIST(S) (S)->min_neg_dist
> #define STMT_VINFO_REDUC_TYPE(S) (S)->reduc_type
> #define STMT_VINFO_REDUC_CODE(S) (S)->reduc_code
> @@ -2327,6 +2338,10 @@ extern int vect_get_known_peeling_cost (loop_vec_info, int, int *,
> stmt_vector_for_cost *);
> extern tree cse_and_gimplify_to_preheader (loop_vec_info, tree);
>
> +/* Nonlinear induction. */
> +extern tree vect_peel_nonlinear_iv_init (gimple_seq*, tree, tree,
> + tree, enum vect_induction_op_type);
> +
> /* In tree-vect-slp.cc. */
> extern void vect_slp_init (void);
> extern void vect_slp_fini (void);
> --
> 2.18.1
>
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2022-09-05 10:58 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-08-04 4:28 [RFC: PATCH] Extend vectorizer to handle nonlinear induction for neg, mul/lshift/rshift with a constant liuhongt
2022-08-04 8:18 ` Richard Biener
2022-08-04 10:09 ` Richard Sandiford
2022-08-05 5:21 ` Hongtao Liu
2022-08-29 5:24 ` [PATCH V2] " liuhongt
2022-08-29 5:24 ` liuhongt
2022-09-05 10:58 ` Richard Biener