When transforming multiple lane-reducing operations in a loop reduction
chain, the code so far generates the corresponding vectorized statements
into def-use cycles starting from cycle 0.  As a consequence, a def-use
cycle with a smaller index contains more statements, which means longer
instruction dependency chains.  For example:

  int sum = 0;
  for (i)
    {
      sum += d0[i] * d1[i];      // dot-prod
      sum += w[i];               // widen-sum
      sum += abs(s0[i] - s1[i]); // sad
    }

Original transformation result:

  for (i / 16)
    {
      sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
      sum_v1 = sum_v1;  // copy
      sum_v2 = sum_v2;  // copy
      sum_v3 = sum_v3;  // copy

      sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0);
      sum_v1 = sum_v1;  // copy
      sum_v2 = sum_v2;  // copy
      sum_v3 = sum_v3;  // copy

      sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
      sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
      sum_v2 = sum_v2;  // copy
      sum_v3 = sum_v3;  // copy
    }

To get higher instruction parallelism in the final vectorized loop, it is
better to distribute the effective vectorized lane-reducing statements
evenly among all def-use cycles.  In the transformation below, DOT_PROD,
WIDEN_SUM and the SADs are generated into separate cycles, so the
instruction dependencies among them are eliminated:

  for (i / 16)
    {
      sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
      sum_v1 = sum_v1;  // copy
      sum_v2 = sum_v2;  // copy
      sum_v3 = sum_v3;  // copy

      sum_v0 = sum_v0;  // copy
      sum_v1 = WIDEN_SUM (w_v1[i: 0 ~ 15], sum_v1);
      sum_v2 = sum_v2;  // copy
      sum_v3 = sum_v3;  // copy

      sum_v0 = sum_v0;  // copy
      sum_v1 = sum_v1;  // copy
      sum_v2 = SAD (s0_v2[i: 0 ~ 7 ], s1_v2[i: 0 ~ 7 ], sum_v2);
      sum_v3 = SAD (s0_v3[i: 8 ~ 15], s1_v3[i: 8 ~ 15], sum_v3);
    }
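To make the placement policy concrete, below is a standalone sketch (not
part of the patch; the statement names and copy counts are just the ones
from the example above).  Each lane-reducing statement occupies
stmt_ncopies consecutive def-use cycles starting at a rotating position,
which is what the new reduc_result_pos field tracks.  The sketch wraps the
position with a modulo for simplicity, whereas the patch clamps the
operand block so it does not run past the last cycle:

  /* Standalone illustration of the round-robin placement policy;
     not GCC code.  */
  #include <cstdio>

  int main ()
  {
    const int ncopies = 4;	/* def-use cycles of the reduction PHI  */
    const char *names[] = { "DOT_PROD", "WIDEN_SUM", "SAD" };
    const int stmt_ncopies[] = { 1, 1, 2 };	/* effective copies each  */
    int pos = 0;		/* plays the role of reduc_result_pos  */

    for (int s = 0; s < 3; s++)
      {
	for (int c = 0; c < stmt_ncopies[s]; c++)
	  printf ("%s copy %d -> cycle sum_v%d\n", names[s], c,
		  (pos + c) % ncopies);
	/* Rotate the start position for the next lane-reducing
	   statement, as the patch does with reduc_result_pos.  */
	pos = (pos + stmt_ncopies[s]) % ncopies;
      }
    return 0;
  }

Running it with ncopies = 4 prints DOT_PROD -> sum_v0, WIDEN_SUM ->
sum_v1, and the two SAD copies -> sum_v2 and sum_v3, matching the
transformed loop above.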
Thanks,
Feng

---
gcc/
	PR tree-optimization/114440
	* tree-vectorizer.h (struct _stmt_vec_info): Add a new field
	reduc_result_pos.
	* tree-vect-loop.cc (vect_transform_reduction): Generate
	lane-reducing statements in an optimized order.
---
 gcc/tree-vect-loop.cc | 51 ++++++++++++++++++++++++++++++++++++++-----
 gcc/tree-vectorizer.h |  6 +++++
 2 files changed, 51 insertions(+), 6 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index b5849dbb08a..4807f529506 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -8703,7 +8703,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
     }
 
   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
-  gcc_assert (single_defuse_cycle || lane_reducing_op_p (code));
+  bool lane_reducing = lane_reducing_op_p (code);
+  gcc_assert (single_defuse_cycle || lane_reducing);
 
   /* Create the destination vector  */
   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
@@ -8720,6 +8721,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
     }
   else
     {
+      int result_pos = 0;
+
       /* The input vectype of the reduction PHI determines copies of
 	 vectorized def-use cycles, which might be more than effective copies
 	 of vectorized lane-reducing reduction statements.  This could be
@@ -8749,9 +8752,9 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
	     sum_v2 = sum_v2;  // copy
	     sum_v3 = sum_v3;  // copy
 
-	     sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
-	     sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
-	     sum_v2 = sum_v2;  // copy
+	     sum_v0 = sum_v0;  // copy
+	     sum_v1 = SAD (s0_v1[i: 0 ~ 7 ], s1_v1[i: 0 ~ 7 ], sum_v1);
+	     sum_v2 = SAD (s0_v2[i: 8 ~ 15], s1_v2[i: 8 ~ 15], sum_v2);
	     sum_v3 = sum_v3;  // copy
 
	     sum_v0 += n_v0[i: 0 ~ 3 ];
@@ -8759,7 +8762,20 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
	     sum_v2 += n_v2[i: 8 ~ 11];
	     sum_v3 += n_v3[i: 12 ~ 15];
	   }
-   */
+
+	 Moreover, for higher instruction parallelism in the final
+	 vectorized loop, the effective vectorized lane-reducing statements
+	 are distributed evenly among all def-use cycles.  In the above
+	 example, the SADs are generated into cycles other than that of
+	 the DOT_PROD.  */
+
+      if (stmt_ncopies < ncopies)
+	{
+	  gcc_assert (lane_reducing);
+	  result_pos = reduc_info->reduc_result_pos;
+	  reduc_info->reduc_result_pos = (result_pos + stmt_ncopies) % ncopies;
+	  gcc_assert (result_pos >= 0 && result_pos < ncopies);
+	}
 
       for (i = 0; i < MIN (3, (int) op.num_ops); i++)
	{
@@ -8792,7 +8808,30 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
						 op.ops[i], &vec_oprnds[i],
						 vectype);
 
	  if (used_ncopies < ncopies)
-	    vec_oprnds[i].safe_grow_cleared (ncopies);
+	    {
+	      vec_oprnds[i].safe_grow_cleared (ncopies);
+
+	      /* Find suitable def-use cycles to generate vectorized
+		 statements into, and reorder operands based on the
+		 selection.  */
+	      if (i != reduc_index && result_pos)
+		{
+		  int count = ncopies - used_ncopies;
+		  int start = result_pos - count;
+
+		  if (start < 0)
+		    {
+		      count = result_pos;
+		      start = 0;
+		    }
+
+		  for (int j = used_ncopies - 1; j >= start; j--)
+		    {
+		      std::swap (vec_oprnds[i][j], vec_oprnds[i][j + count]);
+		      gcc_assert (!vec_oprnds[i][j]);
+		    }
+		}
+	    }
	}
     }
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index ca810869592..d64729ac953 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1402,6 +1402,12 @@ public:
   /* The vector type for performing the actual reduction.  */
   tree reduc_vectype;
 
+  /* For a loop reduction with multiple vectorized results (ncopies > 1),
+     a lane-reducing operation participating in it may not use all of those
+     results.  This field specifies the result index at which the next
+     lane-reducing operation should be assigned.  */
+  int reduc_result_pos;
+
   /* If IS_REDUC_INFO is true and if the vector code is performing N scalar
      reductions in parallel, this variable gives the initial scalar values of
      those N reductions.  */
-- 
2.17.1