From ecde6210ff483ad899a2eeba91aa1b623c49173a Mon Sep 17 00:00:00 2001
From: Feng Xue
Date: Fri, 22 Mar 2024 19:57:45 +0800
Subject: [PATCH] vect: Support multiple lane-reducing operations for loop reduction [PR114440]

gcc/
	PR tree-optimization/114440
	* tree-vectorizer.h (struct _stmt_vec_info): Add a new field
	reduc_result_pos.
	(vectorizable_lane_reducing): New function declaration.
	* tree-vect-stmts.cc (vectorizable_condition): Treat the condition
	statement that is pointed to by the stmt_vec_info of a reduction PHI
	as the real "for_reduction" statement.
	(vect_analyze_stmt): Call new function vectorizable_lane_reducing
	to analyze lane-reducing operations.
	* tree-vect-loop.cc (vect_is_emulated_mixed_dot_prod): Remove
	parameter loop_vinfo.  Get input vectype from stmt_info instead of
	reduction PHI.
	(vect_model_reduction_cost): Remove cost computation code related to
	emulated_mixed_dot_prod.
	(vect_reduction_use_partial_vector): New function.
	(vectorizable_lane_reducing): New function.
	(vectorizable_reduction): Allow multiple lane-reducing operations in
	loop reduction.  Move some original lane-reducing related code to
	vectorizable_lane_reducing, and move partial vectorization checking
	code to vect_reduction_use_partial_vector.
	(vect_transform_reduction): Extend transformation to support reduction
	statements with mixed input vectypes.

gcc/testsuite/
	PR tree-optimization/114440
	* gcc.dg/vect/vect-reduc-chain-1.c: New test.
	* gcc.dg/vect/vect-reduc-chain-2.c: New test.
	* gcc.dg/vect/vect-reduc-chain-3.c: New test.
	* gcc.dg/vect/vect-reduc-dot-slp-1.c: New test.
	* gcc.dg/vect/vect-reduc-dot-slp-2.c: New test.
---
 .../gcc.dg/vect/vect-reduc-chain-1.c          |  62 ++
 .../gcc.dg/vect/vect-reduc-chain-2.c          |  77 ++
 .../gcc.dg/vect/vect-reduc-chain-3.c          |  66 ++
 .../gcc.dg/vect/vect-reduc-dot-slp-1.c        |  97 +++
 .../gcc.dg/vect/vect-reduc-dot-slp-2.c        |  81 +++
 gcc/tree-vect-loop.cc                         | 668 ++++++++++++------
 gcc/tree-vect-stmts.cc                        |  13 +-
 gcc/tree-vectorizer.h                         |   8 +
 8 files changed, 863 insertions(+), 209 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-2.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
new file mode 100644
index 00000000000..04bfc419dbd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
@@ -0,0 +1,62 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon } */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res,
+   SIGNEDNESS_2 char *restrict a,
+   SIGNEDNESS_2 char *restrict b,
+   SIGNEDNESS_2 char *restrict c,
+   SIGNEDNESS_2 char *restrict d,
+   SIGNEDNESS_1 int *restrict e)
+{
+  for (int i = 0; i < N; ++i)
+    {
+      res += a[i] * b[i];
+      res += c[i] * d[i];
+      res += e[i];
+    }
+  return res;
+}
+
+#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? 
-126 : 4) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 char a[N], b[N]; + SIGNEDNESS_2 char c[N], d[N]; + SIGNEDNESS_1 int e[N]; + int expected = 0x12345; + for (int i = 0; i < N; ++i) + { + a[i] = BASE + i * 5; + b[i] = BASE + OFFSET + i * 4; + c[i] = BASE + i * 2; + d[i] = BASE + OFFSET + i * 3; + e[i] = i; + asm volatile ("" ::: "memory"); + expected += a[i] * b[i]; + expected += c[i] * d[i]; + expected += e[i]; + } + if (f (0x12345, a, b, c, d, e) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 2 "vect" { target vect_sdot_qi } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c new file mode 100644 index 00000000000..6c803b80120 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c @@ -0,0 +1,77 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_dotprod_neon } */ + +#include "tree-vect.h" + +#define N 50 + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 unsigned +#define SIGNEDNESS_3 signed +#define SIGNEDNESS_4 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +fn (SIGNEDNESS_1 int res, + SIGNEDNESS_2 char *restrict a, + SIGNEDNESS_2 char *restrict b, + SIGNEDNESS_3 char *restrict c, + SIGNEDNESS_3 char *restrict d, + SIGNEDNESS_4 short *restrict e, + SIGNEDNESS_4 short *restrict f, + SIGNEDNESS_1 int *restrict g) +{ + for (int i = 0; i < N; ++i) + { + res += a[i] * b[i]; + res += i + 1; + res += c[i] * d[i]; + res += e[i] * f[i]; + res += g[i]; + } + return res; +} + +#define BASE2 ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4) +#define BASE3 ((SIGNEDNESS_3 int) -1 < 0 ? -126 : 4) +#define BASE4 ((SIGNEDNESS_4 int) -1 < 0 ? 
-1026 : 373) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 char a[N], b[N]; + SIGNEDNESS_3 char c[N], d[N]; + SIGNEDNESS_4 short e[N], f[N]; + SIGNEDNESS_1 int g[N]; + int expected = 0x12345; + for (int i = 0; i < N; ++i) + { + a[i] = BASE2 + i * 5; + b[i] = BASE2 + OFFSET + i * 4; + c[i] = BASE3 + i * 2; + d[i] = BASE3 + OFFSET + i * 3; + e[i] = BASE4 + i * 6; + f[i] = BASE4 + OFFSET + i * 5; + g[i] = i; + asm volatile ("" ::: "memory"); + expected += a[i] * b[i]; + expected += i + 1; + expected += c[i] * d[i]; + expected += e[i] * f[i]; + expected += g[i]; + } + if (fn (0x12345, a, b, c, d, e, f, g) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_sdot_qi } } } } */ +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_udot_qi } } } } */ +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_sdot_hi } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c new file mode 100644 index 00000000000..a41e4b176c4 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c @@ -0,0 +1,66 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ + +#include "tree-vect.h" + +#define N 50 + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 unsigned +#define SIGNEDNESS_3 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, + SIGNEDNESS_2 char *restrict a, + SIGNEDNESS_2 char *restrict b, + SIGNEDNESS_3 short *restrict c, + SIGNEDNESS_3 short *restrict d, + SIGNEDNESS_1 int *restrict e) +{ + for (int i = 0; i < N; ++i) + { + short diff = a[i] - b[i]; + SIGNEDNESS_2 short abs = diff < 0 ? -diff : diff; + res += abs; + res += c[i] * d[i]; + res += e[i]; + } + return res; +} + +#define BASE2 ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4) +#define BASE3 ((SIGNEDNESS_3 int) -1 < 0 ? -1236 : 373) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 char a[N], b[N]; + SIGNEDNESS_3 short c[N], d[N]; + SIGNEDNESS_1 int e[N]; + int expected = 0x12345; + for (int i = 0; i < N; ++i) + { + a[i] = BASE2 + i * 5; + b[i] = BASE2 - i * 4; + c[i] = BASE3 + i * 2; + d[i] = BASE3 + OFFSET + i * 3; + e[i] = i; + asm volatile ("" ::: "memory"); + short diff = a[i] - b[i]; + SIGNEDNESS_2 short abs = diff < 0 ? -diff : diff; + expected += abs; + expected += c[i] * d[i]; + expected += e[i]; + } + if (f (0x12345, a, b, c, d, e) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = SAD_EXPR" "vect" { target vect_udot_qi } } } */ +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target vect_sdot_hi } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c new file mode 100644 index 00000000000..51ef4eaaed8 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c @@ -0,0 +1,97 @@ +/* Disabling epilogues until we find a better way to deal with scans. 
*/ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_dotprod_neon } */ + +#include "tree-vect.h" + +#define N 50 + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, + SIGNEDNESS_2 char *a, + SIGNEDNESS_2 char *b, + int step, int n) +{ + for (int i = 0; i < n; i++) + { + res += a[0] * b[0]; + res += a[1] * b[1]; + res += a[2] * b[2]; + res += a[3] * b[3]; + res += a[4] * b[4]; + res += a[5] * b[5]; + res += a[6] * b[6]; + res += a[7] * b[7]; + res += a[8] * b[8]; + res += a[9] * b[9]; + res += a[10] * b[10]; + res += a[11] * b[11]; + res += a[12] * b[12]; + res += a[13] * b[13]; + res += a[14] * b[14]; + res += a[15] * b[15]; + + a += step; + b += step; + } + + return res; +} + +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 char a[100], b[100]; + int expected = 0x12345; + int step = 16; + int n = 2; + int t = 0; + + for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i) + { + a[i] = BASE + i * 5; + b[i] = BASE + OFFSET + i * 4; + asm volatile ("" ::: "memory"); + } + + for (int i = 0; i < n; i++) + { + asm volatile ("" ::: "memory"); + expected += a[t + 0] * b[t + 0]; + expected += a[t + 1] * b[t + 1]; + expected += a[t + 2] * b[t + 2]; + expected += a[t + 3] * b[t + 3]; + expected += a[t + 4] * b[t + 4]; + expected += a[t + 5] * b[t + 5]; + expected += a[t + 6] * b[t + 6]; + expected += a[t + 7] * b[t + 7]; + expected += a[t + 8] * b[t + 8]; + expected += a[t + 9] * b[t + 9]; + expected += a[t + 10] * b[t + 10]; + expected += a[t + 11] * b[t + 11]; + expected += a[t + 12] * b[t + 12]; + expected += a[t + 13] * b[t + 13]; + expected += a[t + 14] * b[t + 14]; + expected += a[t + 15] * b[t + 15]; + t += step; + } + + if (f (0x12345, a, b, step, n) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 16 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-2.c new file mode 100644 index 00000000000..1532833c3ae --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-2.c @@ -0,0 +1,81 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_dotprod_neon } */ + +#include "tree-vect.h" + +#define N 50 + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, + SIGNEDNESS_2 short *a, + SIGNEDNESS_2 short *b, + int step, int n) +{ + for (int i = 0; i < n; i++) + { + res += a[0] * b[0]; + res += a[1] * b[1]; + res += a[2] * b[2]; + res += a[3] * b[3]; + res += a[4] * b[4]; + res += a[5] * b[5]; + res += a[6] * b[6]; + res += a[7] * b[7]; + + a += step; + b += step; + } + + return res; +} + +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? 
-1026 : 373) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 short a[100], b[100]; + int expected = 0x12345; + int step = 8; + int n = 2; + int t = 0; + + for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i) + { + a[i] = BASE + i * 5; + b[i] = BASE + OFFSET + i * 4; + asm volatile ("" ::: "memory"); + } + + for (int i = 0; i < n; i++) + { + asm volatile ("" ::: "memory"); + expected += a[t + 0] * b[t + 0]; + expected += a[t + 1] * b[t + 1]; + expected += a[t + 2] * b[t + 2]; + expected += a[t + 3] * b[t + 3]; + expected += a[t + 4] * b[t + 4]; + expected += a[t + 5] * b[t + 5]; + expected += a[t + 6] * b[t + 6]; + expected += a[t + 7] * b[t + 7]; + t += step; + } + + if (f (0x12345, a, b, step, n) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 8 "vect" { target vect_sdot_hi } } } */ diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 984636edbc5..5a3339b6594 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -5269,8 +5269,7 @@ have_whole_vector_shift (machine_mode mode) See vect_emulate_mixed_dot_prod for the actual sequence used. */ static bool -vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo, - stmt_vec_info stmt_info) +vect_is_emulated_mixed_dot_prod (stmt_vec_info stmt_info) { gassign *assign = dyn_cast (stmt_info->stmt); if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR) @@ -5281,10 +5280,9 @@ vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo, if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2))) return false; - stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info); - gcc_assert (reduc_info->is_reduc_info); + gcc_assert (STMT_VINFO_REDUC_VECTYPE_IN (stmt_info)); return !directly_supported_p (DOT_PROD_EXPR, - STMT_VINFO_REDUC_VECTYPE_IN (reduc_info), + STMT_VINFO_REDUC_VECTYPE_IN (stmt_info), optab_vector_mixed_sign); } @@ -5323,8 +5321,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, if (!gimple_extract_op (orig_stmt_info->stmt, &op)) gcc_unreachable (); - bool emulated_mixed_dot_prod - = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info); if (reduction_type == EXTRACT_LAST_REDUCTION) /* No extra instructions are needed in the prologue. The loop body operations are costed in vectorizable_condition. */ @@ -5359,12 +5355,8 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, initial result of the data reduction, initial value of the index reduction. */ prologue_stmts = 4; - else if (emulated_mixed_dot_prod) - /* We need the initial reduction value and two invariants: - one that contains the minimum signed value and one that - contains half of its negative. */ - prologue_stmts = 3; else + /* We need the initial reduction value. */ prologue_stmts = 1; prologue_cost += record_stmt_cost (cost_vec, prologue_stmts, scalar_to_vec, stmt_info, 0, @@ -7376,6 +7368,244 @@ build_vect_cond_expr (code_helper code, tree vop[3], tree mask, } } +/* Given an operation with CODE in loop reduction path whose reduction PHI is + specified by REDUC_INFO, the operation has TYPE of scalar result, and its + input vectype is represented by VECTYPE_IN. The vectype of vectorized result + may be different from VECTYPE_IN, either in base type or vectype lanes, + lane-reducing operation is the case. 
This function checks whether it is possible,
+   and how, to perform partial vectorization on the operation in the context
+   of LOOP_VINFO.  */
+
+static void
+vect_reduction_use_partial_vector (loop_vec_info loop_vinfo,
+				   stmt_vec_info reduc_info,
+				   slp_tree slp_node, code_helper code,
+				   tree type, tree vectype_in)
+{
+  if (!LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+    return;
+
+  enum vect_reduction_type reduc_type = STMT_VINFO_REDUC_TYPE (reduc_info);
+  internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
+  internal_fn cond_fn = get_conditional_internal_fn (code, type);
+
+  if (reduc_type != FOLD_LEFT_REDUCTION
+      && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
+      && (cond_fn == IFN_LAST
+	  || !direct_internal_fn_supported_p (cond_fn, vectype_in,
+					      OPTIMIZE_FOR_SPEED)))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "can't operate on partial vectors because"
+			 " no conditional operation is available.\n");
+      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+    }
+  else if (reduc_type == FOLD_LEFT_REDUCTION
+	   && reduc_fn == IFN_LAST
+	   && !expand_vec_cond_expr_p (vectype_in, truth_type_for (vectype_in),
+				       SSA_NAME))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "can't operate on partial vectors because"
+			 " no conditional operation is available.\n");
+      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+    }
+  else if (reduc_type == FOLD_LEFT_REDUCTION
+	   && internal_fn_mask_index (reduc_fn) == -1
+	   && FLOAT_TYPE_P (vectype_in)
+	   && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "can't operate on partial vectors because"
+			 " signed zeros cannot be preserved.\n");
+      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+    }
+  else
+    {
+      internal_fn mask_reduc_fn
+	= get_masked_reduction_fn (reduc_fn, vectype_in);
+      vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+      vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
+      unsigned nvectors;
+
+      if (slp_node)
+	nvectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+      else
+	nvectors = vect_get_num_copies (loop_vinfo, vectype_in);
+
+      if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
+	vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_in, 1);
+      else
+	vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_in, NULL);
+    }
+}
+
+/* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
+   the context of LOOP_VINFO, and record the vector cost in COST_VEC.  There
+   are currently three such kinds of operations: dot-prod/widen-sum/sad
+   (sum-of-absolute-differences).
+
+   For a lane-reducing operation, the loop reduction path that it lies in
+   may contain a normal operation, or other lane-reducing operations of
+   different input type sizes, for example:
+
+     int sum = 0;
+     for (i)
+       {
+         ...
+         sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
+         sum += w[i];               // widen-sum <vector(16) char>
+         sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
+         sum += n[i];               // normal <vector(4) int>
+         ...
+       }
+
+   The vectorization factor is essentially determined by the operation whose
+   input vectype has the most lanes ("vector(16) char" in the example), while
+   we need to choose the input vectype with the least lanes ("vector(4) int"
+   in the example) for the reduction PHI statement.
*/
+
+bool
+vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
+			    slp_tree slp_node, stmt_vector_for_cost *cost_vec)
+{
+  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
+  if (!stmt)
+    return false;
+
+  enum tree_code code = gimple_assign_rhs_code (stmt);
+
+  if (code != DOT_PROD_EXPR && code != WIDEN_SUM_EXPR && code != SAD_EXPR)
+    return false;
+
+  tree type = TREE_TYPE (gimple_assign_lhs (stmt));
+
+  if (!INTEGRAL_TYPE_P (type) && !SCALAR_FLOAT_TYPE_P (type))
+    return false;
+
+  /* Do not try to vectorize bit-precision reductions.  */
+  if (!type_has_mode_precision_p (type))
+    return false;
+
+  tree vectype_in = NULL_TREE;
+
+  for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
+    {
+      stmt_vec_info def_stmt_info;
+      slp_tree slp_op;
+      tree op;
+      tree vectype;
+      enum vect_def_type dt;
+
+      if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op,
+			       &slp_op, &dt, &vectype, &def_stmt_info))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "use not simple.\n");
+	  return false;
+	}
+
+      if (!vectype)
+	{
+	  vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
+						 slp_op);
+	  if (!vectype)
+	    return false;
+	}
+
+      if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, vectype))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "incompatible vector types for invariants\n");
+	  return false;
+	}
+
+      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
+	continue;
+
+      /* There should be at most one cycle def in the stmt.  */
+      if (VECTORIZABLE_CYCLE_DEF (dt))
+	return false;
+
+      /* To properly compute ncopies we are interested in the widest
+	 non-reduction input type in case we're looking at a widening
+	 accumulation that we later handle in vect transformation.  */
+      if (!vectype_in
+	  || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
+	      < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype)))))
+	vectype_in = vectype;
+    }
+
+  STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;
+
+  stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
+
+  /* TODO: Support lane-reducing operations that do not directly participate
+     in loop reduction.  */
+  if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0)
+    return false;
+
+  /* Lane-reducing patterns inside any inner loop of LOOP_VINFO are not
+     recognized.  */
+  gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def);
+  gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION);
+
+  tree vphi_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
+
+  /* To accommodate lane-reducing operations of mixed input vectypes, choose
+     the input vectype with the least lanes for the reduction PHI statement,
+     which would result in the most ncopies for vectorized reduction results.  */
+  if (!vphi_vectype_in
+      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
+	  > GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vphi_vectype_in)))))
+    STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
+
+  int ncopies_for_cost;
+
+  if (slp_node)
+    {
+      /* For now, lane-reducing operations in an SLP node should only come
+	 from the same loop reduction path.
*/
+      gcc_assert (REDUC_GROUP_FIRST_ELEMENT (stmt_info));
+      ncopies_for_cost = 1;
+    }
+  else
+    {
+      ncopies_for_cost = vect_get_num_copies (loop_vinfo, vectype_in);
+      gcc_assert (ncopies_for_cost >= 1);
+    }
+
+  if (vect_is_emulated_mixed_dot_prod (stmt_info))
+    {
+      /* We need two extra invariants: one that contains the minimum signed
+	 value and one that contains half of its negative.  */
+      int prologue_stmts = 2;
+      unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
+					scalar_to_vec, stmt_info, 0,
+					vect_prologue);
+      if (dump_enabled_p ())
+	dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
+		     "extra prologue_cost = %d .\n", cost);
+
+      /* Three dot-products and a subtraction.  */
+      ncopies_for_cost *= 4;
+    }
+
+  record_stmt_cost (cost_vec, ncopies_for_cost, vector_stmt, stmt_info, 0,
+		    vect_body);
+
+  vect_reduction_use_partial_vector (loop_vinfo, reduc_info, slp_node, code,
+				     type, vectype_in);
+
+  STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
+  return true;
+}
+
 /* Function vectorizable_reduction.
 
    Check if STMT_INFO performs a reduction operation that can be vectorized.
@@ -7441,7 +7671,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   bool single_defuse_cycle = false;
   bool nested_cycle = false;
   bool double_reduc = false;
-  int vec_num;
   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
   tree cond_reduc_val = NULL_TREE;
@@ -7522,6 +7751,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	  (gimple_bb (reduc_def_phi)->loop_father));
   unsigned reduc_chain_length = 0;
   bool only_slp_reduc_chain = true;
+  bool only_lane_reduc_code_p = true;
   stmt_info = NULL;
   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
   while (reduc_def != PHI_RESULT (reduc_def_phi))
@@ -7543,14 +7773,15 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	     all lanes here - even though we only will vectorize from
 	     the SLP node with live lane zero the other live lanes also
 	     need to be identified as part of a reduction to be able
-	     to skip code generation for them.  */
+	     to skip code generation for them.  For lane-reducing operations
+	     the vectorizable analysis needs the reduction PHI information.  */
 	  if (slp_for_stmt_info)
 	    {
 	      for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
 		if (STMT_VINFO_LIVE_P (s))
 		  STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
 	    }
-	  else if (STMT_VINFO_LIVE_P (vdef))
+	  else
 	    STMT_VINFO_REDUC_DEF (def) = phi_info;
 	  gimple_match_op op;
 	  if (!gimple_extract_op (vdef->stmt, &op))
@@ -7571,9 +7802,18 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 		  return false;
 		}
 	    }
-      else if (!stmt_info)
-	/* First non-conversion stmt.  */
-	stmt_info = vdef;
+      else
+	{
+	  /* First non-conversion stmt.  */
+	  if (!stmt_info)
+	    stmt_info = vdef;
+
+	  if (op.code != DOT_PROD_EXPR
+	      && op.code != WIDEN_SUM_EXPR
+	      && op.code != SAD_EXPR)
+	    only_lane_reduc_code_p = false;
+	}
+
       reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
       reduc_chain_length++;
       if (!stmt_info && slp_node)
@@ -7647,18 +7887,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   if (!type_has_mode_precision_p (op.type))
     return false;
 
-  /* For lane-reducing ops we're reducing the number of reduction PHIs
-     which means the only use of that may be in the lane-reducing operation.  */
-  if (lane_reduc_code_p
-      && reduc_chain_length != 1
-      && !only_slp_reduc_chain)
-    {
-      if (dump_enabled_p ())
-	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			 "lane-reducing reduction with extra stmts.\n");
-      return false;
-    }
-
   /* All uses but the last are expected to be defined in the loop.
The last use is the reduction variable.  In case of nested cycle this
     assumption is not true: we use reduc_index to record the index of the
@@ -7687,9 +7915,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 			     "use not simple.\n");
 	  return false;
 	}
-      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
-	continue;
-
       /* For an IFN_COND_OP we might hit the reduction definition operand
 	 twice (once as definition, once as else).  */
       if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
@@ -7735,12 +7960,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
     }
   if (!vectype_in)
     vectype_in = STMT_VINFO_VECTYPE (phi_info);
-  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
 
-  enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
-  STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
+  /* If there is a normal (non-lane-reducing) operation in the loop reduction
+     path, then to ensure there will be enough copies to hold vectorized
+     results of the operation, we need to set the input vectype of the
+     reduction PHI to be the same as the reduction output vectype somewhere;
+     here is a suitable place.  Otherwise the input vectype is set to the one
+     with the least lanes, which can only be determined in the vectorizable
+     analysis routine of a lane-reducing operation.  */
+  if (!only_lane_reduc_code_p)
+    STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = STMT_VINFO_VECTYPE (phi_info);
+
+  enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info);
+  STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
   /* If we have a condition reduction, see if we can simplify it further.  */
-  if (v_reduc_type == COND_REDUCTION)
+  if (reduction_type == COND_REDUCTION)
     {
       if (slp_node)
 	return false;
@@ -7906,8 +8140,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
     }
 
   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
+  reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
 
-  vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
   if (reduction_type == TREE_CODE_REDUCTION)
     {
       /* Check whether it's ok to change the order of the computation.
@@ -8181,14 +8415,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
       && loop_vinfo->suggested_unroll_factor == 1)
     single_defuse_cycle = true;
 
-  if (single_defuse_cycle || lane_reduc_code_p)
+  if (single_defuse_cycle && !lane_reduc_code_p)
     {
       gcc_assert (op.code != COND_EXPR);
 
-      /* 4. Supportable by target?  */
-      bool ok = true;
-
-      /* 4.1. check support for the operation in the loop
+      /* 4. check support for the operation in the loop
 
 	 This isn't necessary for the lane reduction codes, since they
 	 can only be produced by pattern matching, and it's up to the
@@ -8197,14 +8428,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	 mixed-sign dot-products can be implemented using signed
 	 dot-products.  */
       machine_mode vec_mode = TYPE_MODE (vectype_in);
-      if (!lane_reduc_code_p
-	  && !directly_supported_p (op.code, vectype_in, optab_vector))
+      if (!directly_supported_p (op.code, vectype_in, optab_vector))
 	{
 	  if (dump_enabled_p ())
 	    dump_printf (MSG_NOTE, "op not supported by target.\n");
 	  if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
 	      || !vect_can_vectorize_without_simd_p (op.code))
-	    ok = false;
+	    single_defuse_cycle = false;
 	  else if (dump_enabled_p ())
 	    dump_printf (MSG_NOTE, "proceeding using word mode.\n");
@@ -8217,35 +8447,12 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	  dump_printf (MSG_NOTE, "using word mode not possible.\n");
 	  return false;
 	}
-
-      /* lane-reducing operations have to go through vect_transform_reduction.
- For the other cases try without the single cycle optimization. */ - if (!ok) - { - if (lane_reduc_code_p) - return false; - else - single_defuse_cycle = false; - } } STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle; - /* If the reduction stmt is one of the patterns that have lane - reduction embedded we cannot handle the case of ! single_defuse_cycle. */ - if ((ncopies > 1 && ! single_defuse_cycle) - && lane_reduc_code_p) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "multi def-use cycle not possible for lane-reducing " - "reduction operation\n"); - return false; - } - - if (slp_node - && !(!single_defuse_cycle - && !lane_reduc_code_p - && reduction_type != FOLD_LEFT_REDUCTION)) + /* Reduction type of lane-reducing operation is TREE_CODE_REDUCTION, the + below processing will be done in its own vectorizable function. */ + if (slp_node && reduction_type == FOLD_LEFT_REDUCTION) for (i = 0; i < (int) op.num_ops; i++) if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i])) { @@ -8255,36 +8462,24 @@ vectorizable_reduction (loop_vec_info loop_vinfo, return false; } - if (slp_node) - vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); - else - vec_num = 1; - vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn, reduction_type, ncopies, cost_vec); /* Cost the reduction op inside the loop if transformed via - vect_transform_reduction. Otherwise this is costed by the - separate vectorizable_* routines. */ - if (single_defuse_cycle || lane_reduc_code_p) - { - int factor = 1; - if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info)) - /* Three dot-products and a subtraction. */ - factor = 4; - record_stmt_cost (cost_vec, ncopies * factor, vector_stmt, - stmt_info, 0, vect_body); - } + vect_transform_reduction for non-lane-reducing operation. Otherwise + this is costed by the separate vectorizable_* routines. */ + if (single_defuse_cycle && !lane_reduc_code_p) + record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body); if (dump_enabled_p () && reduction_type == FOLD_LEFT_REDUCTION) dump_printf_loc (MSG_NOTE, vect_location, "using an in-order (fold-left) reduction.\n"); STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type; - /* All but single defuse-cycle optimized, lane-reducing and fold-left - reductions go through their own vectorizable_* routines. */ - if (!single_defuse_cycle - && !lane_reduc_code_p - && reduction_type != FOLD_LEFT_REDUCTION) + + /* All but single defuse-cycle optimized and fold-left reductions go + through their own vectorizable_* routines. 
*/ + if ((!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION) + || lane_reduc_code_p) { stmt_vec_info tem = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info)); @@ -8296,60 +8491,10 @@ vectorizable_reduction (loop_vec_info loop_vinfo, STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def; STMT_VINFO_DEF_TYPE (tem) = vect_internal_def; } - else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) - { - vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); - vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo); - internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type); - - if (reduction_type != FOLD_LEFT_REDUCTION - && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in) - && (cond_fn == IFN_LAST - || !direct_internal_fn_supported_p (cond_fn, vectype_in, - OPTIMIZE_FOR_SPEED))) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "can't operate on partial vectors because" - " no conditional operation is available.\n"); - LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; - } - else if (reduction_type == FOLD_LEFT_REDUCTION - && reduc_fn == IFN_LAST - && !expand_vec_cond_expr_p (vectype_in, - truth_type_for (vectype_in), - SSA_NAME)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "can't operate on partial vectors because" - " no conditional operation is available.\n"); - LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; - } - else if (reduction_type == FOLD_LEFT_REDUCTION - && internal_fn_mask_index (reduc_fn) == -1 - && FLOAT_TYPE_P (vectype_in) - && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "can't operate on partial vectors because" - " signed zeros cannot be preserved.\n"); - LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; - } - else - { - internal_fn mask_reduc_fn - = get_masked_reduction_fn (reduc_fn, vectype_in); + else + vect_reduction_use_partial_vector (loop_vinfo, reduc_info, slp_node, + op.code, op.type, vectype_in); - if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS) - vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num, - vectype_in, 1); - else - vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num, - vectype_in, NULL); - } - } return true; } @@ -8440,6 +8585,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo, class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); int i; int ncopies; + int stmt_ncopies; int vec_num; stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info); @@ -8463,15 +8609,28 @@ vect_transform_reduction (loop_vec_info loop_vinfo, gphi *reduc_def_phi = as_a (phi_info->stmt); int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info); tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info); + tree stmt_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info); + + /* Get input vectypes from the reduction PHI and the statement to be + transformed, these two vectypes may have different lanes when + lane-reducing operation is present. 
*/ + if (!vectype_in) + vectype_in = STMT_VINFO_REDUC_VECTYPE (reduc_info); + + if (!stmt_vectype_in) + stmt_vectype_in = STMT_VINFO_VECTYPE (stmt_info); if (slp_node) { ncopies = 1; + stmt_ncopies = 1; vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); } else { ncopies = vect_get_num_copies (loop_vinfo, vectype_in); + stmt_ncopies = vect_get_num_copies (loop_vinfo, stmt_vectype_in); + gcc_assert (stmt_ncopies >= 1 && stmt_ncopies <= ncopies); vec_num = 1; } @@ -8480,14 +8639,10 @@ vect_transform_reduction (loop_vec_info loop_vinfo, vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo); - bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in); - + bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, + stmt_vectype_in); /* Transform. */ - tree new_temp = NULL_TREE; - auto_vec vec_oprnds0; - auto_vec vec_oprnds1; - auto_vec vec_oprnds2; - tree def0; + auto_vec vec_oprnds[3]; if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n"); @@ -8510,8 +8665,6 @@ vect_transform_reduction (loop_vec_info loop_vinfo, == op.ops[internal_fn_else_index ((internal_fn) code)])); } - bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); - vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); if (reduction_type == FOLD_LEFT_REDUCTION) { @@ -8519,7 +8672,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo, gcc_assert (code.is_tree_code () || cond_fn_p); return vectorize_fold_left_reduction (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, - code, reduc_fn, op.ops, op.num_ops, vectype_in, + code, reduc_fn, op.ops, op.num_ops, stmt_vectype_in, reduc_index, masks, lens); } @@ -8533,55 +8686,160 @@ vect_transform_reduction (loop_vec_info loop_vinfo, tree scalar_dest = gimple_get_lhs (stmt_info->stmt); tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out); - /* Get NCOPIES vector definitions for all operands except the reduction - definition. */ - if (!cond_fn_p) + gcc_assert (reduc_index < 3); + + if (slp_node) { - vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies, - single_defuse_cycle && reduc_index == 0 - ? NULL_TREE : op.ops[0], &vec_oprnds0, - single_defuse_cycle && reduc_index == 1 - ? NULL_TREE : op.ops[1], &vec_oprnds1, - op.num_ops == 3 - && !(single_defuse_cycle && reduc_index == 2) - ? op.ops[2] : NULL_TREE, &vec_oprnds2); + gcc_assert (!single_defuse_cycle && op.num_ops <= 3); + + for (i = 0; i < (int) op.num_ops; i++) + vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds[i]); } else { - /* For a conditional operation pass the truth type as mask - vectype. */ - gcc_assert (single_defuse_cycle - && (reduc_index == 1 || reduc_index == 2)); - vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies, - op.ops[0], truth_type_for (vectype_in), &vec_oprnds0, - reduc_index == 1 ? NULL_TREE : op.ops[1], - NULL_TREE, &vec_oprnds1, - reduc_index == 2 ? NULL_TREE : op.ops[2], - NULL_TREE, &vec_oprnds2); - } + int result_pos = 0; + + /* The input vectype of the reduction PHI determines copies of + vectorized def-use cycles, which might be more than effective copies + of vectorized lane-reducing reduction statements. This could be + complemented by generating extra trivial pass-through copies. For + example: + + int sum = 0; + for (i) + { + sum += d0[i] * d1[i]; // dot-prod + sum += abs(s0[i] - s1[i]); // sad + sum += n[i]; // normal + } + + The vector size is 128-bit,vectorization factor is 16. 
Reduction
+	 statements would be transformed as:
+
+	   vector<4> int sum_v0 = { 0, 0, 0, 0 };
+	   vector<4> int sum_v1 = { 0, 0, 0, 0 };
+	   vector<4> int sum_v2 = { 0, 0, 0, 0 };
+	   vector<4> int sum_v3 = { 0, 0, 0, 0 };
+
+	   for (i / 16)
+	     {
+	       sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
+	       sum_v1 = sum_v1;  // copy
+	       sum_v2 = sum_v2;  // copy
+	       sum_v3 = sum_v3;  // copy
+
+	       sum_v0 = sum_v0;  // copy
+	       sum_v1 = SAD (s0_v1[i: 0 ~ 7 ], s1_v1[i: 0 ~ 7 ], sum_v1);
+	       sum_v2 = SAD (s0_v2[i: 8 ~ 15], s1_v2[i: 8 ~ 15], sum_v2);
+	       sum_v3 = sum_v3;  // copy
+
+	       sum_v0 += n_v0[i: 0  ~ 3 ];
+	       sum_v1 += n_v1[i: 4  ~ 7 ];
+	       sum_v2 += n_v2[i: 8  ~ 11];
+	       sum_v3 += n_v3[i: 12 ~ 15];
+	     }
+
+	 Moreover, for higher instruction parallelism in the final vectorized
+	 loop, the effective vectorized lane-reducing statements are
+	 distributed evenly among all the def-use cycles.  In the above
+	 example, the SADs are generated into cycles other than that of the
+	 DOT_PROD.  */
+
+      if (stmt_ncopies < ncopies)
+	{
+	  gcc_assert (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR
+		      || code == SAD_EXPR);
+	  result_pos = reduc_info->reduc_result_pos;
+	  reduc_info->reduc_result_pos = (result_pos + stmt_ncopies) % ncopies;
+	  gcc_assert (result_pos >= 0 && result_pos < ncopies);
+	}
+
+      for (i = 0; i < MIN (3, (int) op.num_ops); i++)
+	{
+	  tree vectype = NULL_TREE;
+	  int used_ncopies = ncopies;
+
+	  if (cond_fn_p && i == 0)
+	    {
+	      /* For a conditional operation pass the truth type as mask
+		 vectype.  */
+	      gcc_assert (single_defuse_cycle && reduc_index > 0);
+	      vectype = truth_type_for (vectype_in);
+	    }
+
+	  if (i != reduc_index)
+	    {
+	      /* For a non-reduction operand, deduce the effective copies
+		 that are involved in vectorized def-use cycles based on the
+		 input vectype of the reduction statement.  */
+	      used_ncopies = stmt_ncopies;
+	    }
+	  else if (single_defuse_cycle)
+	    {
+	      /* For single def-use cycles get one copy of the vectorized
+		 reduction definition.  */
+	      used_ncopies = 1;
+	    }
+
+	  vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, used_ncopies,
+					 op.ops[i], &vec_oprnds[i], vectype);
+
+	  if (used_ncopies < ncopies)
+	    {
+	      vec_oprnds[i].safe_grow_cleared (ncopies);
+
+	      /* Find suitable def-use cycles to generate vectorized
+		 statements into, and reorder operands based on the
+		 selection.
*/ + if (i != reduc_index && result_pos) + { + int count = ncopies - used_ncopies; + int start = result_pos - count; + + if (start < 0) + { + count = result_pos; + start = 0; + } + + for (int j = used_ncopies - 1; j >= start; j--) + { + std::swap (vec_oprnds[i][j], vec_oprnds[i][j + count]); + gcc_assert (!vec_oprnds[i][j]); + } + } + } + } } - bool emulated_mixed_dot_prod - = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info); - FOR_EACH_VEC_ELT (vec_oprnds0, i, def0) + bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); + bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info); + tree def0; + + FOR_EACH_VEC_ELT (vec_oprnds[0], i, def0) { gimple *new_stmt; - tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE }; - if (masked_loop_p && !mask_by_cond_expr) + tree new_temp = NULL_TREE; + tree vop[3] = { def0, vec_oprnds[1][i], NULL_TREE }; + + if (!vop[0] || !vop[1]) + { + tree reduc_vop = vec_oprnds[reduc_index][i]; + + /* Insert trivial copy if no need to generate vectorized + statement. */ + gcc_assert (reduc_vop && stmt_ncopies < ncopies); + + new_stmt = gimple_build_assign (vec_dest, reduc_vop); + new_temp = make_ssa_name (vec_dest, new_stmt); + gimple_set_lhs (new_stmt, new_temp); + vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi); + } + else if (masked_loop_p && !mask_by_cond_expr) { - /* No conditional ifns have been defined for dot-product yet. */ - gcc_assert (code != DOT_PROD_EXPR); + /* No conditional ifns have been defined for dot-product and sad + yet. */ + gcc_assert (code != DOT_PROD_EXPR && code != SAD_EXPR); /* Make sure that the reduction accumulator is vop[0]. */ if (reduc_index == 1) @@ -8590,7 +8848,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo, std::swap (vop[0], vop[1]); } tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks, - vec_num * ncopies, vectype_in, i); + vec_num * stmt_ncopies, + stmt_vectype_in, i); gcall *call = gimple_build_call_internal (cond_fn, 4, mask, vop[0], vop[1], vop[0]); new_temp = make_ssa_name (vec_dest, call); @@ -8602,12 +8861,13 @@ vect_transform_reduction (loop_vec_info loop_vinfo, else { if (op.num_ops >= 3) - vop[2] = vec_oprnds2[i]; + vop[2] = vec_oprnds[2][i]; if (masked_loop_p && mask_by_cond_expr) { tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks, - vec_num * ncopies, vectype_in, i); + vec_num * stmt_ncopies, + stmt_vectype_in, i); build_vect_cond_expr (code, vop, mask, gsi); } @@ -8634,16 +8894,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo, if (slp_node) slp_node->push_vec_def (new_stmt); - else if (single_defuse_cycle - && i < ncopies - 1) - { - if (reduc_index == 0) - vec_oprnds0.safe_push (gimple_get_lhs (new_stmt)); - else if (reduc_index == 1) - vec_oprnds1.safe_push (gimple_get_lhs (new_stmt)); - else if (reduc_index == 2) - vec_oprnds2.safe_push (gimple_get_lhs (new_stmt)); - } + else if (single_defuse_cycle && i < ncopies - 1) + vec_oprnds[reduc_index][i + 1] = gimple_get_lhs (new_stmt); else STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); } diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index f8d8636b139..15331ca87f2 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -12093,11 +12093,20 @@ vectorizable_condition (vec_info *vinfo, vect_reduction_type reduction_type = TREE_CODE_REDUCTION; bool for_reduction = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)) != NULL; + if (for_reduction) + { + reduc_info = info_for_reduction (vinfo, stmt_info); + if (STMT_VINFO_REDUC_DEF (reduc_info) != vect_orig_stmt (stmt_info)) 
+ { + for_reduction = false; + reduc_info = NULL; + } + } + if (for_reduction) { if (slp_node) return false; - reduc_info = info_for_reduction (vinfo, stmt_info); reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); reduc_index = STMT_VINFO_REDUC_IDX (stmt_info); gcc_assert (reduction_type != EXTRACT_LAST_REDUCTION @@ -13273,6 +13282,8 @@ vect_analyze_stmt (vec_info *vinfo, NULL, NULL, node, cost_vec) || vectorizable_load (vinfo, stmt_info, NULL, NULL, node, cost_vec) || vectorizable_store (vinfo, stmt_info, NULL, NULL, node, cost_vec) + || vectorizable_lane_reducing (as_a (vinfo), + stmt_info, node, cost_vec) || vectorizable_reduction (as_a (vinfo), stmt_info, node, node_instance, cost_vec) || vectorizable_induction (as_a (vinfo), stmt_info, diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index db44d730b70..a923e1cd657 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -1399,6 +1399,12 @@ public: /* The vector type for performing the actual reduction. */ tree reduc_vectype; + /* For loop reduction with multiple vectorized results (ncopies > 1), a + lane-reducing operation participating in it may not use all of those + results, this field specifies result index starting from which any + following land-reducing operation would be assigned to. */ + int reduc_result_pos; + /* If IS_REDUC_INFO is true and if the vector code is performing N scalar reductions in parallel, this variable gives the initial scalar values of those N reductions. */ @@ -2430,6 +2436,8 @@ extern loop_vec_info vect_create_loop_vinfo (class loop *, vec_info_shared *, extern bool vectorizable_live_operation (vec_info *, stmt_vec_info, slp_tree, slp_instance, int, bool, stmt_vector_for_cost *); +extern bool vectorizable_lane_reducing (loop_vec_info, stmt_vec_info, + slp_tree, stmt_vector_for_cost *); extern bool vectorizable_reduction (loop_vec_info, stmt_vec_info, slp_tree, slp_instance, stmt_vector_for_cost *); -- 2.17.1