From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from smtpbgeu2.qq.com (smtpbgeu2.qq.com [18.194.254.142]) by sourceware.org (Postfix) with ESMTPS id E757D385C8B1 for ; Fri, 30 Sep 2022 08:00:43 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org E757D385C8B1 Authentication-Results: sourceware.org; dmarc=none (p=none dis=none) header.from=rivai.ai Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=rivai.ai X-QQ-mid: bizesmtp80t1664524836toqa9l5n Received: from server1.localdomain ( [42.247.22.65]) by bizesmtp.qq.com (ESMTP) with id ; Fri, 30 Sep 2022 16:00:34 +0800 (CST) X-QQ-SSF: 01400000000000D0J000000A0000000 X-QQ-FEAT: 0ZZPseV/Pe8+H8XtmIPsKRzq5V51HPGLemfvoL2M9CuGga69ldHhbBAu8itwQ czssUHuiNBlL92jAdj96WaGfQHwbSpIZh7xfqa3GUy5IjtJmyvOt6DCNkGeLfFjC1zX/gmr mcHvGsd6GlNj95nlhTmPwQs2iG1o7VE8ON1xYwc5WJAyNpzOdYMDVsmb6Twuc+EyScANMaF eeIyPs3SmVq0HVCK38KJDRQGAcslobpm3kXSbFbO2gnMv1eU2PW2h+Z6h1i3Vh0ZCBzDKEq BCrxsOf/fHM3YFMHrWgJtJkE0UgzkK30VD++q3uHLG9SyYmGA7L9eHAZkt69+9RxWvrmyxM RKBSlriAGFLHEh0mOff+QhvFBqJRndQBFUreCQvqr3FcNbLDZ+EnBCulMDH+/d9ePu2VVqu dei0ndCnfDg= X-QQ-GoodBg: 2 From: juzhe.zhong@rivai.ai To: gcc-patches@gcc.gnu.org Cc: richard.sandiford@arm.com, richard.guenther@gmail.com, Ju-Zhe Zhong Subject: [PATCH] Add first-order recurrence autovectorization Date: Fri, 30 Sep 2022 16:00:33 +0800 Message-Id: <20220930080033.70151-1-juzhe.zhong@rivai.ai> X-Mailer: git-send-email 2.36.1 MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-QQ-SENDSIZE: 520 Feedback-ID: bizesmtp:rivai.ai:qybglogicsvr:qybglogicsvr7 X-Spam-Status: No, score=-12.9 required=5.0 tests=BAYES_00,GIT_PATCH_0,KAM_DMARC_STATUS,RCVD_IN_DNSWL_NONE,SPF_PASS,TXREP,T_SPF_HELO_TEMPERROR,WEIRD_PORT autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org List-Id: From: Ju-Zhe Zhong Hi, After fixing previous ICE. I add full implementation (insert permutation to get correct result.) The gimple IR is correct now I think: # t_21 = PHI <_4(6), t_12(9)> # i_22 = PHI # vectp_a.6_26 = PHI # vect_vec_recur_.9_9 = PHI # vectp_b.11_7 = PHI # curr_cnt_36 = PHI # loop_len_20 = PHI _38 = .WHILE_LEN (loop_len_20, 32, POLY_INT_CST [4, 4]); while_len_37 = _38; _1 = (long unsigned int) i_22; _2 = _1 * 4; _3 = a_14(D) + _2; vect__4.8_19 = .LEN_LOAD (vectp_a.6_26, 32B, loop_len_20, 0); _4 = *_3; _5 = b_15(D) + _2; vect_vec_recur_.9_9 = VEC_PERM_EXPR ; But I encounter another ICE: 0x169e0e7 process_bb ../../../riscv-gcc/gcc/tree-ssa-sccvn.cc:7498 0x16a09af do_rpo_vn(function*, edge_def*, bitmap_head*, bool, bool, vn_lookup_kind) ../../../riscv-gcc/gcc/tree-ssa-sccvn.cc:8109 0x16a0fe7 do_rpo_vn(function*, edge_def*, bitmap_head*) ../../../riscv-gcc/gcc/tree-ssa-sccvn.cc:8205 0x179b7db execute ../../../riscv-gcc/gcc/tree-vectorizer.cc:1365 Could you help me with this? After fixing this ICE, I think the loop vectorizer can run correctly. Maybe you can test is in X86 or ARM after fixing this ICE. Thanks. --- gcc/tree-vect-loop.cc | 248 ++++++++++++++++++++++++++++++++++++++++- gcc/tree-vect-stmts.cc | 47 +++++++- gcc/tree-vectorizer.h | 4 + 3 files changed, 296 insertions(+), 3 deletions(-) diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 2536cc3cf49..1f0279d0d28 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -529,6 +529,65 @@ vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi) return false; } +/* Returns true if Phi is a first-order recurrence. A first-order + recurrence is a non-reduction recurrence relation in which the value of + the recurrence in the current loop iteration equals a value defined in + the previous iteration. */ + +static bool +vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop, + gphi *phi) +{ + /* Ensure the phi node is in the loop header and has two incoming values. */ + if (gimple_bb (phi) != loop->header || gimple_phi_num_args (phi) != 2) + return false; + + /* Ensure the loop has a preheader and a single latch block. The loop + vectorizer will need the latch to set up the next iteration of the loop. */ + edge preheader = loop_preheader_edge (loop); + edge latch = loop_latch_edge (loop); + if (!preheader || !latch) + return false; + + /* Ensure the phi node's incoming blocks are the loop preheader and latch. */ + if (!PHI_ARG_DEF_FROM_EDGE (phi, preheader) + || !PHI_ARG_DEF_FROM_EDGE (phi, latch)) + return false; + + /* Get the previous value. The previous value comes from the latch edge while + the initial value comes form the preheader edge. */ + gimple *previous = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, latch)); + if (!previous) + return false; + + /* Ensure every use_stmt of the phi node is dominated by the previous value. + The dominance requirement ensures the loop vectorizer will not need to + vectorize the initial value prior to the first iteration of the loop. */ + gimple *use_stmt; + imm_use_iterator imm_iter; + FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_phi_result (phi)) + if (use_stmt) + if (!dominated_by_p (CDI_DOMINATORS, gimple_bb (use_stmt), + gimple_bb (previous))) + return false; + + /* First-order recurrence autovectorization needs shuffle vector. */ + tree scalar_type = TREE_TYPE (PHI_RESULT (phi)); + tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type); + /* First-order recurrence autovectorization needs to handle permutation + with indices = [nunits-1, nunits, nunits+1, ...]. */ + machine_mode mode = TYPE_MODE (vectype); + poly_int64 nunits = GET_MODE_NUNITS (mode); + vec_perm_builder sel (nunits, 1, 3); + for (int i = 0; i < 3; ++i) + sel.quick_push (nunits - 1 + i); + vec_perm_indices indices (sel, 1, nunits * 2); + if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) + return false; + + return true; +} + /* Function vect_analyze_scalar_cycles_1. Examine the cross iteration def-use cycles of scalar variables @@ -666,6 +725,8 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop, } } } + else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi)) + STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence; else if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -1808,9 +1869,13 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo) gcc_assert (stmt_info); + if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence) + ok = vectorizable_dep_phi (loop_vinfo, stmt_info, NULL, NULL); + if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope || STMT_VINFO_LIVE_P (stmt_info)) - && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def) + && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def + && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence) /* A scalar-dependence cycle that we don't support. */ return opt_result::failure_at (phi, "not vectorized:" @@ -8267,6 +8332,118 @@ vectorizable_phi (vec_info *, return true; } +/* Vectorizes Dep PHIs. */ + +bool +vectorizable_dep_phi (loop_vec_info loop_vinfo, stmt_vec_info stmt_info, + gimple **vec_stmt, slp_tree slp_node) +{ + if (!loop_vinfo || !is_a (stmt_info->stmt)) + return false; + + gphi *phi = as_a (stmt_info->stmt); + + /* So far we only support first-order recurrence auto-vectorization. */ + if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence) + return false; + + if (!vec_stmt) /* transformation not required. */ + { + /* So far we don't support first-order recurrence for SLP + * auto-vectorization. */ + if (slp_node) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "do not support first-order recurrence" + "autovectorization for SLP node\n"); + return false; + } + STMT_VINFO_TYPE (stmt_info) = dep_phi_info_type; + return true; + } + + /* This is the second phase of vectorizing first-order rececurrences. An + overview of the transformation is described below. Suppose we have the + following loop. + + int32_t t = 0; + for (int i = 0; i < n; ++i) + { + b[i] = a[i] - t; + t = a[i]; + } + + There is a first-order recurrence on "a". For this loop, the shorthand + scalar IR looks like: + + scalar.preheader: + init = a[-1] + br loop.body + + scalar.body: + i = PHI <0(scalar.preheader), i+1(scalar.body)> + _2 = PHI <(init(scalar.preheader), <_1(scalar.body)> + _1 = a[i] + b[i] = _1 - _2 + br cond, scalar.body, ... + + In this example, _2 is a recurrence because it's value depends on the + previous iteration. In the first phase of vectorization, we created a + temporary value for _2. We now complete the vectorization and produce the + shorthand vector IR shown below (VF = 4). + + vector.preheader: + vect_init = vect_cst(..., ..., ..., a[-1]) + br vector.body + + vector.body + i = PHI <0(vector.preheader), i+4(vector.body)> + vect_1 = PHI + vect_2 = a[i, i+1, i+2, i+3]; + vect_3 = vector(vect_1(3), vect_2(0, 1, 2)) + b[i, i+1, i+2, i+3] = vect_2 - vect_3 + br cond, vector.body, middle.block + + middle.block: + x = vect_2(3) + br scalar.preheader + + scalar.ph: + s_init = PHI + br scalar.body + + After execution completes the vector loop, we extract the next value of + the recurrence (x) to use as the initial value in the scalar loop. */ + + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + tree vectype = STMT_VINFO_VECTYPE (stmt_info); + tree scalar_dest = gimple_phi_result (stmt_info->stmt); + basic_block bb = gimple_bb (stmt_info->stmt); + tree vec_dest + = vect_get_new_vect_var (vectype, vect_simple_var, "vec_recur_"); + auto_vec vec_oprnds; + tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop)); + + tree vec_init = build_vector_from_val (vectype, preheader); + vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL); + vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, + !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1, + gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds); + /* Create the vectorized first-order PHI node. */ + gphi *new_phi = create_phi_node (vec_dest, bb); + add_phi_arg (new_phi, vec_oprnds[0], loop_latch_edge (loop), + UNKNOWN_LOCATION); + add_phi_arg (new_phi, vec_init, loop_preheader_edge (loop), UNKNOWN_LOCATION); + if (slp_node) + SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi); + else + STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi); + if (!slp_node) + *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0]; + return true; +} + /* Return true if VECTYPE represents a vector that requires lowering by the vector lowering pass. */ @@ -10476,6 +10653,29 @@ update_epilogue_loop_vinfo (class loop *epilogue, tree advance) epilogue_vinfo->shared->save_datarefs (); } +/* Return true if the stmt is the first statement that uses the result + of a first-order recurrence phi node. */ + +static gphi * +vect_use_first_order_phi_result_p (auto_vec &first_order_phi_vec, + gimple *stmt) +{ + for (unsigned int i = 0; i < first_order_phi_vec.length (); ++i) + { + gphi *phi = first_order_phi_vec[i]; + tree phi_result = PHI_RESULT (phi); + gimple *first_use; + use_operand_p use_p; + if (!single_imm_use (phi_result, &use_p, &first_use)) + continue; + + if (first_use == stmt) + return phi; + } + + return NULL; +} + /* Function vect_transform_loop. The analysis phase has determined that the loop is vectorizable. @@ -10624,6 +10824,31 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) basic_block bb = bbs[i]; stmt_vec_info stmt_info; + /* It's used to save the first-order recurrence phi node. + Consider the following case: + # t_19 = PHI <_4(6), 0(5)> + ... + _4 = *_3; + ... + _6 = _4 - t_19; + + The vectorization sequence should be: + 1. Vectorize _4 = *_3; + 2. Vectorize # t_19 = PHI <_4(6), 0(5)> + 3, Vectorize _6 = _4 - t_19; + + Flow: + 1. So we save the first-order recurrence PHI in first_order_phi_vec + and skip vectorizing it tentatively when iterating phi statement + (save t_19 = PHI <_4(6), 0(5)>). + 2. Vectorize all statements when iterating all statements + until we reach the statement who is using the result of + first-order recurrence PHI (vectorize _4 = *_3;). + 3. Stop iterating and return to vectorize the PHI saved + in first_order_phi_vec (# t_19 = PHI <_4(6), 0(5)>). + 4. Conintue iterating and vectorize the residual statements. */ + auto_vec first_order_phi_vec; + for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si)) { @@ -10677,9 +10902,16 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle - || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def) + || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def + || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence) && ! PURE_SLP_STMT (stmt_info)) maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info); + + if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence) + { + first_order_phi_vec.safe_push (phi); + continue; + } } for (gimple_stmt_iterator si = gsi_start_bb (bb); @@ -10698,6 +10930,18 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) /* Ignore vector stmts created in the outer loop. */ stmt_info = loop_vinfo->lookup_stmt (stmt); + gphi *first_order_phi + = vect_use_first_order_phi_result_p (first_order_phi_vec, stmt); + if (first_order_phi) + { + stmt_vec_info first_order_stmt_info + = loop_vinfo->lookup_stmt (first_order_phi); + gimple_stmt_iterator first_order_si + = gsi_for_stmt (first_order_phi); + vect_transform_loop_stmt (loop_vinfo, first_order_stmt_info, + &first_order_si, NULL); + } + /* vector stmts created in the outer-loop during vectorization of stmts in an inner-loop may not have a stmt_info, and do not need to be vectorized. */ diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index c8d1efc45e5..ad37bbea30c 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -1503,6 +1503,41 @@ vect_get_vec_defs_for_operand (vec_info *vinfo, stmt_vec_info stmt_vinfo, while (ncopies--) vec_oprnds->quick_push (vop); } + else if (dt == vect_first_order_recurrence) + { + def_stmt_info = vect_stmt_to_vectorize (def_stmt_info); + gcc_assert (STMT_VINFO_VEC_STMTS (def_stmt_info).length () == ncopies); + tree vector_type; + + if (vectype) + vector_type = vectype; + else + vector_type = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op)); + + /* Insert shufflevector to for first-order recurrence autovectorization. + result = VEC_PERM (STMT_VINFO_VEC_STMTS (def_stmt_info)[i]); + tree latch = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop)); + tree recur = gimple_phi_result (phi); + gassign *assign + = gimple_build_assign (recur, VEC_PERM_EXPR, recur, latch, perm); + gimple_assign_set_lhs (assign, recur); + gimple_stmt_iterator gsi = gsi_for_stmt (stmt_vinfo->stmt); + vect_finish_stmt_generation (vinfo, stmt_vinfo, assign, &gsi); + vec_oprnds->quick_push (recur); + } + } else { def_stmt_info = vect_stmt_to_vectorize (def_stmt_info); @@ -11404,6 +11439,12 @@ vect_transform_stmt (vec_info *vinfo, gcc_assert (done); break; + case dep_phi_info_type: + done = vectorizable_dep_phi (as_a (vinfo), + stmt_info, &vec_stmt, slp_node); + gcc_assert (done); + break; + case phi_info_type: done = vectorizable_phi (vinfo, stmt_info, &vec_stmt, slp_node, NULL); gcc_assert (done); @@ -11804,6 +11845,9 @@ vect_is_simple_use (tree operand, vec_info *vinfo, enum vect_def_type *dt, case vect_nested_cycle: dump_printf (MSG_NOTE, "nested cycle\n"); break; + case vect_first_order_recurrence: + dump_printf (MSG_NOTE, "first order recurrence\n"); + break; case vect_unknown_def_type: dump_printf (MSG_NOTE, "unknown\n"); break; @@ -11852,7 +11896,8 @@ vect_is_simple_use (tree operand, vec_info *vinfo, enum vect_def_type *dt, || *dt == vect_induction_def || *dt == vect_reduction_def || *dt == vect_double_reduction_def - || *dt == vect_nested_cycle) + || *dt == vect_nested_cycle + || *dt == vect_first_order_recurrence) { *vectype = STMT_VINFO_VECTYPE (def_stmt_info); gcc_assert (*vectype != NULL_TREE); diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 4870c754499..2c8e7660b4e 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -65,6 +65,7 @@ enum vect_def_type { vect_reduction_def, vect_double_reduction_def, vect_nested_cycle, + vect_first_order_recurrence, vect_unknown_def_type }; @@ -1027,6 +1028,7 @@ enum stmt_vec_info_type { cycle_phi_info_type, lc_phi_info_type, phi_info_type, + dep_phi_info_type, loop_exit_ctrl_vec_info_type }; @@ -2331,6 +2333,8 @@ extern bool vectorizable_lc_phi (loop_vec_info, stmt_vec_info, gimple **, slp_tree); extern bool vectorizable_phi (vec_info *, stmt_vec_info, gimple **, slp_tree, stmt_vector_for_cost *); +extern bool vectorizable_dep_phi (loop_vec_info, stmt_vec_info, + gimple **, slp_tree); extern bool vect_emulated_vector_p (tree); extern bool vect_can_vectorize_without_simd_p (tree_code); extern bool vect_can_vectorize_without_simd_p (code_helper); -- 2.36.1