> --param vect-partial-vector-usage=2 would, no?
>
> I.. didn't even know it went to 2!
>
> In principle I suppose I could mask the individual stmts, that should
> handle the future case when this is relaxed to support non-fixed length
> buffers?
>
> Well, it looks wrong - either put in an assert that we start with a
> single stmt or assert !masked_loop_p instead?  Better ICE than
> generate wrong code.
>
> That said, I think you need to apply the masking on the original
> stmts[], before reducing them, no?

Yeah, I've done so now.  For simplicity I've kept the final masking
unconditionally and leave it up to the optimizers to drop it when it's
superfluous.

Simple testcase:

#ifndef N
#define N 837
#endif
float vect_a[N];
unsigned vect_b[N];

unsigned test4(double x)
{
 unsigned ret = 0;
 for (int i = 0; i < N; i++)
 {
   if (vect_a[i] > x)
     break;
   vect_a[i] = x;
 }
 return ret;
}

Looks good now.  After this one there's only one patch left, the dependency
analysis.  I'm almost done with the cleanup/respin, but want to take the
weekend to double check and will post it first thing Monday morning.

Did you want to see the testsuite changes as well again?  I've basically just
added the right dg-require-effective-target and add-options etc.

Thanks for all the reviews!

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vect-patterns.cc (vect_init_pattern_stmt): Support gconds.
	(check_bool_pattern, adjust_bool_pattern, adjust_bool_stmts,
	vect_recog_bool_pattern, sort_after_uid): Support gconds type
	analysis.
	* tree-vect-stmts.cc (vectorizable_comparison_1): Support stmts
	without lhs.
	(vectorizable_early_exit): New.
	(vect_analyze_stmt, vect_transform_stmt): Use it.
	(vect_is_simple_use, vect_get_vector_types_for_stmt): Support
	gcond.

--- inline copy of patch ---

diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 7debe7f0731673cd1bf25cd39d55e23990a73d0e..8865cde9f3481a474d31848ae12523576d29744d 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -132,6 +132,7 @@ vect_init_pattern_stmt (vec_info *vinfo, gimple *pattern_stmt,
   if (!STMT_VINFO_VECTYPE (pattern_stmt_info))
     {
       gcc_assert (!vectype
+		  || is_a <gcond *> (pattern_stmt)
 		  || (VECTOR_BOOLEAN_TYPE_P (vectype)
 		      == vect_use_mask_type_p (orig_stmt_info)));
       STMT_VINFO_VECTYPE (pattern_stmt_info) = vectype;
@@ -5210,19 +5211,28 @@ vect_recog_mixed_size_cond_pattern (vec_info *vinfo,
    true if bool VAR can and should be optimized that way.  Assume it shouldn't
    in case it's a result of a comparison which can be directly vectorized into
    a vector comparison.  Fills in STMTS with all stmts visited during the
-   walk.  */
+   walk.  If ANALYZE_ONLY then only analyze the booleans but do not perform any
+   codegen associated with the boolean condition.  */

 static bool
-check_bool_pattern (tree var, vec_info *vinfo, hash_set<gimple *> &stmts)
+check_bool_pattern (tree var, vec_info *vinfo, hash_set<gimple *> &stmts,
+		    bool analyze_only)
 {
   tree rhs1;
   enum tree_code rhs_code;
+  gassign *def_stmt = NULL;

   stmt_vec_info def_stmt_info = vect_get_internal_def (vinfo, var);
-  if (!def_stmt_info)
+  if (!def_stmt_info && !analyze_only)
     return false;
+  else if (!def_stmt_info)
+    /* If we're only analyzing we won't be codegen-ing the statements and are
+       only after whether the types match.  In that case we can accept loop
+       invariant values.  */
+    def_stmt = dyn_cast <gassign *> (SSA_NAME_DEF_STMT (var));
+  else
+    def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt);

-  gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt);
   if (!def_stmt)
     return false;

@@ -5234,27 +5244,28 @@ check_bool_pattern (tree var, vec_info *vinfo, hash_set<gimple *> &stmts)
   switch (rhs_code)
     {
     case SSA_NAME:
-      if (! check_bool_pattern (rhs1, vinfo, stmts))
+      if (! check_bool_pattern (rhs1, vinfo, stmts, analyze_only))
	return false;
       break;

     CASE_CONVERT:
       if (!VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (rhs1)))
	return false;
-      if (! check_bool_pattern (rhs1, vinfo, stmts))
+      if (! check_bool_pattern (rhs1, vinfo, stmts, analyze_only))
	return false;
       break;

     case BIT_NOT_EXPR:
-      if (! check_bool_pattern (rhs1, vinfo, stmts))
+      if (! check_bool_pattern (rhs1, vinfo, stmts, analyze_only))
	return false;
       break;

     case BIT_AND_EXPR:
     case BIT_IOR_EXPR:
     case BIT_XOR_EXPR:
-      if (! check_bool_pattern (rhs1, vinfo, stmts)
-	  || ! check_bool_pattern (gimple_assign_rhs2 (def_stmt), vinfo, stmts))
+      if (! check_bool_pattern (rhs1, vinfo, stmts, analyze_only)
+	  || ! check_bool_pattern (gimple_assign_rhs2 (def_stmt), vinfo, stmts,
+				   analyze_only))
	return false;
       break;

@@ -5275,6 +5286,7 @@ check_bool_pattern (tree var, vec_info *vinfo, hash_set<gimple *> &stmts)
	  tree mask_type = get_mask_type_for_scalar_type (vinfo,
							  TREE_TYPE (rhs1));
	  if (mask_type
+	      && !analyze_only
	      && expand_vec_cmp_expr_p (comp_vectype, mask_type, rhs_code))
	    return false;

@@ -5289,7 +5301,8 @@ check_bool_pattern (tree var, vec_info *vinfo, hash_set<gimple *> &stmts)
	    }
	  else
	    vecitype = comp_vectype;
-	  if (! expand_vec_cond_expr_p (vecitype, comp_vectype, rhs_code))
+	  if (!analyze_only
+	      && !expand_vec_cond_expr_p (vecitype, comp_vectype, rhs_code))
	    return false;
	}
       else
@@ -5324,11 +5337,13 @@ adjust_bool_pattern_cast (vec_info *vinfo,
    VAR is an SSA_NAME that should be transformed from bool to a wider integer
    type, OUT_TYPE is the desired final integer type of the whole pattern.
    STMT_INFO is the info of the pattern root and is where pattern stmts should
-   be associated with.  DEFS is a map of pattern defs.  */
+   be associated with.  DEFS is a map of pattern defs.  If TYPE_ONLY then don't
+   create new pattern statements and instead only fill LAST_STMT and DEFS.  */

 static void
 adjust_bool_pattern (vec_info *vinfo, tree var, tree out_type,
-		     stmt_vec_info stmt_info, hash_map <tree, tree> &defs)
+		     stmt_vec_info stmt_info, hash_map <tree, tree> &defs,
+		     gimple *&last_stmt, bool type_only)
 {
   gimple *stmt = SSA_NAME_DEF_STMT (var);
   enum tree_code rhs_code, def_rhs_code;
@@ -5492,28 +5507,38 @@ adjust_bool_pattern (vec_info *vinfo, tree var, tree out_type,
     }

   gimple_set_location (pattern_stmt, loc);
-  append_pattern_def_seq (vinfo, stmt_info, pattern_stmt,
-			  get_vectype_for_scalar_type (vinfo, itype));
+  if (!type_only)
+    append_pattern_def_seq (vinfo, stmt_info, pattern_stmt,
+			    get_vectype_for_scalar_type (vinfo, itype));
+  last_stmt = pattern_stmt;
   defs.put (var, gimple_assign_lhs (pattern_stmt));
 }

-/* Comparison function to qsort a vector of gimple stmts after UID.  */
+/* Comparison function to qsort a vector of gimple stmts after BB and UID.
+   The def of one statement can be in an earlier block than its use, so if
+   the BBs are different, first compare by BB.  */

 static int
 sort_after_uid (const void *p1, const void *p2)
 {
   const gimple *stmt1 = *(const gimple * const *)p1;
   const gimple *stmt2 = *(const gimple * const *)p2;
+  if (gimple_bb (stmt1)->index != gimple_bb (stmt2)->index)
+    return gimple_bb (stmt1)->index - gimple_bb (stmt2)->index;
+
   return gimple_uid (stmt1) - gimple_uid (stmt2);
 }

 /* Create pattern stmts for all stmts participating in the bool pattern
    specified by BOOL_STMT_SET and its root STMT_INFO with the desired type
-   OUT_TYPE.  Return the def of the pattern root.  */
+   OUT_TYPE.  Return the def of the pattern root.  If TYPE_ONLY the new
+   statements are not emitted as pattern statements and the tree returned is
+   only useful for type queries.  */

 static tree
 adjust_bool_stmts (vec_info *vinfo, hash_set <gimple *> &bool_stmt_set,
-		   tree out_type, stmt_vec_info stmt_info)
+		   tree out_type, stmt_vec_info stmt_info,
+		   bool type_only = false)
 {
   /* Gather original stmts in the bool pattern in their order of appearance
      in the IL.  */
@@ -5523,16 +5548,16 @@ adjust_bool_stmts (vec_info *vinfo, hash_set <gimple *> &bool_stmt_set,
     bool_stmts.quick_push (*i);
   bool_stmts.qsort (sort_after_uid);

+  gimple *last_stmt = NULL;
+
   /* Now process them in that order, producing pattern stmts.  */
   hash_map <tree, tree> defs;
-  for (unsigned i = 0; i < bool_stmts.length (); ++i)
-    adjust_bool_pattern (vinfo, gimple_assign_lhs (bool_stmts[i]),
-			 out_type, stmt_info, defs);
+  for (auto bool_stmt : bool_stmts)
+    adjust_bool_pattern (vinfo, gimple_assign_lhs (bool_stmt),
+			 out_type, stmt_info, defs, last_stmt, type_only);

   /* Pop the last pattern seq stmt and install it as pattern root for STMT.  */
-  gimple *pattern_stmt
-    = gimple_seq_last_stmt (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
-  return gimple_assign_lhs (pattern_stmt);
+  return gimple_assign_lhs (last_stmt);
 }

 /* Return the proper type for converting bool VAR into
@@ -5608,13 +5633,27 @@ vect_recog_bool_pattern (vec_info *vinfo,
   enum tree_code rhs_code;
   tree var, lhs, rhs, vectype;
   gimple *pattern_stmt;
-
-  if (!is_gimple_assign (last_stmt))
+  gcond *cond = NULL;
+  if (!is_gimple_assign (last_stmt)
+      && !(cond = dyn_cast <gcond *> (last_stmt)))
     return NULL;

-  var = gimple_assign_rhs1 (last_stmt);
-  lhs = gimple_assign_lhs (last_stmt);
-  rhs_code = gimple_assign_rhs_code (last_stmt);
+  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
+  if (is_gimple_assign (last_stmt))
+    {
+      var = gimple_assign_rhs1 (last_stmt);
+      lhs = gimple_assign_lhs (last_stmt);
+      rhs_code = gimple_assign_rhs_code (last_stmt);
+    }
+  else if (loop_vinfo && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+    {
+      /* Only analyze the gcond when doing loop vectorization of a loop with
+	 multiple exits; we don't support SLP of gconds today.  */
+      lhs = var = gimple_cond_lhs (last_stmt);
+      rhs_code = gimple_cond_code (last_stmt);
+    }
+  else
+    return NULL;

   if (rhs_code == VIEW_CONVERT_EXPR)
     var = TREE_OPERAND (var, 0);
@@ -5632,7 +5671,7 @@ vect_recog_bool_pattern (vec_info *vinfo,
	return NULL;
       vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (lhs));

-      if (check_bool_pattern (var, vinfo, bool_stmts))
+      if (check_bool_pattern (var, vinfo, bool_stmts, false))
	{
	  rhs = adjust_bool_stmts (vinfo, bool_stmts, TREE_TYPE (lhs),
				   stmt_vinfo);
@@ -5680,7 +5719,7 @@ vect_recog_bool_pattern (vec_info *vinfo,

       return pattern_stmt;
     }
-  else if (rhs_code == COND_EXPR
+  else if ((rhs_code == COND_EXPR || cond)
	   && TREE_CODE (var) == SSA_NAME)
     {
       vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (lhs));
@@ -5700,18 +5739,31 @@ vect_recog_bool_pattern (vec_info *vinfo,
       if (get_vectype_for_scalar_type (vinfo, type) == NULL_TREE)
	return NULL;

-      if (check_bool_pattern (var, vinfo, bool_stmts))
-	var = adjust_bool_stmts (vinfo, bool_stmts, type, stmt_vinfo);
+      if (check_bool_pattern (var, vinfo, bool_stmts, cond))
+	var = adjust_bool_stmts (vinfo, bool_stmts, type, stmt_vinfo, cond);
       else if (integer_type_for_mask (var, vinfo))
	return NULL;

-      lhs = vect_recog_temp_ssa_var (TREE_TYPE (lhs), NULL);
-      pattern_stmt
-	= gimple_build_assign (lhs, COND_EXPR,
-			       build2 (NE_EXPR, boolean_type_node,
-				       var, build_int_cst (TREE_TYPE (var), 0)),
-			       gimple_assign_rhs2 (last_stmt),
-			       gimple_assign_rhs3 (last_stmt));
+      if (!cond)
+	{
+	  lhs = vect_recog_temp_ssa_var (TREE_TYPE (lhs), NULL);
+	  pattern_stmt
+	    = gimple_build_assign (lhs, COND_EXPR,
+				   build2 (NE_EXPR, boolean_type_node, var,
+					   build_int_cst (TREE_TYPE (var), 0)),
+				   gimple_assign_rhs2 (last_stmt),
+				   gimple_assign_rhs3 (last_stmt));
+	}
+      else
+	{
+	  pattern_stmt
+	    = gimple_build_cond (gimple_cond_code (cond),
+				 gimple_cond_lhs (cond), gimple_cond_rhs (cond),
+				 gimple_cond_true_label (cond),
+				 gimple_cond_false_label (cond));
+	  vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (var));
+	  vectype = truth_type_for (vectype);
+	}

       *type_out = vectype;
       vect_pattern_detected ("vect_recog_bool_pattern", last_stmt);
@@ -5725,7 +5777,7 @@ vect_recog_bool_pattern (vec_info *vinfo,
       if (!vectype || !VECTOR_MODE_P (TYPE_MODE (vectype)))
	return NULL;

-      if (check_bool_pattern (var, vinfo, bool_stmts))
+      if (check_bool_pattern (var, vinfo, bool_stmts, false))
	rhs = adjust_bool_stmts (vinfo, bool_stmts, TREE_TYPE (vectype),
				 stmt_vinfo);
       else
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 582c5e678fad802d6e76300fe3c939b9f2978f17..e9116d184149826ba436b0f562721c140d586c94 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -12489,7 +12489,7 @@ vectorizable_comparison_1 (vec_info *vinfo, tree vectype,
   vec<tree> vec_oprnds0 = vNULL;
   vec<tree> vec_oprnds1 = vNULL;
   tree mask_type;
-  tree mask;
+  tree mask = NULL_TREE;

   if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
     return false;
@@ -12629,8 +12629,9 @@ vectorizable_comparison_1 (vec_info *vinfo, tree vectype,
   /* Transform.  */

   /* Handle def.  */
-  lhs = gimple_assign_lhs (STMT_VINFO_STMT (stmt_info));
-  mask = vect_create_destination_var (lhs, mask_type);
+  lhs = gimple_get_lhs (STMT_VINFO_STMT (stmt_info));
+  if (lhs)
+    mask = vect_create_destination_var (lhs, mask_type);

   vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies,
		      rhs1, &vec_oprnds0, vectype,
@@ -12644,7 +12645,10 @@ vectorizable_comparison_1 (vec_info *vinfo, tree vectype,
       gimple *new_stmt;
       vec_rhs2 = vec_oprnds1[i];

-      new_temp = make_ssa_name (mask);
+      if (lhs)
+	new_temp = make_ssa_name (mask);
+      else
+	new_temp = make_temp_ssa_name (mask_type, NULL, "cmp");
       if (bitop1 == NOP_EXPR)
	{
	  new_stmt = gimple_build_assign (new_temp, code,
@@ -12723,6 +12727,184 @@ vectorizable_comparison (vec_info *vinfo,
   return true;
 }

+/* Check to see if the current early break given in STMT_INFO is valid for
+   vectorization.  */
+
+static bool
+vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
+			 gimple_stmt_iterator *gsi, gimple **vec_stmt,
+			 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
+{
+  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
+  if (!loop_vinfo
+      || !is_a <gcond *> (STMT_VINFO_STMT (stmt_info)))
+    return false;
+
+  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_condition_def)
+    return false;
+
+  if (!STMT_VINFO_RELEVANT_P (stmt_info))
+    return false;
+
+  auto code = gimple_cond_code (STMT_VINFO_STMT (stmt_info));
+  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+  gcc_assert (vectype);
+
+  tree vectype_op0 = NULL_TREE;
+  slp_tree slp_op0;
+  tree op0;
+  enum vect_def_type dt0;
+  if (!vect_is_simple_use (vinfo, stmt_info, slp_node, 0, &op0, &slp_op0, &dt0,
+			   &vectype_op0))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "use not simple.\n");
+      return false;
+    }
+
+  machine_mode mode = TYPE_MODE (vectype);
+  int ncopies;
+
+  if (slp_node)
+    ncopies = 1;
+  else
+    ncopies = vect_get_num_copies (loop_vinfo, vectype);
+
+  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
+
+  /* Analyze only.  */
+  if (!vec_stmt)
+    {
+      if (direct_optab_handler (cbranch_optab, mode) == CODE_FOR_nothing)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "can't vectorize early exit because the "
+			     "target doesn't support flag setting vector "
+			     "comparisons.\n");
+	  return false;
+	}
+
+      if (ncopies > 1
+	  && direct_optab_handler (ior_optab, mode) == CODE_FOR_nothing)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "can't vectorize early exit because the "
+			     "target does not support boolean vector OR for "
+			     "type %T.\n", vectype);
+	  return false;
+	}
+
+      if (!vectorizable_comparison_1 (vinfo, vectype, stmt_info, code, gsi,
+				      vec_stmt, slp_node, cost_vec))
+	return false;
+
+      if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+	{
+	  if (direct_internal_fn_supported_p (IFN_VCOND_MASK_LEN, vectype,
+					      OPTIMIZE_FOR_SPEED))
+	    return false;
+	  else
+	    vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, NULL);
+	}
+
+      return true;
+    }
+
+  /* Transform.  */
+
+  tree new_temp = NULL_TREE;
+  gimple *new_stmt = NULL;
+
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location, "transform early-exit.\n");
+
+  if (!vectorizable_comparison_1 (vinfo, vectype, stmt_info, code, gsi,
+				  vec_stmt, slp_node, cost_vec))
+    gcc_unreachable ();
+
+  gimple *stmt = STMT_VINFO_STMT (stmt_info);
+  basic_block cond_bb = gimple_bb (stmt);
+  gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb);
+
+  auto_vec<tree> stmts;
+
+  tree mask = NULL_TREE;
+  if (masked_loop_p)
+    mask = vect_get_loop_mask (loop_vinfo, gsi, masks, ncopies, vectype, 0);
+
+  if (slp_node)
+    stmts.safe_splice (SLP_TREE_VEC_DEFS (slp_node));
+  else
+    {
+      auto vec_stmts = STMT_VINFO_VEC_STMTS (stmt_info);
+      stmts.reserve_exact (vec_stmts.length ());
+      for (auto stmt : vec_stmts)
+	stmts.quick_push (gimple_assign_lhs (stmt));
+    }
+
+  /* Determine if we need to reduce the final value.  */
+  if (stmts.length () > 1)
+    {
+      /* We build the reductions in a way to maintain as much parallelism as
+	 possible.  */
+      auto_vec<tree> workset (stmts.length ());
+
+      /* Mask the statements as we queue them up.  */
+      if (masked_loop_p)
+	for (auto stmt : stmts)
+	  workset.quick_push (prepare_vec_mask (loop_vinfo, TREE_TYPE (mask),
+						mask, stmt, &cond_gsi));
+      else
+	workset.splice (stmts);
+
+      while (workset.length () > 1)
+	{
+	  new_temp = make_temp_ssa_name (vectype, NULL, "vexit_reduc");
+	  tree arg0 = workset.pop ();
+	  tree arg1 = workset.pop ();
+	  new_stmt = gimple_build_assign (new_temp, BIT_IOR_EXPR, arg0, arg1);
+	  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt,
+				       &cond_gsi);
+	  workset.quick_insert (0, new_temp);
+	}
+    }
+  else
+    new_temp = stmts[0];
+
+  gcc_assert (new_temp);
+
+  tree cond = new_temp;
+  /* If we have multiple statements after reduction we should check all the
+     lanes and treat it as a full vector.  */
+  if (masked_loop_p)
+    cond = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask, cond,
+			     &cond_gsi);
+
+  /* Now build the new conditional.  Pattern gimple_conds get dropped during
+     codegen so we must replace the original insn.  */
+  stmt = STMT_VINFO_STMT (vect_orig_stmt (stmt_info));
+  gcond *cond_stmt = as_a <gcond *> (stmt);
+  gimple_cond_set_condition (cond_stmt, NE_EXPR, cond,
+			     build_zero_cst (vectype));
+  update_stmt (stmt);
+
+  if (slp_node)
+    SLP_TREE_VEC_DEFS (slp_node).truncate (0);
+  else
+    STMT_VINFO_VEC_STMTS (stmt_info).truncate (0);
+
+  if (!slp_node)
+    *vec_stmt = stmt;
+
+  return true;
+}
+
 /* If SLP_NODE is nonnull, return true if vectorizable_live_operation
    can handle all live statements in the node.  Otherwise return true
    if STMT_INFO is not live or if vectorizable_live_operation can handle it.  */
@@ -12949,7 +13131,9 @@ vect_analyze_stmt (vec_info *vinfo,
	  || vectorizable_lc_phi (as_a <loop_vec_info> (vinfo),
				  stmt_info, NULL, node)
	  || vectorizable_recurr (as_a <loop_vec_info> (vinfo),
-				  stmt_info, NULL, node, cost_vec));
+				  stmt_info, NULL, node, cost_vec)
+	  || vectorizable_early_exit (vinfo, stmt_info, NULL, NULL, node,
+				      cost_vec));
   else
     {
       if (bb_vinfo)
@@ -12972,7 +13156,10 @@ vect_analyze_stmt (vec_info *vinfo,
					 NULL, NULL, node, cost_vec)
	  || vectorizable_comparison (vinfo, stmt_info, NULL, NULL, node,
				      cost_vec)
-	  || vectorizable_phi (vinfo, stmt_info, NULL, node, cost_vec));
+	  || vectorizable_phi (vinfo, stmt_info, NULL, node, cost_vec)
+	  || vectorizable_early_exit (vinfo, stmt_info, NULL, NULL, node,
+				      cost_vec));
+
     }

   if (node)
@@ -13131,6 +13318,12 @@ vect_transform_stmt (vec_info *vinfo,
       gcc_assert (done);
       break;

+    case loop_exit_ctrl_vec_info_type:
+      done = vectorizable_early_exit (vinfo, stmt_info, gsi, &vec_stmt,
+				      slp_node, NULL);
+      gcc_assert (done);
+      break;
+
     default:
       if (!STMT_VINFO_LIVE_P (stmt_info))
	{
@@ -14321,10 +14514,19 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
     }
   else
     {
+      gcond *cond = NULL;
       if (data_reference *dr = STMT_VINFO_DATA_REF (stmt_info))
	scalar_type = TREE_TYPE (DR_REF (dr));
       else if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
	scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
+      else if ((cond = dyn_cast <gcond *> (stmt)))
+	{
+	  /* We can't convert the scalar type to boolean yet, since booleans
+	     have a single bit precision and we need the vector boolean to be
+	     a representation of the integer mask.  So set the correct integer
+	     type and convert to a boolean vector once we have a vectype.  */
+	  scalar_type = TREE_TYPE (gimple_cond_lhs (cond));
+	}
       else
	scalar_type = TREE_TYPE (gimple_get_lhs (stmt));

@@ -14339,12 +14541,18 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
	    "get vectype for scalar type: %T\n", scalar_type);
	}
       vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
+
       if (!vectype)
	return opt_result::failure_at (stmt,
				       "not vectorized:"
				       " unsupported data-type %T\n",
				       scalar_type);

+      /* If we were a gcond, convert the resulting type to a vector boolean
+	 type now that we have the correct integer mask type.  */
+      if (cond)
+	vectype = truth_type_for (vectype);
+
       if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n", vectype);
     }