diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 6667cd0..5eccbdc 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -1912,6 +1912,16 @@ static const struct attribute_spec rs6000_attribute_table[] = #undef TARGET_PREDICT_DOLOOP_P #define TARGET_PREDICT_DOLOOP_P rs6000_predict_doloop_p +#undef TARGET_HAVE_COUNT_REG_DECR_P +#define TARGET_HAVE_COUNT_REG_DECR_P true + +/* 1000000000 is infinite cost in IVOPTs. */ +#undef TARGET_DOLOOP_COST_FOR_GENERIC +#define TARGET_DOLOOP_COST_FOR_GENERIC 1000000000 + +#undef TARGET_DOLOOP_COST_FOR_ADDRESS +#define TARGET_DOLOOP_COST_FOR_ADDRESS 1000000000 + #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV rs6000_atomic_assign_expand_fenv diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index c2aa4d0..9f3a08a 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -11618,6 +11618,29 @@ loops, and will help ivopts to make some decisions. The default version of this hook returns false. @end deftypefn +@deftypevr {Target Hook} bool TARGET_HAVE_COUNT_REG_DECR_P +Return true if the target supports hardware count register for decrement +and branch. This count register can't be used as general register since +moving to/from a general register from/to it is very expensive. +The default value is false. +@end deftypevr + +@deftypevr {Target Hook} int64_t TARGET_DOLOOP_COST_FOR_GENERIC +IVOPTs introduces one doloop dedicated IV candidate, this hook offers + target owner a way to adjust cost when selecting doloop IV candidate for a + generic IV use. At calcuation, this value will be added on normal cost + already calculated by current implementation. +The default value is zero. +@end deftypevr + +@deftypevr {Target Hook} int64_t TARGET_DOLOOP_COST_FOR_ADDRESS +IVOPTs introduces one doloop dedicated IV candidate, this hook offers + target owner a way to adjust cost when selecting doloop IV candidate for an + address IV use. At calcuation, this value will be added on normal cost + already calculated by current implementation. +The default value is zero. +@end deftypevr + @deftypefn {Target Hook} bool TARGET_CAN_USE_DOLOOP_P (const widest_int @var{&iterations}, const widest_int @var{&iterations_max}, unsigned int @var{loop_depth}, bool @var{entered_at_top}) Return true if it is possible to use low-overhead loops (@code{doloop_end} and @code{doloop_begin}) for a particular loop. @var{iterations} gives the diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index b4d57b8..4346773 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -7946,6 +7946,12 @@ to by @var{ce_info}. @hook TARGET_PREDICT_DOLOOP_P +@hook TARGET_HAVE_COUNT_REG_DECR_P + +@hook TARGET_DOLOOP_COST_FOR_GENERIC + +@hook TARGET_DOLOOP_COST_FOR_ADDRESS + @hook TARGET_CAN_USE_DOLOOP_P @hook TARGET_INVALID_WITHIN_DOLOOP diff --git a/gcc/target.def b/gcc/target.def index 71b6972..69e2844 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -4246,6 +4246,32 @@ The default version of this hook returns false.", bool, (struct loop *loop), default_predict_doloop_p) +DEFHOOKPOD +(have_count_reg_decr_p, + "Return true if the target supports hardware count register for decrement\n\ +and branch. This count register can't be used as general register since\n\ +moving to/from a general register from/to it is very expensive.\n\ +The default value is false.", + bool, false) + +DEFHOOKPOD +(doloop_cost_for_generic, + "IVOPTs introduces one doloop dedicated IV candidate, this hook offers\n\ + target owner a way to adjust cost when selecting doloop IV candidate for a\n\ + generic IV use. At calcuation, this value will be added on normal cost\n\ + already calculated by current implementation.\n\ +The default value is zero.", + int64_t, 0) + +DEFHOOKPOD +(doloop_cost_for_address, + "IVOPTs introduces one doloop dedicated IV candidate, this hook offers\n\ + target owner a way to adjust cost when selecting doloop IV candidate for an\n\ + address IV use. At calcuation, this value will be added on normal cost\n\ + already calculated by current implementation.\n\ +The default value is zero.", + int64_t, 0) + DEFHOOK (can_use_doloop_p, "Return true if it is possible to use low-overhead loops (@code{doloop_end}\n\ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-3.c b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-3.c index 214e6a7..ce4b1d0 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-3.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-3.c @@ -10,4 +10,6 @@ int main (void) f2 (); } -/* { dg-final { scan-tree-dump-times "!= 0" 5 "ivopts" } } */ +/* { dg-final { scan-tree-dump-times "!= 0" 5 "ivopts" { target { ! powerpc*-*-* } } } } */ +/* More debug information emitted for doloop on powerpc. */ +/* { dg-final { scan-tree-dump-times "!= 0" 6 "ivopts" { target { powerpc*-*-* } } } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c index 7d5859b..71d7f67 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c @@ -17,6 +17,7 @@ f1 (char *p, uintptr_t i, uintptr_t n) while (i < n); } -/* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts" } } */ -/* { dg-final { scan-tree-dump-times "PHI = 45) diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c index 530ea4a..88e7890 100644 --- a/gcc/tree-ssa-loop-ivopts.c +++ b/gcc/tree-ssa-loop-ivopts.c @@ -64,7 +64,30 @@ along with GCC; see the file COPYING3. If not see All of this is done loop by loop. Doing it globally is theoretically possible, it might give a better performance and it might enable us to decide costs more precisely, but getting all the interactions right - would be complicated. */ + would be complicated. + + For the targets supporting low-overhead loops, IVOPTs has to take care of + the loops which will probably be transformed in RTL doloop optimization, + to try to make selected IV candidate set optimal. The process of doloop + support includes: + + 1) Analyze the current loop will be transformed to doloop or not, find and + mark its compare type IV use as doloop use (iv_group field doloop_p), and + set flag doloop_use_p of ivopts_data to notify subsequent processings on + doloop. See analyze_and_mark_doloop_use and its callees for the details. + The target hook predict_doloop_p can be used for target specific checks. + + 2) Add one doloop dedicated IV cand {(may_be_zero ? 1 : (niter + 1)), +, -1}, + set flag doloop_p of iv_cand, step cost is set as zero and no extra cost + like biv. For cost determination between doloop IV cand and IV use, the + target hooks doloop_cost_for_generic and doloop_cost_for_address are + provided to add on extra costs for generic type and address type IV use. + Zero cost is assigned to the pair between doloop IV cand and doloop IV + use, and bound zero is set for IV elimination. + + 3) With the cost setting in step 2), the current cost model based IV + selection algorithm will process as usual, pick up doloop dedicated IV if + profitable. */ #include "config.h" #include "system.h" @@ -275,6 +298,9 @@ comp_cost::operator+= (comp_cost cost) comp_cost comp_cost::operator+= (HOST_WIDE_INT c) { + if (c >= INFTY) + this->cost = INFTY; + if (infinite_cost_p ()) return *this; @@ -399,6 +425,8 @@ struct iv_group struct cost_pair *cost_map; /* The selected candidate for the group. */ struct iv_cand *selected; + /* To indicate this is a doloop use group. */ + bool doloop_p; /* Uses in the group. */ vec vuses; }; @@ -439,6 +467,7 @@ struct iv_cand be hoisted out of loop. */ struct iv *orig_iv; /* The original iv if this cand is added from biv with smaller type. */ + bool doloop_p; /* Whether this is a doloop candidate. */ }; /* Hashtable entry for common candidate derived from iv uses. */ @@ -612,6 +641,9 @@ struct ivopts_data /* Whether the loop body can only be exited via single exit. */ bool loop_single_exit_p; + + /* Whether the loop has doloop comparison use. */ + bool doloop_use_p; }; /* An assignment of iv candidates to uses. */ @@ -1528,6 +1560,7 @@ record_group (struct ivopts_data *data, enum use_type type) group->type = type; group->related_cands = BITMAP_ALLOC (NULL); group->vuses.create (1); + group->doloop_p = false; data->vgroups.safe_push (group); return group; @@ -3017,10 +3050,10 @@ get_loop_invariant_expr (struct ivopts_data *data, tree inv_expr) replacement of the final value of the iv by a direct computation. */ static struct iv_cand * -add_candidate_1 (struct ivopts_data *data, - tree base, tree step, bool important, enum iv_position pos, - struct iv_use *use, gimple *incremented_at, - struct iv *orig_iv = NULL) +add_candidate_1 (struct ivopts_data *data, tree base, tree step, bool important, + enum iv_position pos, struct iv_use *use, + gimple *incremented_at, struct iv *orig_iv = NULL, + bool doloop = false) { unsigned i; struct iv_cand *cand = NULL; @@ -3079,11 +3112,15 @@ add_candidate_1 (struct ivopts_data *data, cand->pos = pos; if (pos != IP_ORIGINAL) { - cand->var_before = create_tmp_var_raw (TREE_TYPE (base), "ivtmp"); + if (doloop) + cand->var_before = create_tmp_var_raw (TREE_TYPE (base), "doloop"); + else + cand->var_before = create_tmp_var_raw (TREE_TYPE (base), "ivtmp"); cand->var_after = cand->var_before; } cand->important = important; cand->incremented_at = incremented_at; + cand->doloop_p = doloop; data->vcands.safe_push (cand); if (!poly_int_tree_p (step)) @@ -3116,6 +3153,7 @@ add_candidate_1 (struct ivopts_data *data, } cand->important |= important; + cand->doloop_p |= doloop; /* Relate candidate to the group for which it is added. */ if (use) @@ -3209,14 +3247,16 @@ add_autoinc_candidates (struct ivopts_data *data, tree base, tree step, the end of loop. */ static void -add_candidate (struct ivopts_data *data, - tree base, tree step, bool important, struct iv_use *use, - struct iv *orig_iv = NULL) +add_candidate (struct ivopts_data *data, tree base, tree step, bool important, + struct iv_use *use, struct iv *orig_iv = NULL, + bool doloop = false) { if (ip_normal_pos (data->current_loop)) - add_candidate_1 (data, base, step, important, - IP_NORMAL, use, NULL, orig_iv); - if (ip_end_pos (data->current_loop) + add_candidate_1 (data, base, step, important, IP_NORMAL, use, NULL, orig_iv, + doloop); + /* Exclude doloop candidate here since it requires decrement then comparison + and jump, the IP_END position doesn't match. */ + if (!doloop && ip_end_pos (data->current_loop) && allow_ip_end_pos_p (data->current_loop)) add_candidate_1 (data, base, step, important, IP_END, use, NULL, orig_iv); } @@ -3724,7 +3764,7 @@ prepare_decl_rtl (tree *expr_p, int *ws, void *data) Some RTL specific checks seems unable to be checked in gimple, if any new checks or easy checks _are_ missing here, please add them. */ -static bool ATTRIBUTE_UNUSED +static bool generic_predict_doloop_p (struct ivopts_data *data) { struct loop *loop = data->current_loop; @@ -4177,6 +4217,36 @@ force_expr_to_var_cost (tree expr, bool speed) STRIP_NOPS (op0); op1 = NULL_TREE; break; + /* See add_iv_candidate_for_doloop, for doloop may_be_zero case, we + introduce COND_EXPR for IV base, need to support better cost estimation + for this COND_EXPR and tcc_comparison. */ + case COND_EXPR: + op0 = TREE_OPERAND (expr, 1); + STRIP_NOPS (op0); + op1 = TREE_OPERAND (expr, 2); + STRIP_NOPS (op1); + break; + case LT_EXPR: + case LE_EXPR: + case GT_EXPR: + case GE_EXPR: + case EQ_EXPR: + case NE_EXPR: + case UNORDERED_EXPR: + case ORDERED_EXPR: + case UNLT_EXPR: + case UNLE_EXPR: + case UNGT_EXPR: + case UNGE_EXPR: + case UNEQ_EXPR: + case LTGT_EXPR: + case MAX_EXPR: + case MIN_EXPR: + op0 = TREE_OPERAND (expr, 0); + STRIP_NOPS (op0); + op1 = TREE_OPERAND (expr, 1); + STRIP_NOPS (op1); + break; default: /* Just an arbitrary value, FIXME. */ @@ -4258,6 +4328,35 @@ force_expr_to_var_cost (tree expr, bool speed) case RSHIFT_EXPR: cost = comp_cost (add_cost (speed, mode), 0); break; + case COND_EXPR: + op0 = TREE_OPERAND (expr, 0); + STRIP_NOPS (op0); + if (op0 == NULL_TREE || TREE_CODE (op0) == SSA_NAME + || CONSTANT_CLASS_P (op0)) + cost = no_cost; + else + cost = force_expr_to_var_cost (op0, speed); + break; + case LT_EXPR: + case LE_EXPR: + case GT_EXPR: + case GE_EXPR: + case EQ_EXPR: + case NE_EXPR: + case UNORDERED_EXPR: + case ORDERED_EXPR: + case UNLT_EXPR: + case UNLE_EXPR: + case UNGT_EXPR: + case UNGE_EXPR: + case UNEQ_EXPR: + case LTGT_EXPR: + case MAX_EXPR: + case MIN_EXPR: + /* Simply use 1.5 * add cost for now, FIXME if there is some more accurate + cost evaluation way. */ + cost = comp_cost (1.5 * add_cost (speed, mode), 0); + break; default: gcc_unreachable (); @@ -4634,7 +4733,10 @@ get_computation_cost (struct ivopts_data *data, struct iv_use *use, { cost = get_address_cost (data, use, cand, &aff_inv, &aff_var, ratio, inv_vars, inv_expr, can_autoinc, speed); - return get_scaled_computation_cost_at (data, at, cost); + cost = get_scaled_computation_cost_at (data, at, cost); + /* For doloop IV cand, add on the extra cost. */ + cost += cand->doloop_p ? targetm.doloop_cost_for_address : 0; + return cost; } bool simple_inv = (aff_combination_const_p (&aff_inv) @@ -4684,6 +4786,10 @@ get_computation_cost (struct ivopts_data *data, struct iv_use *use, if (comp_inv && !integer_zerop (comp_inv)) cost += add_cost (speed, TYPE_MODE (utype)); + /* For doloop IV cand, add on the extra cost. */ + if (cand->doloop_p && use->type == USE_NONLINEAR_EXPR) + cost += targetm.doloop_cost_for_generic; + return get_scaled_computation_cost_at (data, at, cost); } @@ -5142,6 +5248,15 @@ may_eliminate_iv (struct ivopts_data *data, } } + /* For doloop IV cand, the bound would be zero. It's safe whether + may_be_zero set or not. */ + if (cand->doloop_p) + { + *bound = build_int_cst (TREE_TYPE (cand->iv->base), 0); + *comp = iv_elimination_compare (data, use); + return true; + } + cand_value_at (loop, cand, use->stmt, desc->niter, &bnd); *bound = fold_convert (TREE_TYPE (cand->iv->base), @@ -5264,6 +5379,9 @@ determine_group_iv_cost_cond (struct ivopts_data *data, inv_vars = inv_vars_elim; inv_vars_elim = NULL; inv_expr = inv_expr_elim; + /* For doloop candidate/use pair, adjust to zero cost. */ + if (group->doloop_p && cand->doloop_p && elim_cost.cost > no_cost.cost) + cost = no_cost; } else { @@ -5390,6 +5508,42 @@ relate_compare_use_with_all_cands (struct ivopts_data *data) } } +/* Add one doloop dedicated IV candidate: + - Base is (may_be_zero ? 1 : (niter + 1)). + - Step is -1. */ + +static void +add_iv_candidate_for_doloop (struct ivopts_data *data) +{ + tree_niter_desc *niter_desc = niter_for_single_dom_exit (data); + gcc_assert (niter_desc && niter_desc->assumptions); + + tree niter = niter_desc->niter; + tree ntype = TREE_TYPE (niter); + gcc_assert (TREE_CODE (ntype) == INTEGER_TYPE); + + tree may_be_zero = niter_desc->may_be_zero; + if (may_be_zero && integer_zerop (may_be_zero)) + may_be_zero = NULL_TREE; + if (may_be_zero) + { + if (COMPARISON_CLASS_P (may_be_zero)) + { + niter = fold_build3 (COND_EXPR, ntype, may_be_zero, + build_int_cst (ntype, 0), + rewrite_to_non_trapping_overflow (niter)); + } + /* Don't try to obtain the iteration count expression when may_be_zero is + integer_nonzerop (actually iteration count is one) or else. */ + else + return; + } + + tree base = fold_build2 (PLUS_EXPR, ntype, unshare_expr (niter), + build_int_cst (ntype, 1)); + add_candidate (data, base, build_int_cst (ntype, -1), true, NULL, NULL, true); +} + /* Finds the candidates for the induction variables. */ static void @@ -5398,6 +5552,10 @@ find_iv_candidates (struct ivopts_data *data) /* Add commonly used ivs. */ add_standard_iv_candidates (data); + /* Add doloop dedicate ivs. */ + if (data->doloop_use_p) + add_iv_candidate_for_doloop (data); + /* Add old induction variables. */ add_iv_candidate_for_bivs (data); @@ -5578,16 +5736,21 @@ determine_iv_cost (struct ivopts_data *data, struct iv_cand *cand) or a const set. */ if (cost_base.cost == 0) cost_base.cost = COSTS_N_INSNS (1); - cost_step = add_cost (data->speed, TYPE_MODE (TREE_TYPE (base))); - + /* Doloop decrement should be considered as zero cost. */ + if (cand->doloop_p) + cost_step = 0; + else + cost_step = add_cost (data->speed, TYPE_MODE (TREE_TYPE (base))); cost = cost_step + adjust_setup_cost (data, cost_base.cost); /* Prefer the original ivs unless we may gain something by replacing it. The reason is to make debugging simpler; so this is not relevant for artificial ivs created by other optimization passes. */ - if (cand->pos != IP_ORIGINAL - || !SSA_NAME_VAR (cand->var_before) - || DECL_ARTIFICIAL (SSA_NAME_VAR (cand->var_before))) + if ((cand->pos != IP_ORIGINAL + || !SSA_NAME_VAR (cand->var_before) + || DECL_ARTIFICIAL (SSA_NAME_VAR (cand->var_before))) + /* Prefer doloop as well. */ + && !cand->doloop_p) cost++; /* Prefer not to insert statements into latch unless there are some @@ -5832,7 +5995,8 @@ iv_ca_set_no_cp (struct ivopts_data *data, struct iv_ca *ivs, if (ivs->n_cand_uses[cid] == 0) { bitmap_clear_bit (ivs->cands, cid); - ivs->n_cands--; + if (!cp->cand->doloop_p || !targetm.have_count_reg_decr_p) + ivs->n_cands--; ivs->cand_cost -= cp->cand->cost; iv_ca_set_remove_invs (ivs, cp->cand->inv_vars, ivs->n_inv_var_uses); iv_ca_set_remove_invs (ivs, cp->cand->inv_exprs, ivs->n_inv_expr_uses); @@ -5889,7 +6053,8 @@ iv_ca_set_cp (struct ivopts_data *data, struct iv_ca *ivs, if (ivs->n_cand_uses[cid] == 1) { bitmap_set_bit (ivs->cands, cid); - ivs->n_cands++; + if (!cp->cand->doloop_p || !targetm.have_count_reg_decr_p) + ivs->n_cands++; ivs->cand_cost += cp->cand->cost; iv_ca_set_add_invs (ivs, cp->cand->inv_vars, ivs->n_inv_var_uses); iv_ca_set_add_invs (ivs, cp->cand->inv_exprs, ivs->n_inv_expr_uses); @@ -6134,6 +6299,8 @@ iv_ca_dump (struct ivopts_data *data, FILE *file, struct iv_ca *ivs) fprintf (file, " cost: %" PRId64 " (complexity %d)\n", cost.cost, cost.complexity); + fprintf (file, " reg_cost: %d\n", + ivopts_estimate_reg_pressure (data, ivs->n_invs, ivs->n_cands)); fprintf (file, " cand_cost: %" PRId64 "\n cand_group_cost: " "%" PRId64 " (complexity %d)\n", ivs->cand_cost, ivs->cand_use_cost.cost, ivs->cand_use_cost.complexity); @@ -7568,6 +7735,77 @@ determine_scaling_factor (struct ivopts_data *data, basic_block *body) } } +/* Find doloop comparison use and set its doloop_p on if found. */ + +static bool +find_doloop_use (struct ivopts_data *data) +{ + struct loop *loop = data->current_loop; + + for (unsigned i = 0; i < data->vgroups.length (); i++) + { + struct iv_group *group = data->vgroups[i]; + if (group->type == USE_COMPARE) + { + gcc_assert (group->vuses.length () == 1); + struct iv_use *use = group->vuses[0]; + gimple *stmt = use->stmt; + if (gimple_code (stmt) == GIMPLE_COND) + { + basic_block bb = gimple_bb (stmt); + edge true_edge, false_edge; + extract_true_false_edges_from_block (bb, &true_edge, &false_edge); + /* This comparison is used for loop latch. Require latch is empty + for now. */ + if ((loop->latch == true_edge->dest + || loop->latch == false_edge->dest) + && empty_block_p (loop->latch)) + { + group->doloop_p = true; + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Doloop cmp iv use: "); + print_gimple_stmt (dump_file, stmt, TDF_DETAILS); + } + return true; + } + } + } + } + + return false; +} + +/* For the targets which support doloop, to predict whether later RTL doloop + transformation will perform on this loop, further detect the doloop use and + mark the flag doloop_use_p if predicted. */ + +void +analyze_and_mark_doloop_use (struct ivopts_data *data) +{ + data->doloop_use_p = false; + + if (!flag_branch_on_count_reg) + return; + + if (!generic_predict_doloop_p (data)) + return; + + if (find_doloop_use (data)) + { + data->doloop_use_p = true; + if (dump_file && (dump_flags & TDF_DETAILS)) + { + struct loop *loop = data->current_loop; + fprintf (dump_file, + "Predict loop %d can perform" + " doloop optimization later.\n", + loop->num); + flow_loop_dump (loop, dump_file, NULL, 1); + } + } +} + /* Optimizes the LOOP. Returns true if anything changed. */ static bool @@ -7622,6 +7860,9 @@ tree_ssa_iv_optimize_loop (struct ivopts_data *data, struct loop *loop, /* Determine cost scaling factor for basic blocks in loop. */ determine_scaling_factor (data, body); + /* Analyze doloop possibility and mark the doloop use if predicted. */ + analyze_and_mark_doloop_use (data); + /* Finds candidates for the induction variables (item 2). */ find_iv_candidates (data);