From 092d6df49c0027001c3ed9343f0d1e8c02232d95 Mon Sep 17 00:00:00 2001 From: Xiong Hu Luo Date: Mon, 5 Jul 2021 03:57:11 -0500 Subject: [PATCH v4 1/2] Don't move cold code out of loop by checking bb count v4 changes: 1. Sort out profile_count comparision to function bb_cold_than_loop_preheader. 2. Update ref_in_loop_hot_body::operator () to find cold_loop before compare. 3. Split RTL invariant motion part out. 4. Remove aux changes. v3 changes: 1. Handle max_loop in determine_max_movement instead of outermost_invariant_loop. 2. Remove unnecessary changes. 3. Add for_all_locs_in_loop (loop, ref, ref_in_loop_hot_body) in can_sm_ref_p. 4. "gsi_next (&bsi);" in move_computations_worker is kept since it caused infinite loop when implementing v1 and the iteration is missed to be updated actually. v1: https://gcc.gnu.org/pipermail/gcc-patches/2021-August/576488.html v2: https://gcc.gnu.org/pipermail/gcc-patches/2021-September/579086.html v3: https://gcc.gnu.org/pipermail/gcc-patches/2021-September/580211.html There was a patch trying to avoid move cold block out of loop: https://gcc.gnu.org/pipermail/gcc/2014-November/215551.html Richard suggested to "never hoist anything from a bb with lower execution frequency to a bb with higher one in LIM invariantness_dom_walker before_dom_children". In gimple LIM analysis, add find_coldest_out_loop to move invariants to expected target loop, if profile count of the loop bb is colder than target loop preheader, it won't be hoisted out of loop. Likely for store motion, if all locations of the REF in loop is cold, don't do store motion of it. SPEC2017 performance evaluation shows 1% performance improvement for intrate GEOMEAN and no obvious regression for others. Especially, 500.perlbench_r +7.52% (Perf shows function S_regtry of perlbench is largely improved.), and 548.exchange2_r+1.98%, 526.blender_r +1.00% on P8LE. gcc/ChangeLog: (find_invariants_body): Add argument. * tree-ssa-loop-im.c (bb_colder_than_loop_preheader): New function. (find_coldest_out_loop): New function. (determine_max_movement): Use find_coldest_out_loop. (move_computations_worker): Adjust and fix iteration udpate. (execute_sm_exit): Check pointer validness. (class ref_in_loop_hot_body): New functor. (ref_in_loop_hot_body::operator): New. (can_sm_ref_p): Use for_all_locs_in_loop. gcc/testsuite/ChangeLog: * gcc.dg/tree-ssa/recip-3.c: Adjust. * gcc.dg/tree-ssa/ssa-lim-18.c: New test. * gcc.dg/tree-ssa/ssa-lim-19.c: New test. * gcc.dg/tree-ssa/ssa-lim-20.c: New test. * gcc.dg/tree-ssa/ssa-lim-21.c: New test. --- gcc/tree-ssa-loop-im.c | 85 +++++++++++++++++++++- gcc/testsuite/gcc.dg/tree-ssa/recip-3.c | 2 +- gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-18.c | 20 +++++ gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c | 27 +++++++ gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-20.c | 25 +++++++ gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-21.c | 35 +++++++++ 6 files changed, 191 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-18.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-20.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-21.c diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index 4b187c2cdaf..870e0a00512 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -417,6 +417,46 @@ movement_possibility (gimple *stmt) return ret; } +/* Compare the profile count inequality of COUNT1 and COUNT2, it is three-state + as stated in profile-count.h, FALSE is returned if inequality cannot be + decided. */ +bool bb_colder_than_loop_preheader (profile_count count1, profile_count count2) +{ + if (count1 < count2) + return true; + else + return false; +} + +/* Find coldest loop between OUTMOST_LOOP and LOOP by comapring profile count. + */ + +static class loop * +find_coldest_out_loop (class loop *outmost_loop, class loop *loop, + basic_block curr_bb) +{ + class loop *cold_loop, *min_loop; + cold_loop = min_loop = outmost_loop; + profile_count min_count = loop_preheader_edge (min_loop)->src->count; + + /* If bb_colder_than_loop_preheader returns false due to three-state + comparision, OUTMOST_LOOP is returned finally to preserve the behavior. + Otherwise, return the coldest loop between OUTMOST_LOOP and LOOP. */ + if (curr_bb + && bb_colder_than_loop_preheader (curr_bb->count, + loop_preheader_edge (loop)->src->count)) + return NULL; + + while (min_loop != loop) + { + min_loop = superloop_at_depth (loop, loop_depth (min_loop) + 1); + if (bb_colder_than_loop_preheader ( + loop_preheader_edge (min_loop)->src->count, min_count)) + cold_loop = min_loop; + } + return cold_loop; +} + /* Suppose that operand DEF is used inside the LOOP. Returns the outermost loop to that we could move the expression using DEF if it did not have other operands, i.e. the outermost loop enclosing LOOP in that the value @@ -685,7 +725,9 @@ determine_max_movement (gimple *stmt, bool must_preserve_exec) level = ALWAYS_EXECUTED_IN (bb); else level = superloop_at_depth (loop, 1); - lim_data->max_loop = level; + lim_data->max_loop = find_coldest_out_loop (level, loop, bb); + if (!lim_data->max_loop) + return false; if (gphi *phi = dyn_cast (stmt)) { @@ -1221,7 +1263,10 @@ move_computations_worker (basic_block bb) /* We do not really want to move conditionals out of the loop; we just placed it here to force its operands to be moved if necessary. */ if (gimple_code (stmt) == GIMPLE_COND) - continue; + { + gsi_next (&bsi); + continue; + } if (dump_file && (dump_flags & TDF_DETAILS)) { @@ -2887,6 +2932,35 @@ ref_indep_loop_p (class loop *loop, im_mem_ref *ref, dep_kind kind) return indep_p; } +class ref_in_loop_hot_body +{ +public: + ref_in_loop_hot_body (loop *loop_) : l (loop_) {} + bool operator () (mem_ref_loc *loc); + class loop *l; +}; + +/* Find out the coldest loop between loop L and innermost loop, compare the + hotness between current BB and coldest loop preheader by profile count. */ +bool +ref_in_loop_hot_body::operator () (mem_ref_loc *loc) +{ + basic_block curr_bb = gimple_bb (loc->stmt); + class loop *inner_loop = curr_bb->loop_father; + class loop *cold_loop = l; + if (l != inner_loop) + cold_loop = find_coldest_out_loop (l, inner_loop, curr_bb); + if (!cold_loop) + return false; + edge e = loop_preheader_edge (cold_loop); + /* If bb_colder_than_loop_preheader is false due to three-state inequality + comparision, TRUE is returned to continue perform store motion. */ + if (bb_colder_than_loop_preheader (curr_bb->count, e->src->count)) + return false; + else + return true; +} + /* Returns true if we can perform store motion of REF from LOOP. */ @@ -2941,6 +3015,13 @@ can_sm_ref_p (class loop *loop, im_mem_ref *ref) if (!ref_indep_loop_p (loop, ref, sm_war)) return false; + /* Verify whether the candidate is hot for LOOP. Only do store motion if the + candidate's profile count is hot. Statement in cold BB shouldn't be moved + out of it's loop_father, also it shouldn't be moved out of LOOP if it is + colder than LOOP's preheader. See ssa-lim-21.c. */ + if (!for_all_locs_in_loop (loop, ref, ref_in_loop_hot_body (loop))) + return false; + return true; } diff --git a/gcc/testsuite/gcc.dg/tree-ssa/recip-3.c b/gcc/testsuite/gcc.dg/tree-ssa/recip-3.c index 638bf38db8c..641c91e719e 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/recip-3.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/recip-3.c @@ -23,4 +23,4 @@ float h () F[0] += E / d; } -/* { dg-final { scan-tree-dump-times " / " 1 "recip" } } */ +/* { dg-final { scan-tree-dump-times " / " 5 "recip" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-18.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-18.c new file mode 100644 index 00000000000..7326a230b3f --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-18.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-lim2-details" } */ + +volatile int x; +void +bar (int, char *, char *); +void +foo (int *a, int n, int k) +{ + int i; + + for (i = 0; i < n; i++) + { + if (__builtin_expect (x, 0)) + bar (k / 5, "one", "two"); + a[i] = k; + } +} + +/* { dg-final { scan-tree-dump-not "out of loop 1" "lim2" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c new file mode 100644 index 00000000000..f0a99fa42b4 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-lim2-details" } */ + +volatile int x; +void +bar (int, char *, char *); +void +foo (int *a, int n, int m, int k, int s) +{ + int i; + int j; + + for (i = 0; i < m; i++) + { + if (__builtin_expect (x, 0)) + for (j = 0; j < n; j++) + { + bar (k / 5, "one", "two"); + a[s] = k; + } + a[s] = s; + } +} + +/* { dg-final { scan-tree-dump-times "out of loop 2" 4 "lim2" } } */ +/* { dg-final { scan-tree-dump-times "out of loop 1" 3 "lim2" } } */ + diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-20.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-20.c new file mode 100644 index 00000000000..bc60a040a70 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-20.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-lim2-details" } */ + +/* Test that `count' is not hoisted out of loop when bb is cold. */ + +int count; +volatile int x; + +struct obj { + int data; + struct obj *next; + +} *q; + +void +func (int m) +{ + struct obj *p; + for (int i = 0; i < m; i++) + if (__builtin_expect (x, 0)) + count++; + +} + +/* { dg-final { scan-tree-dump-not "Executing store motion of" "lim2" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-21.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-21.c new file mode 100644 index 00000000000..c38a858283f --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-21.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-lim2-details" } */ + +/* Test that `count' is not hoisted out of inner loop and outer loop when it is + in cold loop. */ + +int count; +volatile int x; + +struct obj { + int data; + int data1; + struct obj *next; +}; + +void +func (int m, int n, int k, struct obj *a) +{ + struct obj *q = a; + for (int j = 0; j < m; j++) + if (__builtin_expect (m, 0)) + for (int i = 0; i < m; i++) + { + if (__builtin_expect (x, 0)) + { + count++; + q->data += 3; /* Not hoisted out to inner loop. */ + } + count += n; + q->data1 += k; /* Not hoisted out to outer loop. */ + } +} + +/* { dg-final { scan-tree-dump-not "Executing store motion of" "lim2" } } */ + -- 2.27.0.90.geebb51ba8c