From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 22146 invoked by alias); 10 Jun 2011 17:45:30 -0000 Received: (qmail 22132 invoked by uid 22791); 10 Jun 2011 17:45:28 -0000 X-SWARE-Spam-Status: No, hits=-1.5 required=5.0 tests=AWL,BAYES_00,DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,SPF_HELO_PASS,T_RP_MATCHES_RCVD X-Spam-Check-By: sourceware.org Received: from smtp-out.google.com (HELO smtp-out.google.com) (74.125.121.67) by sourceware.org (qpsmtpd/0.43rc1) with ESMTP; Fri, 10 Jun 2011 17:45:12 +0000 Received: from hpaq5.eem.corp.google.com (hpaq5.eem.corp.google.com [172.25.149.5]) by smtp-out.google.com with ESMTP id p5AHjA5f026543; Fri, 10 Jun 2011 10:45:10 -0700 Received: from rong.mtv.corp.google.com (rong.mtv.corp.google.com [172.18.110.233]) by hpaq5.eem.corp.google.com with ESMTP id p5AHj9X0014563; Fri, 10 Jun 2011 10:45:09 -0700 Received: by rong.mtv.corp.google.com (Postfix, from userid 104659) id 076F8C4159; Fri, 10 Jun 2011 10:45:08 -0700 (PDT) To: reply@codereview.appspotmail.com, gcc-patches@gcc.gnu.org Subject: [google] limit excessive load/store motions (issue4563044) Message-Id: <20110610174509.076F8C4159@rong.mtv.corp.google.com> Date: Fri, 10 Jun 2011 17:56:00 -0000 From: xur@google.com (Rong Xu) X-System-Of-Record: true Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Archive: List-Post: List-Help: Sender: gcc-patches-owner@gcc.gnu.org X-SW-Source: 2011-06/txt/msg00867.txt.bz2 Use a counter to avoid excessive load/store motions in tree and RTL level. This recovers some of the performance loss in FDO mode for openssl. 2011-06-10 Rong Xu * gcc/tree-ssa-loop-im.c (maxmimu_lsm): New define. (find_refs_for_sm): Limit excessive lsm. * gcc/gcse.c (cfgloop.h): New include. (maximum_lsm_limit): New define. (struct loop_lsm_limit_map): Ditto. (loop_lsm_limit_map_htab): Ditto. (loops_lsm): Ditto. (dominator_info_avail_before_lsm_limit): Ditto. (compute_ld_motion_mems): Limit execssive lsm. (find_loop_lsm_limit): New functions. (adjust_loop_lsm_limit): Ditto. (init_loop_lsm_limit): Ditto. (fini_loop_lsm_limit): Ditto. (estimate_reg_pressure_before_lsm): Ditto. (loop_lsm_limit_map_hash): Ditto. (loop_lsm_limit_map_eq): Ditto. (free_loop_lsm_limit_map): Ditto. Index: gcc/tree-ssa-loop-im.c =================================================================== --- gcc/tree-ssa-loop-im.c (revision 174918) +++ gcc/tree-ssa-loop-im.c (working copy) @@ -97,6 +97,9 @@ MAX_LOOP loop. */ }; +/* limit for lsm that can be performed for one loop */ +static unsigned maximum_lsm; + /* Maps statements to their lim_aux_data. */ static struct pointer_map_t *lim_aux_data_map; @@ -2359,12 +2362,16 @@ unsigned i; bitmap_iterator bi; mem_ref_p ref; + unsigned sm_count = 0; EXECUTE_IF_AND_COMPL_IN_BITMAP (refs, sm_executed, 0, i, bi) { ref = VEC_index (mem_ref_p, memory_accesses.refs_list, i); - if (can_sm_ref_p (loop, ref)) - bitmap_set_bit (refs_to_sm, i); + if (sm_count < maximum_lsm && can_sm_ref_p (loop, ref)) + { + bitmap_set_bit (refs_to_sm, i); + ++sm_count; + } } } @@ -2524,6 +2531,10 @@ sbitmap_free (contains_call); lim_aux_data_map = pointer_map_create (); + + /* Supress execeesive store-motion. Minus target_avail_regs by 1 + for the induction varialbe. Maybe we should use even less? */ + maximum_lsm = (target_avail_regs < 1 ? 0 : target_avail_regs - 1); } /* Cleans up after the invariant motion pass. */ Index: gcc/gcse.c =================================================================== --- gcc/gcse.c (revision 174918) +++ gcc/gcse.c (working copy) @@ -171,6 +171,7 @@ #include "dbgcnt.h" #include "target.h" #include "gcse.h" +#include "cfgloop.h" /* We support GCSE via Partial Redundancy Elimination. PRE optimizations are a superset of those done by classic GCSE. @@ -543,6 +544,10 @@ static bool do_local_cprop (rtx, rtx); static int local_cprop_pass (void); static bool is_too_expensive (const char *); +static unsigned find_loop_lsm_limit (struct loop*); +static void adjust_loop_lsm_limit (struct loop*, unsigned); +static void init_loop_lsm_limit (void); +static void fini_loop_lsm_limit (void); #define GNEW(T) ((T *) gmalloc (sizeof (T))) #define GCNEW(T) ((T *) gcalloc (1, sizeof (T))) @@ -4979,6 +4984,10 @@ } } +/* The maximum number of load/store motions that can be performed for one + loop. */ +static unsigned maximum_lsm_limit; + /* Find all the 'simple' MEMs which are used in LOADs and STORES. Simple being defined as MEM loads and stores to symbols, with no side effects and no registers in the expression. For a MEM destination, we also @@ -4994,12 +5003,18 @@ basic_block bb; rtx insn; + init_loop_lsm_limit (); + pre_ldst_mems = NULL; pre_ldst_table = htab_create (13, pre_ldst_expr_hash, pre_ldst_expr_eq, NULL); FOR_EACH_BB (bb) { + struct loop *loop = bb->loop_father; + unsigned ld_motion_limit = find_loop_lsm_limit (loop); + unsigned ld_motion_count = 0; + FOR_BB_INSNS (bb, insn) { if (NONDEBUG_INSN_P (insn)) @@ -5032,12 +5047,25 @@ { ptr = ldst_entry (dest); - if (! MEM_P (src) - && GET_CODE (src) != ASM_OPERANDS - /* Check for REG manually since want_to_gcse_p - returns 0 for all REGs. */ - && can_assign_to_reg_without_clobbers_p (src)) - ptr->stores = alloc_INSN_LIST (insn, ptr->stores); + if (ld_motion_count >= ld_motion_limit) + { + if (dump_file) + fprintf (dump_file, + "suppress ld_motion in loop=%d bb=%d due" + " to reg pressure.\n", + loop->num, bb->index); + + ptr->invalid = 1; + } + else if (! MEM_P (src) + && GET_CODE (src) != ASM_OPERANDS + /* Check for REG manually since want_to_gcse_p + returns 0 for all REGs. */ + && can_assign_to_reg_without_clobbers_p (src)) + { + ptr->stores = alloc_INSN_LIST (insn, ptr->stores); + ld_motion_count++; + } else ptr->invalid = 1; } @@ -5046,9 +5074,201 @@ invalidate_any_buried_refs (PATTERN (insn)); } } + adjust_loop_lsm_limit (loop, ld_motion_count); } + + fini_loop_lsm_limit (); } +typedef struct +{ + struct loop *loop; + unsigned lsm_limit; +}loop_lsm_limit_map; + +/* hash_table that maps loop to lsm_limit */ +static htab_t loop_lsm_limit_map_htab = NULL; + +/* hash function for loop_lsm_limit_map_htab */ + +static hashval_t +loop_lsm_limit_map_hash (const void *p) +{ + const loop_lsm_limit_map *const x = (const loop_lsm_limit_map*) p; + return htab_hash_pointer (x->loop); +} + +/* hash equal function for loop_lsm_limit_map_htab */ + +static int +loop_lsm_limit_map_eq (const void *p1, const void *p2) +{ + const loop_lsm_limit_map *const ptr1 = (const loop_lsm_limit_map *) p1, + *const ptr2 = (const loop_lsm_limit_map *) p2; + + return htab_eq_pointer (ptr1->loop, ptr2->loop); +} + +/* free one entry in loop_lsm_limit_map_htab */ + +static int +free_loop_lsm_limit_map (void **slot, void *data ATTRIBUTE_UNUSED) +{ + free(*(loop_lsm_limit_map **) slot); + return 1; +} + +/* use the loops strcture to find enclosing loop for each basic_block */ +static struct loops loops_lsm; +static bool dominator_info_avail_before_lsm_limit = false; + +/* Initalization function for suppressing execessive load/store motions. + Build the loop structure so that we can count the number of motions + performed. Set the maximum number of motions for a loop. */ + +static void +init_loop_lsm_limit (void) +{ + dominator_info_avail_before_lsm_limit = dom_info_available_p(CDI_DOMINATORS); + flow_loops_find (&loops_lsm); + + loop_lsm_limit_map_htab = htab_create (59, loop_lsm_limit_map_hash, + loop_lsm_limit_map_eq, NULL); + + /* avail_regs minus the induction variables */ + maximum_lsm_limit = target_avail_regs - 1; +} + +/* Cleanup function: destroy loop structure and free space. */ + +static void +fini_loop_lsm_limit (void) +{ + basic_block bb; + + flow_loops_free (&loops_lsm); + + if (!dominator_info_avail_before_lsm_limit) + free_dominance_info (CDI_DOMINATORS); + + FOR_ALL_BB (bb) + bb->loop_father = NULL; + + if (loop_lsm_limit_map_htab) + { + htab_traverse (loop_lsm_limit_map_htab, free_loop_lsm_limit_map, NULL); + htab_delete (loop_lsm_limit_map_htab); + } +} + +/* This routine tries to find the number of registers that + (1) live across the iteration, and + (2) referenced in the loop. + those store-motions performed in tree-lim falls into this category. */ + +static unsigned +estimate_reg_pressure_before_lsm (struct loop *loop) +{ + basic_block *body; + unsigned int i; + unsigned int count; + regset live_across_iteration; + regset used_in_loop; + basic_block header, latch; + + if (!loop) + return 0; + + gcc_assert (loop != loops_lsm.tree_root); + + latch = loop->latch; + /* don't handle multiple latches */ + if (!latch) + return 0; + + header = loop->header; + gcc_assert (header); + + live_across_iteration = ALLOC_REG_SET (®_obstack); + COPY_REG_SET (live_across_iteration, DF_LR_IN (header)); + AND_REG_SET (live_across_iteration, DF_LR_OUT (latch)); + + used_in_loop = ALLOC_REG_SET (®_obstack); + CLEAR_REG_SET (used_in_loop); + + body = get_loop_body (loop); + /* this loop computes a subset of must-use */ + for (i=0; inum_nodes; i++) + { + basic_block bb = *(body + i); + + if (dominated_by_p (CDI_DOMINATORS, latch, bb)) + IOR_REG_SET (used_in_loop, &(DF_LR_BB_INFO (bb)->use)); + } + + + AND_REG_SET (live_across_iteration, used_in_loop); + + count = bitmap_count_bits (live_across_iteration); + + FREE_REG_SET (live_across_iteration); + FREE_REG_SET (used_in_loop); + + return count; +} + +/* Find the number of load/store motion we can perform without + create excessive register pressure to loop LOOP */ + +static unsigned +find_loop_lsm_limit (struct loop *loop) +{ + void **slot; + loop_lsm_limit_map t, *ret; + unsigned reg_pressure; + unsigned limit = maximum_lsm_limit; + + if (!loop || loop == loops_lsm.tree_root) + return (unsigned)-1; + + t.loop = loop; + slot = htab_find_slot (loop_lsm_limit_map_htab, &t, NO_INSERT); + if (slot) + return (*(loop_lsm_limit_map**)slot)->lsm_limit; + + /* insert this loop and initialize it */ + reg_pressure = estimate_reg_pressure_before_lsm (loop); + ret = (loop_lsm_limit_map*) gmalloc (sizeof (loop_lsm_limit_map)); + gcc_assert (ret); + + limit = (limit > reg_pressure ? limit - reg_pressure : 0); + ret->loop = loop; + ret->lsm_limit = limit; + slot = htab_find_slot (loop_lsm_limit_map_htab, ret, INSERT); + gcc_assert (slot); + (*slot) = (void**)ret; + + return limit; +} + +static void +adjust_loop_lsm_limit (struct loop *loop, unsigned performed) +{ + void **slot; + loop_lsm_limit_map t; + unsigned limit; + + if (performed == 0 || loop == NULL || loop == loops_lsm.tree_root) + return; + + t.loop = loop; + slot = htab_find_slot (loop_lsm_limit_map_htab, &t, NO_INSERT); + gcc_assert (slot); + limit = (*(loop_lsm_limit_map**)slot)->lsm_limit; + gcc_assert (limit >= performed); + (*(loop_lsm_limit_map**)slot)->lsm_limit = limit - performed; +} + /* Remove any references that have been either invalidated or are not in the expression list for pre gcse. */ -- This patch is available for review at http://codereview.appspot.com/4563044