* [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) @ 2019-03-15 8:22 JiangNing OS 2019-03-18 21:53 ` Jeff Law ` (3 more replies) 0 siblings, 4 replies; 15+ messages in thread From: JiangNing OS @ 2019-03-15 8:22 UTC (permalink / raw) To: gcc-patches [-- Attachment #1: Type: text/plain, Size: 571 bytes --] This patch is to fix a missing ifcvt opportunity in back-end. For the simple case below, if (...) x = a; /* x is memory */ /* no else */ We can generate conditional move and remove the branch as below if the target cost is acceptable. r1 = x r2 = a cmp ... csel r3, r1, r2, cond x = r3 This could be safe if x is a stack variable, and there isn't any address taken in current function, so the store speculation can be avoided. In practice, this optimization can improve a real application performance by %4 on aarch64. Thanks, -Jiangning [-- Attachment #2: csel3.patch --] [-- Type: application/octet-stream, Size: 18938 bytes --] Index: ChangeLog =================================================================== --- ChangeLog (revision 269698) +++ ChangeLog (working copy) @@ -1,3 +1,25 @@ +2019-03-15 Jiangning Liu <jiangning@os.amperecomputing.com> + + PR rtl-optimization/89430 + * cse.c (fixed_base_plus_p): Move from here... + * rtlanal.c (fixed_base_plus_p): ...to here... + * rtl.h (fixed_base_plus_p): Add extern declaration. + * ifcvt.c + (noce_process_if_block): For no else_bb and memory store ifcvt case, + early exit if only it is global and current function has address + taken. + (noce_try_cmove_arith): Do ifcvt for no else_bb case, if the variable + is on stack and dominating access can prove the newly introduce memory + access is safe. + (find_all_must_be_sfp_insns): New function. + (find_all_may_be_sfp_insns): New function. + (no_stack_address_taken): New function. + (noce_process_if_block): New function. + (noce_mem_is_on_stack): New function. + (noce_valid_for_dominating): New function. + * ifcvt.h : Add new fields. 
+ * testsuite/gcc.dg/ifcvt-6.c: New. + 2019-03-14 Jason Merrill <jason@redhat.com> Jakub Jelinek <jakub@redhat.com> Index: cse.c =================================================================== --- cse.c (revision 269698) +++ cse.c (working copy) @@ -540,7 +540,6 @@ already as part of an already processed extended basic block. */ static sbitmap cse_visited_basic_blocks; -static bool fixed_base_plus_p (rtx x); static int notreg_cost (rtx, machine_mode, enum rtx_code, int); static int preferable (int, int, int, int); static void new_basic_block (void); @@ -606,30 +605,7 @@ static const struct rtl_hooks cse_rtl_hooks = RTL_HOOKS_INITIALIZER; \f -/* Nonzero if X has the form (PLUS frame-pointer integer). */ -static bool -fixed_base_plus_p (rtx x) -{ - switch (GET_CODE (x)) - { - case REG: - if (x == frame_pointer_rtx || x == hard_frame_pointer_rtx) - return true; - if (x == arg_pointer_rtx && fixed_regs[ARG_POINTER_REGNUM]) - return true; - return false; - - case PLUS: - if (!CONST_INT_P (XEXP (x, 1))) - return false; - return fixed_base_plus_p (XEXP (x, 0)); - - default: - return false; - } -} - /* Dump the expressions in the equivalence class indicated by CLASSP. This function is used only for debugging. */ DEBUG_FUNCTION void Index: ifcvt.c =================================================================== --- ifcvt.c (revision 269698) +++ ifcvt.c (working copy) @@ -76,6 +76,15 @@ /* Whether conditional execution changes were made. */ static int cond_exec_changed_p; +/* REGNO bitmap for defs that may be stack frame address. */ +static bitmap bba_sets_may_be_sfp; + +/* REGNO bitmap for defs that must be stack frame address. */ +static bitmap bba_sets_must_be_sfp; + +/* true if current function doesn't have local address taken. */ +static bool cfun_no_stack_address_taken; + /* Forward references. 
*/ static int count_bb_insns (const_basic_block); static bool cheap_bb_rtx_cost_p (const_basic_block, profile_probability, int); @@ -99,6 +108,13 @@ edge, int); static void noce_emit_move_insn (rtx, rtx); static rtx_insn *block_has_only_trap (basic_block); +static bool noce_mem_is_on_stack (rtx_insn *a_insn, const_rtx x); +static bool noce_valid_for_dominating (basic_block bb, rtx_insn *a_insn, + const_rtx x, bool is_store); +static bool noce_mem_maybe_invalid_p (struct noce_if_info *if_info); +static void find_all_must_be_sfp_insns (void); +static void find_all_may_be_sfp_insns (void); +static bool no_stack_address_taken (void); \f /* Count the number of non-jump active insns in BB. */ @@ -2029,6 +2045,93 @@ return true; } +/* Return true if MEM x is on stack. a_insn contains x, if it exists. */ + +static bool +noce_mem_is_on_stack (rtx_insn *a_insn, const_rtx x) +{ + df_ref use; + + gcc_assert (x); + gcc_assert (MEM_P (x)); + + /* Early exits if find base is a stack register. */ + rtx a = XEXP (x, 0); + if (fixed_base_plus_p(a)) + return true; + + if (!a_insn) + return false; + + if (!reg_mentioned_p (x, a_insn)) + return false; + + /* Check if x is on stack. Assume a mem expression using registers + related to stack register is always on stack. */ + FOR_EACH_INSN_USE (use, a_insn) + if (reg_mentioned_p (DF_REF_REG (use), x) + && bitmap_bit_p (bba_sets_must_be_sfp, DF_REF_REGNO (use))) + return true; + + return false; +} + +/* Always return true, if there is a dominating write. + + When there is a dominating read from memory on stack, + 1) if x = a is a memory read, return true. + 2) if x = a is a memory write, return true if the memory is on stack. + This is the guarantee the memory is *not* readonly. 
*/ + +static bool +noce_valid_for_dominating (basic_block bb, rtx_insn *a_insn, + const_rtx x, bool is_store) +{ + rtx_insn *insn; + rtx set; + + gcc_assert (MEM_P (x)); + + FOR_BB_INSNS (bb, insn) + { + set = single_set (insn); + if (!set) + continue; + + /* Dominating store */ + if (rtx_equal_p (x, SET_DEST (set))) + return true; + + /* Dominating load */ + if (rtx_equal_p (x, SET_SRC (set))) + if (is_store && noce_mem_is_on_stack (a_insn, x)) + return true; + } + + return false; +} + +/* Return false if the memory a or b must be valid. + This function must be called before latent swap of a and b. */ + +static bool +noce_mem_maybe_invalid_p (struct noce_if_info *if_info) +{ + if (!if_info->set_b && MEM_P(if_info->orig_x)) + { + if (!if_info->else_bb && MEM_P (if_info->b)) + return !noce_valid_for_dominating (if_info->test_bb, + if_info->insn_a, + if_info->b, true); + } + + /* ??? We could handle this if we knew that a load from A or B could + not trap or fault. This is also true if we've already loaded + from the address along the path from ENTRY. */ + return (may_trap_or_fault_p (if_info->a) + || may_trap_or_fault_p (if_info->b)); +} + /* Try more complex cases involving conditional_move. */ static int @@ -2065,10 +2168,7 @@ is_mem = 1; } - /* ??? We could handle this if we knew that a load from A or B could - not trap or fault. This is also true if we've already loaded - from the address along the path from ENTRY. */ - else if (may_trap_or_fault_p (a) || may_trap_or_fault_p (b)) + else if (noce_mem_maybe_invalid_p (if_info)) return FALSE; /* if (test) x = a + b; else x = c - d; @@ -2234,7 +2334,7 @@ /* If insn to set up A clobbers any registers B depends on, try to swap insn that sets up A with the one that sets up B. If even that doesn't help, punt. 
*/ - if (modified_in_a && !modified_in_b) + if (!modified_in_a && modified_in_b) { if (!noce_emit_bb (emit_b, else_bb, b_simple)) goto end_seq_and_fail; @@ -2242,7 +2342,7 @@ if (!noce_emit_bb (emit_a, then_bb, a_simple)) goto end_seq_and_fail; } - else if (!modified_in_a) + else if (!modified_in_b) { if (!noce_emit_bb (emit_a, then_bb, a_simple)) goto end_seq_and_fail; @@ -3563,13 +3663,28 @@ } if (!set_b && MEM_P (orig_x)) - /* We want to avoid store speculation to avoid cases like - if (pthread_mutex_trylock(mutex)) - ++global_variable; - Rather than go to much effort here, we rely on the SSA optimizers, - which do a good enough job these days. */ - return FALSE; + { + /* We want to avoid store speculation to avoid cases like + if (pthread_mutex_trylock(mutex)) + ++global_variable; + Tree if conversion cannot handle this case well, and it intends to + help vectorization for loops only. */ + if (!noce_mem_is_on_stack(insn_a, orig_x)) + return FALSE; + /* For case like, + if (pthread_mutex_trylock(mutex)) + ++local_variable; + If any stack variable address is taken, potentially this local + variable could be modified by other threads and introduce store + speculation. */ + if (!cfun_no_stack_address_taken) + return FALSE; + } + + if_info->set_a = set_a; + if_info->set_b = set_b; + if (noce_try_move (if_info)) goto success; if (noce_try_ifelse_collapse (if_info)) @@ -5347,6 +5462,234 @@ return FALSE; } + + +/* Find all insns that must be stack address and store REGNO into + bitmap bba_sets_must_be_sfp. */ + +static void +find_all_must_be_sfp_insns (void) +{ + rtx_insn *a_insn; + basic_block bb; + bool sfp_found; + auto_vec<int, 1> def_count; + df_ref def, use; + + /* Calculate def counts for each insn. 
*/ + def_count.safe_grow_cleared (max_reg_num () + 1); + FOR_ALL_BB_FN (bb, cfun) + FOR_BB_INSNS (bb, a_insn) + { + if (!INSN_P (a_insn)) + continue; + + FOR_EACH_INSN_DEF (def, a_insn) + def_count[DF_REF_REGNO (def)]++; + } + + /* Iterate all insns until finding all stack pointers, and store + them into bba_sets_must_be_sfp. */ + bba_sets_must_be_sfp = BITMAP_ALLOC (®_obstack); + do + { + sfp_found = false; + FOR_ALL_BB_FN (bb, cfun) + FOR_BB_INSNS (bb, a_insn) + { + rtx sset_a = single_set (a_insn); + if (!sset_a) + continue; + rtx src = SET_SRC (sset_a); + rtx dest = SET_DEST (sset_a); + + if (!REG_P (dest)) + continue; + + /* For the case below, + Control flow: B1->B3, B2->B3 + B1: p = &local_var + B2: p = &global_var + B3: ... = *p + pointer p is an address for either local or global variable. + so we don't treat p as a stack address pointer. To make + algorithm simple, we ignore all non-SSA cases. */ + bool skip_insn = false; + unsigned int dest_regno = 0; + FOR_EACH_INSN_DEF (def, a_insn) + { + dest_regno = DF_REF_REGNO (def); + /* Skip current insn if + 1) already marked as stack pointer. + 2) or see multiple definition points. */ + if (bitmap_bit_p (bba_sets_must_be_sfp, dest_regno) + || def_count[dest_regno] > 1) + { + skip_insn = true; + break; + } + } + if (skip_insn) + continue; + + /* Handle case like "x1 = sp + offset". */ + if (fixed_base_plus_p (src)) + { + bitmap_set_bit (bba_sets_must_be_sfp, dest_regno); + sfp_found = true; + continue; + } + + /* Handle case like "x2 = x1 + offset", in which x1 is already + identified as a stack pointer. 
*/ + if ((GET_CODE (src) == PLUS || GET_CODE (src) == MINUS) + && CONST_INT_P (XEXP (src, 1))) + { + rtx x1 = XEXP (src, 0); + if (!REG_P (x1)) + continue; + + FOR_EACH_INSN_USE (use, a_insn) + { + if (!rtx_equal_p (x1, DF_REF_REG (use))) + continue; + + if (bitmap_bit_p (bba_sets_must_be_sfp, DF_REF_REGNO (use))) + { + bitmap_set_bit (bba_sets_must_be_sfp, dest_regno); + sfp_found = true; + break; + } + } + } + } + } + while (sfp_found); +} + +/* Find all insns that may be stack pointer and store REGNO into + bitmap bba_sets_may_be_sfp. We iterate all insns in current func + until no more latent insns generating stack address are found. + The collection of stack pointer bba_sets_may_be_sfp will be used + to help analyze local stack variable address taken. Stack variable + address can be passed out of current frame if only any stack pointer + is passed to hard register or stored into memory. */ + +static void +find_all_may_be_sfp_insns (void) +{ + rtx_insn *a_insn; + basic_block bb; + bool sfp_found; + bba_sets_may_be_sfp = BITMAP_ALLOC (®_obstack); + + /* Iterate all insns that may be stack pointers, and store them into + bitmap bba_sets_must_be_sfp. */ + do + { + sfp_found = false; + FOR_ALL_BB_FN (bb, cfun) + FOR_BB_INSNS (bb, a_insn) + { + /* Assume stack pointers can only be generated by single SET. */ + rtx sset_a = single_set (a_insn); + if (!sset_a) + continue; + rtx src = SET_SRC (sset_a); + rtx dest = SET_DEST (sset_a); + + /* If we see "hard_register = ...", which implies passing out + of current frame potentially, we don't collect this case. + It can be treated as address taken in no_stack_address_taken */ + if (HARD_REGISTER_P (dest)) + continue; + + /* Memory load and store are not to generate stack pointer. */ + if (MEM_P (src) || MEM_P (dest)) + continue; + + /* Skip insn that is already identified as stack pointer. 
*/ + bool skip_insn = false; + df_ref def; + unsigned int dest_regno = 0; + FOR_EACH_INSN_DEF (def, a_insn) + { + dest_regno = DF_REF_REGNO (def); + if (bitmap_bit_p (bba_sets_may_be_sfp, dest_regno)) + { + skip_insn = true; + break; + } + } + if (skip_insn) + continue; + + /* Collect all latent stack pointer insns. */ + df_ref use; + FOR_EACH_INSN_USE (use, a_insn) + { + rtx x = DF_REF_REG (use); + if (x == frame_pointer_rtx || x == hard_frame_pointer_rtx + || bitmap_bit_p (bba_sets_may_be_sfp, DF_REF_REGNO (use))) + { + bitmap_set_bit (bba_sets_may_be_sfp, dest_regno); + sfp_found = true; + break; + } + } + } + } + while (sfp_found); +} + +/* Return true if current function doesn't pass stack address out of frame. */ + +static bool +no_stack_address_taken (void) +{ + basic_block bb; + rtx_insn *a_insn; + df_ref use; + + FOR_ALL_BB_FN (bb, cfun) + FOR_BB_INSNS (bb, a_insn) + { + if (!INSN_P (a_insn)) + continue; + + rtx sset_a = single_set (a_insn); + if (!sset_a) + continue; + rtx src = SET_SRC (sset_a); + rtx dest = SET_DEST (sset_a); + + /* Skip insn that is already identified as frame pointers. */ + if (bitmap_bit_p (bba_sets_may_be_sfp, REGNO (dest))) + continue; + + FOR_EACH_INSN_USE (use, a_insn) + { + /* Check if current insn is using any stack pointer. Ignore + if it doesn't use frame pointers at all. */ + if (!bitmap_bit_p (bba_sets_may_be_sfp, DF_REF_REGNO (use))) + continue; + + /* Load cannot introduce address taken. */ + if (MEM_P (src)) + continue; + + /* Store is safe if only the src doesn't contain stack pointer. */ + if (MEM_P (dest) + && !reg_mentioned_p (DF_REF_REG (use), src)) + continue; + + /* All other cases potentially introduce address taken. */ + return false; + } + } + return true; +} + \f /* Main entry point for all if-conversion. AFTER_COMBINE is true if we are after combine pass. */ @@ -5381,6 +5724,11 @@ df_set_flags (DF_LR_RUN_DCE); + /* Prepare for stack variable anslysis. 
*/ + find_all_must_be_sfp_insns (); + find_all_may_be_sfp_insns (); + cfun_no_stack_address_taken = no_stack_address_taken (); + /* Go through each of the basic blocks looking for things to convert. If we have conditional execution, we make multiple passes to allow us to handle IF-THEN{-ELSE} blocks within other IF-THEN{-ELSE} blocks. */ @@ -5413,6 +5761,9 @@ } while (cond_exec_changed_p); + BITMAP_FREE (bba_sets_may_be_sfp); + BITMAP_FREE (bba_sets_must_be_sfp); + #ifdef IFCVT_MULTIPLE_DUMPS if (dump_file) fprintf (dump_file, "\n\n========== no more changes\n"); Index: ifcvt.h =================================================================== --- ifcvt.h (revision 269698) +++ ifcvt.h (working copy) @@ -73,6 +73,8 @@ /* The SET_SRC of INSN_A and INSN_B. */ rtx a, b; + /* The SET of INSN_A and INSN_B. */ + rtx set_a, set_b; /* The SET_DEST of INSN_A. */ rtx x; Index: rtl.h =================================================================== --- rtl.h (revision 269698) +++ rtl.h (working copy) @@ -3751,6 +3751,8 @@ #define hard_frame_pointer_rtx (global_rtl[GR_HARD_FRAME_POINTER]) #define arg_pointer_rtx (global_rtl[GR_ARG_POINTER]) +extern bool fixed_base_plus_p (rtx x); + #ifndef GENERATOR_FILE /* Return the attributes of a MEM rtx. */ static inline const struct mem_attrs * Index: rtlanal.c =================================================================== --- rtlanal.c (revision 269698) +++ rtlanal.c (working copy) @@ -341,6 +341,30 @@ return 0; } +/* Nonzero if X has the form (PLUS frame-pointer integer). 
*/ + +bool +fixed_base_plus_p (rtx x) +{ + switch (GET_CODE (x)) + { + case REG: + if (x == frame_pointer_rtx || x == hard_frame_pointer_rtx) + return true; + if (x == arg_pointer_rtx && fixed_regs[ARG_POINTER_REGNUM]) + return true; + return false; + + case PLUS: + if (!CONST_INT_P (XEXP (x, 1))) + return false; + return fixed_base_plus_p (XEXP (x, 0)); + + default: + return false; + } +} + /* Compute an approximation for the offset between the register FROM and TO for the current function, as it was at the start of the routine. */ Index: testsuite/gcc.dg/ifcvt-6.c =================================================================== --- testsuite/gcc.dg/ifcvt-6.c (revision 0) +++ testsuite/gcc.dg/ifcvt-6.c (working copy) @@ -0,0 +1,12 @@ +/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-options "-fdump-rtl-ce1 -O2" } */ + +unsigned test_ifcvt (unsigned k, unsigned b) { + unsigned a[2]; + if (b < a[k]) { + a[k] = b; + } + return a[0]+a[1]; +} + +/* { dg-final { scan-rtl-dump "if-conversion succeeded through noce_try_cmove_arith" "ce1" } } */ ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) 2019-03-15 8:22 [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) JiangNing OS @ 2019-03-18 21:53 ` Jeff Law 2019-04-30 16:00 ` Jeff Law ` (2 subsequent siblings) 3 siblings, 0 replies; 15+ messages in thread From: Jeff Law @ 2019-03-18 21:53 UTC (permalink / raw) To: JiangNing OS, gcc-patches On 3/14/19 10:46 PM, JiangNing OS wrote: > This patch is to fix a missing ifcvt opportunity in back-end. For the simple case below, > > if (...) > x = a; /* x is memory */ > /* no else */ > > We can generate conditional move and remove the branch as below if the target cost is acceptable. > > r1 = x > r2 = a > cmp ... > csel r3, r1, r2, cond > x = r3 > > This could be safe if x is a stack variable, and there isn't any address taken in current function, so the store speculation can be avoided. > > In practice, this optimization can improve a real application performance by 4% on aarch64. This seems like something that should be addressed for gcc-10 rather than gcc-9. Can we come back to it once gcc-10 development starts? jeff ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) 2019-03-15 8:22 [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) JiangNing OS 2019-03-18 21:53 ` Jeff Law @ 2019-04-30 16:00 ` Jeff Law 2019-05-05 0:11 ` JiangNing OS 2019-05-24 14:54 ` Kyrill Tkachov 2019-06-17 21:59 ` Jeff Law 3 siblings, 1 reply; 15+ messages in thread From: Jeff Law @ 2019-04-30 16:00 UTC (permalink / raw) To: JiangNing OS, gcc-patches On 3/14/19 10:46 PM, JiangNing OS wrote: > This patch is to fix a missing ifcvt opportunity in back-end. For the simple case below, > > if (...) > x = a; /* x is memory */ > /* no else */ > > We can generate conditional move and remove the branch as below if the target cost is acceptable. > > r1 = x > r2 = a > cmp ... > csel r3, r1, r2, cond > x = r3 > > This could be safe if x is a stack variable, and there isn't any address taken in current function, so the store speculation can be avoided. > > In practice, this optimization can improve a real application performance by %4 on aarch64. [ ... ] I notice that a couple weeks you'd sent a similar patch under the heading "Fixing ifcvt issue as exposed by BZ89430". Does this patch supersede the earlier patch? Jeff ^ permalink raw reply [flat|nested] 15+ messages in thread
* RE: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) 2019-04-30 16:00 ` Jeff Law @ 2019-05-05 0:11 ` JiangNing OS 0 siblings, 0 replies; 15+ messages in thread From: JiangNing OS @ 2019-05-05 0:11 UTC (permalink / raw) To: Jeff Law, gcc-patches Hi Jeff, Yes. The latter one "[PATCH] improve ifcvt optimization (PR rtl-optimization/89430)" supersedes the earlier one " Fixing ifcvt issue as exposed by BZ89430". Thanks, -Jiangning -----Original Message----- From: Jeff Law <law@redhat.com> Sent: Tuesday, April 30, 2019 11:54 PM To: JiangNing OS <jiangning@os.amperecomputing.com>; gcc-patches@gcc.gnu.org Subject: Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) On 3/14/19 10:46 PM, JiangNing OS wrote: > This patch is to fix a missing ifcvt opportunity in back-end. For the > simple case below, > > if (...) > x = a; /* x is memory */ > /* no else */ > > We can generate conditional move and remove the branch as below if the target cost is acceptable. > > r1 = x > r2 = a > cmp ... > csel r3, r1, r2, cond > x = r3 > > This could be safe if x is a stack variable, and there isn't any address taken in current function, so the store speculation can be avoided. > > In practice, this optimization can improve a real application performance by %4 on aarch64. [ ... ] I notice that a couple weeks you'd sent a similar patch under the heading "Fixing ifcvt issue as exposed by BZ89430". Does this patch supersede the earlier patch? Jeff ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) 2019-03-15 8:22 [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) JiangNing OS 2019-03-18 21:53 ` Jeff Law 2019-04-30 16:00 ` Jeff Law @ 2019-05-24 14:54 ` Kyrill Tkachov 2019-06-17 0:36 ` JiangNing OS 2019-06-17 21:59 ` Jeff Law 3 siblings, 1 reply; 15+ messages in thread From: Kyrill Tkachov @ 2019-05-24 14:54 UTC (permalink / raw) To: JiangNing OS, gcc-patches; +Cc: ebotcazou, steven, law Hi Jiangning On 3/15/19 4:46 AM, JiangNing OS wrote: > This patch is to fix a missing ifcvt opportunity in back-end. For the > simple case below, > > if (...) >     x = a; /* x is memory */ > /* no else */ > > We can generate conditional move and remove the branch as below if the > target cost is acceptable. > > r1 = x > r2 = a > cmp ... > csel r3, r1, r2, cond > x = r3 > > This could be safe if x is a stack variable, and there isn't any > address taken in current function, so the store speculation can be > avoided. > > In practice, this optimization can improve a real application > performance by 4% on aarch64. > Now that GCC 10 development is open, this should be appropriate for consideration. I've cc'ed folks who are either listed maintainers in this area or have reviewed patches in this area in my recent memory. Thanks, Kyrill > Thanks, > -Jiangning ^ permalink raw reply [flat|nested] 15+ messages in thread
* RE: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) 2019-05-24 14:54 ` Kyrill Tkachov @ 2019-06-17 0:36 ` JiangNing OS 0 siblings, 0 replies; 15+ messages in thread From: JiangNing OS @ 2019-06-17 0:36 UTC (permalink / raw) To: Kyrill Tkachov, gcc-patches; +Cc: ebotcazou, steven, law Hi, May I know any feedback comments to this patch? Thanks, -Jiangning From: Kyrill Tkachov <kyrylo.tkachov@foss.arm.com> Sent: Friday, May 24, 2019 10:55 PM To: JiangNing OS <jiangning@os.amperecomputing.com>; gcc-patches@gcc.gnu.org Cc: ebotcazou@adacore.com; steven@gcc.gnu.org; law@redhat.com Subject: Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) Hi Jiangning On 3/15/19 4:46 AM, JiangNing OS wrote: This patch is to fix a missing ifcvt opportunity in back-end. For the simple case below, if (...) x = a; /* x is memory */ /* no else */ We can generate conditional move and remove the branch as below if the target cost is acceptable. r1 = x r2 = a cmp ... csel r3, r1, r2, cond x = r3 This could be safe if x is a stack variable, and there isn't any address taken in current function, so the store speculation can be avoided. In practice, this optimization can improve a real application performance by %4 on aarch64. Now that GCC 10 development is open, this should appropriate for considering. I've cc'ed folks who are either listed maintainers in this area or have reviewed patches in this area in my recent memory. Thanks, Kyrill Thanks, -Jiangning ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) 2019-03-15 8:22 [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) JiangNing OS ` (2 preceding siblings ...) 2019-05-24 14:54 ` Kyrill Tkachov @ 2019-06-17 21:59 ` Jeff Law 2019-06-20 9:53 ` JiangNing OS 3 siblings, 1 reply; 15+ messages in thread From: Jeff Law @ 2019-06-17 21:59 UTC (permalink / raw) To: JiangNing OS, gcc-patches On 3/14/19 10:46 PM, JiangNing OS wrote: > This patch is to fix a missing ifcvt opportunity in back-end. For the simple case below, > > if (...) > x = a; /* x is memory */ > /* no else */ > > We can generate conditional move and remove the branch as below if the target cost is acceptable. > > r1 = x > r2 = a > cmp ... > csel r3, r1, r2, cond > x = r3 > > This could be safe if x is a stack variable, and there isn't any address taken in current function, so the store speculation can be avoided. So at a high level should we be doing this in gimple rather than RTL? We're going to have a lot more information about types, better infrastructure for looking at uses/defs, access to the alias oracle, we should be able to accurately distinguish between potentially shared objects vs those which are local to the thread, etc. We lose the low level costing information though. I'm still going to go through the patch and do some level of review, but I do think we need to answer the higher level question though. > > Index: ifcvt.c > =================================================================== > --- ifcvt.c (revision 269698) > +++ ifcvt.c (working copy) > @@ -2029,6 +2045,93 @@ > return true; > } > > +/* Return true if MEM x is on stack. a_insn contains x, if it exists. */ Nits: We typically refer to parameters, variables, etc in comments using upper case. You'll need to review the entire patch for these its. So perhaps the comment should be something like: /* Return true of X, a MEM expression, is on the stack. A_INSN contains X if A_INSN exists. 
*/ Just from a design standpoint, what are the consequences if this function returns true for something that isn't actually in the stack or false for something that is on the stack? > + > +static bool > +noce_mem_is_on_stack (rtx_insn *a_insn, const_rtx x) > +{ > + df_ref use; > + > + gcc_assert (x); > + gcc_assert (MEM_P (x)); > + > + /* Early exits if find base is a stack register. */ > + rtx a = XEXP (x, 0); > + if (fixed_base_plus_p(a)) > + return true; Nit: Space between the function name and its open paren for arguments. ie if (fixed_base_plus_p (a) ^ I see other instances of this nit, please review the patch and correct them. > + > + if (!a_insn) > + return false; I'm not sure what the calling profile is for this function, but this is a cheaper test, so you might consider moving it before the test of fixed_base_plus_p. > + > + if (!reg_mentioned_p (x, a_insn)) > + return false; > + > + /* Check if x is on stack. Assume a mem expression using registers > + related to stack register is always on stack. */ > + FOR_EACH_INSN_USE (use, a_insn) > + if (reg_mentioned_p (DF_REF_REG (use), x) > + && bitmap_bit_p (bba_sets_must_be_sfp, DF_REF_REGNO (use))) > + return true; > + > + return false; > +} So is X always a MEM? Just wanted to make sure I understand. reg_mentioned_p will do the right thing (using rtx_equal_p) for the comparison. > + > +/* Always return true, if there is a dominating write. > + > + When there is a dominating read from memory on stack, > + 1) if x = a is a memory read, return true. > + 2) if x = a is a memory write, return true if the memory is on stack. > + This is the guarantee the memory is *not* readonly. 
*/ > + > +static bool > +noce_valid_for_dominating (basic_block bb, rtx_insn *a_insn, > + const_rtx x, bool is_store) > +{ > + rtx_insn *insn; > + rtx set; > + > + gcc_assert (MEM_P (x)); > + > + FOR_BB_INSNS (bb, insn) > + { > + set = single_set (insn); > + if (!set) > + continue; > + > + /* Dominating store */ > + if (rtx_equal_p (x, SET_DEST (set))) > + return true; > + > + /* Dominating load */ > + if (rtx_equal_p (x, SET_SRC (set))) > + if (is_store && noce_mem_is_on_stack (a_insn, x)) > + return true; > + } > + > + return false; > +} So what would be the consequences here of returning false when in fact there was a dominating read or write? That could easily happen if the dominating read or write was not a single_set. I'm guessing that from a design standpoint you're trying to find cases where you know an object was written to within the block and does not escape. So returning false when there was a dominating write is safe. Returning true when there was not would be disastrous. Right? > @@ -5347,6 +5462,234 @@ > > return FALSE; > } > + > + > +/* Find all insns that must be stack address and store REGNO into > + bitmap bba_sets_must_be_sfp. */ > + > +static void > +find_all_must_be_sfp_insns (void) > +{ > + rtx_insn *a_insn; > + basic_block bb; > + bool sfp_found; > + auto_vec<int, 1> def_count; > + df_ref def, use; > + > + /* Calculate def counts for each insn. */ > + def_count.safe_grow_cleared (max_reg_num () + 1); > + FOR_ALL_BB_FN (bb, cfun) > + FOR_BB_INSNS (bb, a_insn) > + { > + if (!INSN_P (a_insn)) > + continue; > + > + FOR_EACH_INSN_DEF (def, a_insn) > + def_count[DF_REF_REGNO (def)]++; > + }Is there a reason why you can't use the information computed by reginfo? > + > + /* Iterate all insns until finding all stack pointers, and store > + them into bba_sets_must_be_sfp. 
*/ > + bba_sets_must_be_sfp = BITMAP_ALLOC (®_obstack); > + do > + { > + sfp_found = false; > + FOR_ALL_BB_FN (bb, cfun) > + FOR_BB_INSNS (bb, a_insn) > + { > + rtx sset_a = single_set (a_insn); > + if (!sset_a) > + continue; > + rtx src = SET_SRC (sset_a); > + rtx dest = SET_DEST (sset_a); So again, we can get things that aren't a single_set here and they'll be ignored. But I believe that's safe as you're trying to identify insns which store stack addresses into a REG. Missing a more complicated case where a stack address is stored into a REG is safe. Right? > + > + if (!REG_P (dest)) > + continue; > + > + /* For the case below, > + Control flow: B1->B3, B2->B3 > + B1: p = &local_var > + B2: p = &global_var > + B3: ... = *p > + pointer p is an address for either local or global variable. > + so we don't treat p as a stack address pointer. To make > + algorithm simple, we ignore all non-SSA cases. */ > + bool skip_insn = false; > + unsigned int dest_regno = 0; > + FOR_EACH_INSN_DEF (def, a_insn) > + { > + dest_regno = DF_REF_REGNO (def); > + /* Skip current insn if > + 1) already marked as stack pointer. > + 2) or see multiple definition points. */ > + if (bitmap_bit_p (bba_sets_must_be_sfp, dest_regno) > + || def_count[dest_regno] > 1) > + { > + skip_insn = true; > + break; > + } > + } > + if (skip_insn) > + continue; Right. Again I wonder if you should be using the info computed by regstat. You can get single use information easily from that. > + > + /* Handle case like "x1 = sp + offset". */ > + if (fixed_base_plus_p (src)) > + { > + bitmap_set_bit (bba_sets_must_be_sfp, dest_regno); > + sfp_found = true; > + continue; > + } Looks like your you've got an indentation problem here. Most likely you've got a case where you've got 8 spaces that should be replaced with a tab. > + > + /* Handle case like "x2 = x1 + offset", in which x1 is already > + identified as a stack pointer. 
*/ > + if ((GET_CODE (src) == PLUS || GET_CODE (src) == MINUS) > + && CONST_INT_P (XEXP (src, 1))) > + { > + rtx x1 = XEXP (src, 0); > + if (!REG_P (x1)) > + continue; > + > + FOR_EACH_INSN_USE (use, a_insn) > + { > + if (!rtx_equal_p (x1, DF_REF_REG (use))) > + continue; > + > + if (bitmap_bit_p (bba_sets_must_be_sfp, DF_REF_REGNO (use))) > + { > + bitmap_set_bit (bba_sets_must_be_sfp, dest_regno); > + sfp_found = true; > + break; > + } > + } So how much is there to be gained from going from this kind of localized computation to global? It shouldn't be hard to turn this into a truly global algorithm, but I'm not sure it's worth the effort. I _do_ think it's worth visiting blocks in dominator order. It's better than what you're doing, but nowhere near as expensive as a global propagation engine. Is it worth handling simple copies in the code above? > + } > + } > + } > + while (sfp_found); > +} > + > +/* Find all insns that may be stack pointer and store REGNO into > + bitmap bba_sets_may_be_sfp. We iterate all insns in current func > + until no more latent insns generating stack address are found. > + The collection of stack pointer bba_sets_may_be_sfp will be used > + to help analyze local stack variable address taken. Stack variable > + address can be passed out of current frame if only any stack pointer > + is passed to hard register or stored into memory. */ Shouldn't "may be stack pointer" be "must be stack pointer"? > + > +static void > +find_all_may_be_sfp_insns (void) > +{ > + rtx_insn *a_insn; > + basic_block bb; > + bool sfp_found; > + bba_sets_may_be_sfp = BITMAP_ALLOC (®_obstack); > + > + /* Iterate all insns that may be stack pointers, and store them into > + bitmap bba_sets_must_be_sfp. */ > + do > + { > + sfp_found = false; > + FOR_ALL_BB_FN (bb, cfun) Again, you may ultimately be better off with a dominator walk here. > + FOR_BB_INSNS (bb, a_insn) > + { > + /* Assume stack pointers can only be generated by single SET. 
*/ > + rtx sset_a = single_set (a_insn); > + if (!sset_a) > + continue; > + rtx src = SET_SRC (sset_a); > + rtx dest = SET_DEST (sset_a); I'm not sure that's a safe assumption. > + > + /* If we see "hard_register = ...", which implies passing out > + of current frame potentially, we don't collect this case. > + It can be treated as address taken in no_stack_address_taken */ > + if (HARD_REGISTER_P (dest)) > + continue; > + > + /* Memory load and store are not to generate stack pointer. */ > + if (MEM_P (src) || MEM_P (dest)) > + continue; > + > + /* Skip insn that is already identified as stack pointer. */ > + bool skip_insn = false; > + df_ref def; > + unsigned int dest_regno = 0; > + FOR_EACH_INSN_DEF (def, a_insn) > + { > + dest_regno = DF_REF_REGNO (def); > + if (bitmap_bit_p (bba_sets_may_be_sfp, dest_regno)) > + { > + skip_insn = true; > + break; > + } > + } So earlier you've verified you've got a single set. I'm not sure you need an iterator over the defs. Can't you just look at SET_DEST (sset_a) which is conveniently in DEST? I don't like the term "stack pointer" here -- most of us read "stack pointer" and we think the register holding the current stack pointer. Would "pointer into the stack" be more appropriate? > +/* Return true if current function doesn't pass stack address out of frame. */ > + > +static bool > +no_stack_address_taken (void) > +{ > + basic_block bb; > + rtx_insn *a_insn; > + df_ref use; > + > + FOR_ALL_BB_FN (bb, cfun) > + FOR_BB_INSNS (bb, a_insn) > + { > + if (!INSN_P (a_insn)) > + continue; > + > + rtx sset_a = single_set (a_insn); > + if (!sset_a) > + continue; > + rtx src = SET_SRC (sset_a); > + rtx dest = SET_DEST (sset_a); > + > + /* Skip insn that is already identified as frame pointers. */ > + if (bitmap_bit_p (bba_sets_may_be_sfp, REGNO (dest))) > + continue; > + > + FOR_EACH_INSN_USE (use, a_insn) > + { > + /* Check if current insn is using any stack pointer. Ignore > + if it doesn't use frame pointers at all. 
*/ > + if (!bitmap_bit_p (bba_sets_may_be_sfp, DF_REF_REGNO (use))) > + continue; > + > + /* Load cannot introduce address taken. */ > + if (MEM_P (src)) > + continue; > + > + /* Store is safe if only the src doesn't contain stack pointer. */ > + if (MEM_P (dest) > + && !reg_mentioned_p (DF_REF_REG (use), src)) > + continue; > + > + /* All other cases potentially introduce address taken. */ > + return false; > + } So what if you have a PARALLEL where one entry causes an escape of a pointer into the stack and another entry does some other operation? If so, don't you miss the fact that you've got an escaping pointer into the stack? I don't think you can restrict yourself to single_sets here. Jeff ^ permalink raw reply [flat|nested] 15+ messages in thread
* RE: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) 2019-06-17 21:59 ` Jeff Law @ 2019-06-20 9:53 ` JiangNing OS 2019-06-20 23:13 ` Jeff Law 0 siblings, 1 reply; 15+ messages in thread From: JiangNing OS @ 2019-06-20 9:53 UTC (permalink / raw) To: Jeff Law, gcc-patches [-- Attachment #1: Type: text/plain, Size: 16040 bytes --] Hi Jeff, Appreciate your effort to review my patch! I've updated my patch as attached. See my answers below. > in current function, so the store speculation can be avoided. > So at a high level should we be doing this in gimple rather than RTL? > We're going to have a lot more information about types, better > infrastructure for looking at uses/defs, access to the alias oracle, we should > be able to accurately distinguish between potentially shared objects vs those > which are local to the thread, etc. We lose the low level costing information > though. > > I'm still going to go through the patch and do some level of review, but I do > think we need to answer the higher level question though. > I have the following reasons, 1) Following the clue Richard B gave me before about parameter --param allow-store-data-races, I did check the middle-end pass tree-if-conv, but I think this pass at the moment doesn't work for the issue I'm trying to solve. Tree-if-conv is to do if conversion for loop, and its final goal is to help loop vectorization, while my case doesn't have a loop at all. 2) My current solution fits into current back-end if-conversion pass very well. I don't want to invent a new framework to solve this relatively small issue. Besides, this back-end patch doesn't only enhance store speculation detection, but also fix a bug in the original code. > Nits: We typically refer to parameters, variables, etc in comments using > upper case. You'll need to review the entire patch for these its. > > So perhaps the comment should be something like: > > /* Return true of X, a MEM expression, is on the stack. 
A_INSN contains > X if A_INSN exists. */ > Fixed in attached new patch. > > Just from a design standpoint, what are the consequences if this function > returns true for something that isn't actually in the stack or false for > something that is on the stack? > If noce_mem_is_on_stack returns true for something that isn't actually in the stack, it could potentially introduce store speculation, then the if-conversion optimization will be incorrect. If this function returns false for something that is on stack, it doesn't matter, because the optimization will not be triggered. > Nit: Space between the function name and its open paren for arguments. ie > > if (fixed_base_plus_p (a) > ^ > I see other instances of this nit, please review the patch and correct them. > Fixed in attached new patch. > > > + > > + if (!a_insn) > > + return false; > I'm not sure what the calling profile is for this function, but this is a cheaper > test, so you might consider moving it before the test of fixed_base_plus_p. > Fixed in attached new patch. > > > + > > + if (!reg_mentioned_p (x, a_insn)) > > + return false; > > + > > + /* Check if x is on stack. Assume a mem expression using registers > > + related to stack register is always on stack. */ > > + FOR_EACH_INSN_USE (use, a_insn) > > + if (reg_mentioned_p (DF_REF_REG (use), x) > > + && bitmap_bit_p (bba_sets_must_be_sfp, DF_REF_REGNO (use))) > > + return true; > > + > > + return false; > > +} > So is X always a MEM? Just wanted to make sure I understand. > reg_mentioned_p will do the right thing (using rtx_equal_p) for the > comparison. > Yes. X is always a MEM. There is an assertion for this in the code above. > > > + > > +/* Always return true, if there is a dominating write. > > + > > + When there is a dominating read from memory on stack, > > + 1) if x = a is a memory read, return true. > > + 2) if x = a is a memory write, return true if the memory is on stack. > > + This is the guarantee the memory is *not* readonly. 
*/ > > + > > +static bool > > +noce_valid_for_dominating (basic_block bb, rtx_insn *a_insn, > > + const_rtx x, bool is_store) { > > + rtx_insn *insn; > > + rtx set; > > + > > + gcc_assert (MEM_P (x)); > > + > > + FOR_BB_INSNS (bb, insn) > > + { > > + set = single_set (insn); > > + if (!set) > > + continue; > > + > > + /* Dominating store */ > > + if (rtx_equal_p (x, SET_DEST (set))) > > + return true; > > + > > + /* Dominating load */ > > + if (rtx_equal_p (x, SET_SRC (set))) > > + if (is_store && noce_mem_is_on_stack (a_insn, x)) > > + return true; > > + } > > + > > + return false; > > +} > So what would be the consequences here of returning false when in fact > there was a dominating read or write? That could easily happen if the > dominating read or write was not a single_set. If return false when in fact there is a dominating read or write, the optimization will not be triggered, because noce_mem_maybe_invalid_p will be true, which is playing the same role as may_trap_or_fault_p. > > I'm guessing that from a design standpoint you're trying to find cases where > you know an object was written to within the block and does not escape. So > returning false when there was a dominating write is safe. > Returning true when there was not would be disastrous. Right? > YES! You've completely understood my code! 😊 > > > > > > @@ -5347,6 +5462,234 @@ > > > > return FALSE; > > } > > + > > + > > +/* Find all insns that must be stack address and store REGNO into > > + bitmap bba_sets_must_be_sfp. */ > > + > > +static void > > +find_all_must_be_sfp_insns (void) > > +{ > > + rtx_insn *a_insn; > > + basic_block bb; > > + bool sfp_found; > > + auto_vec<int, 1> def_count; > > + df_ref def, use; > > + > > + /* Calculate def counts for each insn. 
*/ > > + def_count.safe_grow_cleared (max_reg_num () + 1); FOR_ALL_BB_FN > > + (bb, cfun) > > + FOR_BB_INSNS (bb, a_insn) > > + { > > + if (!INSN_P (a_insn)) > > + continue; > > + > > + FOR_EACH_INSN_DEF (def, a_insn) > > + def_count[DF_REF_REGNO (def)]++; > > + }Is there a reason why you can't use the information computed by > reginfo? > Fixed in attached new patch. > > > > > + > > + /* Iterate all insns until finding all stack pointers, and store > > + them into bba_sets_must_be_sfp. */ bba_sets_must_be_sfp = > > + BITMAP_ALLOC (®_obstack); do > > + { > > + sfp_found = false; > > + FOR_ALL_BB_FN (bb, cfun) > > + FOR_BB_INSNS (bb, a_insn) > > + { > > + rtx sset_a = single_set (a_insn); > > + if (!sset_a) > > + continue; > > + rtx src = SET_SRC (sset_a); > > + rtx dest = SET_DEST (sset_a); > So again, we can get things that aren't a single_set here and they'll be > ignored. But I believe that's safe as you're trying to identify insns which store > stack addresses into a REG. Missing a more complicated case where a stack > address is stored into a REG is safe. Right? > Yes. Missing some pointers to stack is safe here, because we will use it to check if a memory in if-conversion candidate doesn't have store speculation. If we miss it, the optimization will not be triggered, so there will not be risky. > > > > > + > > + if (!REG_P (dest)) > > + continue; > > + > > + /* For the case below, > > + Control flow: B1->B3, B2->B3 > > + B1: p = &local_var > > + B2: p = &global_var > > + B3: ... = *p > > + pointer p is an address for either local or global variable. > > + so we don't treat p as a stack address pointer. To make > > + algorithm simple, we ignore all non-SSA cases. */ > > + bool skip_insn = false; > > + unsigned int dest_regno = 0; > > + FOR_EACH_INSN_DEF (def, a_insn) > > + { > > + dest_regno = DF_REF_REGNO (def); > > + /* Skip current insn if > > + 1) already marked as stack pointer. > > + 2) or see multiple definition points. 
*/ > > + if (bitmap_bit_p (bba_sets_must_be_sfp, dest_regno) > > + || def_count[dest_regno] > 1) > > + { > > + skip_insn = true; > > + break; > > + } > > + } > > + if (skip_insn) > > + continue; > Right. Again I wonder if you should be using the info computed by regstat. > You can get single use information easily from that. > Yes. Fixed in attached new patch. > > + > > + /* Handle case like "x1 = sp + offset". */ > > + if (fixed_base_plus_p (src)) > > + { > > + bitmap_set_bit (bba_sets_must_be_sfp, dest_regno); > > + sfp_found = true; > > + continue; > > + } > Looks like your you've got an indentation problem here. Most likely you've > got a case where you've got 8 spaces that should be replaced with a tab. > Fixed in attached new patch. > > > + > > + /* Handle case like "x2 = x1 + offset", in which x1 is already > > + identified as a stack pointer. */ > > + if ((GET_CODE (src) == PLUS || GET_CODE (src) == MINUS) > > + && CONST_INT_P (XEXP (src, 1))) > > + { > > + rtx x1 = XEXP (src, 0); > > + if (!REG_P (x1)) > > + continue; > > + > > + FOR_EACH_INSN_USE (use, a_insn) > > + { > > + if (!rtx_equal_p (x1, DF_REF_REG (use))) > > + continue; > > + > > + if (bitmap_bit_p (bba_sets_must_be_sfp, DF_REF_REGNO > (use))) > > + { > > + bitmap_set_bit (bba_sets_must_be_sfp, dest_regno); > > + sfp_found = true; > > + break; > > + } > > + } > So how much is there to be gained from going from this kind of localized > computation to global? It shouldn't be hard to turn this into a truly global > algorithm, but I'm not sure it's worth the effort. > > I _do_ think it's worth visiting blocks in dominator order. It's better than > what you're doing, but nowhere near as expensive as a global propagation > engine. > Fixed in attached new patch using dom_walk class. Please review again. > Is it worth handling simple copies in the code above? I thought it less likely to have a copy for pointer to stack before register allocation. Right? 
> > > > + } > > + } > > + } > > + while (sfp_found); > > +} > > + > > +/* Find all insns that may be stack pointer and store REGNO into > > + bitmap bba_sets_may_be_sfp. We iterate all insns in current func > > + until no more latent insns generating stack address are found. > > + The collection of stack pointer bba_sets_may_be_sfp will be used > > + to help analyze local stack variable address taken. Stack variable > > + address can be passed out of current frame if only any stack pointer > > + is passed to hard register or stored into memory. */ > Shouldn't "may be stack pointer" be "must be stack pointer"? > There is a typo in the comments. Fixed in attached new patch. > > > + > > +static void > > +find_all_may_be_sfp_insns (void) > > +{ > > + rtx_insn *a_insn; > > + basic_block bb; > > + bool sfp_found; > > + bba_sets_may_be_sfp = BITMAP_ALLOC (®_obstack); > > + > > + /* Iterate all insns that may be stack pointers, and store them into > > + bitmap bba_sets_must_be_sfp. */ > > + do > > + { > > + sfp_found = false; > > + FOR_ALL_BB_FN (bb, cfun) > Again, you may ultimately be better off with a dominator walk here. Yes. Fixed in attached new patch. > > > > + FOR_BB_INSNS (bb, a_insn) > > + { > > + /* Assume stack pointers can only be generated by single SET. */ > > + rtx sset_a = single_set (a_insn); > > + if (!sset_a) > > + continue; > > + rtx src = SET_SRC (sset_a); > > + rtx dest = SET_DEST (sset_a); > I'm not sure that's a safe assumption. Fixed in attached new patch. > > > > + > > + /* If we see "hard_register = ...", which implies passing out > > + of current frame potentially, we don't collect this case. > > + It can be treated as address taken in no_stack_address_taken */ > > + if (HARD_REGISTER_P (dest)) > > + continue; > > + > > + /* Memory load and store are not to generate stack pointer. */ > > + if (MEM_P (src) || MEM_P (dest)) > > + continue; > > + > > + /* Skip insn that is already identified as stack pointer. 
*/ > > + bool skip_insn = false; > > + df_ref def; > > + unsigned int dest_regno = 0; > > + FOR_EACH_INSN_DEF (def, a_insn) > > + { > > + dest_regno = DF_REF_REGNO (def); > > + if (bitmap_bit_p (bba_sets_may_be_sfp, dest_regno)) > > + { > > + skip_insn = true; > > + break; > > + } > > + } > So earlier you've verified you've got a single set. I'm not sure you need an > iterator over the defs. Can't you just look at SET_DEST > (sset_a) which is conveniently in DEST? Fixed in attached new patch. > > > I don't like the term "stack pointer" here -- most of us read "stack pointer" > and we think the register holding the current stack pointer. > Would "pointer into the stack" be more appropriate? > Fixed in attached new patch. > > > +/* Return true if current function doesn't pass stack address out of > > +frame. */ > > + > > +static bool > > +no_stack_address_taken (void) > > +{ > > + basic_block bb; > > + rtx_insn *a_insn; > > + df_ref use; > > + > > + FOR_ALL_BB_FN (bb, cfun) > > + FOR_BB_INSNS (bb, a_insn) > > + { > > + if (!INSN_P (a_insn)) > > + continue; > > + > > + rtx sset_a = single_set (a_insn); > > + if (!sset_a) > > + continue; > > + rtx src = SET_SRC (sset_a); > > + rtx dest = SET_DEST (sset_a); > > + > > + /* Skip insn that is already identified as frame pointers. */ > > + if (bitmap_bit_p (bba_sets_may_be_sfp, REGNO (dest))) > > + continue; > > + > > + FOR_EACH_INSN_USE (use, a_insn) > > + { > > + /* Check if current insn is using any stack pointer. Ignore > > + if it doesn't use frame pointers at all. */ > > + if (!bitmap_bit_p (bba_sets_may_be_sfp, DF_REF_REGNO (use))) > > + continue; > > + > > + /* Load cannot introduce address taken. */ > > + if (MEM_P (src)) > > + continue; > > + > > + /* Store is safe if only the src doesn't contain stack pointer. */ > > + if (MEM_P (dest) > > + && !reg_mentioned_p (DF_REF_REG (use), src)) > > + continue; > > + > > + /* All other cases potentially introduce address taken. 
*/ > > + return false; > > + } > So what if you have a PARALLEL where one entry causes an escape of a > pointer into the stack and another entry does some other operation? If so, > don't you miss the fact that you've got an escaping pointer into the stack? I > don't think you can restrict yourself to single_sets here. Yes. You are right. I have added code to handle PARALLEL case. I handle it in a rather conservative way though. Review again, please! > > Jeff Thanks, -Jiangning [-- Attachment #2: csel4.patch --] [-- Type: application/octet-stream, Size: 20111 bytes --] diff --git a/gcc/ChangeLog b/gcc/ChangeLog index c75b08e..59c8dd6 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,25 @@ +2019-06-18 Jiangning Liu <jiangning@os.amperecomputing.com> + + PR rtl-optimization/89430 + * cse.c (fixed_base_plus_p): Move from here... + * rtlanal.c (fixed_base_plus_p): ...to here... + * rtl.h (fixed_base_plus_p): Add extern declaration. + * ifcvt.c + (noce_process_if_block): For no else_bb and memory store ifcvt case, + early exit if only it is global and current function has address + taken. + (noce_try_cmove_arith): Do ifcvt for no else_bb case, if the variable + is on stack and dominating access can prove the newly introduce memory + access is safe. + (find_all_must_be_sfp_insns): New function. + (find_all_may_be_sfp_insns): New function. + (no_stack_address_taken): New function. + (no_need_to_analyze_sfp): New function. + (noce_mem_is_on_stack): New function. + (noce_valid_for_dominating): New function. + * ifcvt.h : Add new fields. + * testsuite/gcc.dg/ifcvt-6.c: New. + 2019-06-17 Jakub Jelinek <jakub@redhat.com> * omp-low.c (struct omp_context): Add scan_inclusive field. diff --git a/gcc/cse.c b/gcc/cse.c index 35840a6d..a6fcf92 100644 --- a/gcc/cse.c +++ b/gcc/cse.c @@ -540,7 +540,6 @@ static bitmap cse_ebb_live_in, cse_ebb_live_out; already as part of an already processed extended basic block. 
*/ static sbitmap cse_visited_basic_blocks; -static bool fixed_base_plus_p (rtx x); static int notreg_cost (rtx, machine_mode, enum rtx_code, int); static int preferable (int, int, int, int); static void new_basic_block (void); @@ -606,29 +605,6 @@ static machine_mode cse_cc_succs (basic_block, basic_block, rtx, rtx, static const struct rtl_hooks cse_rtl_hooks = RTL_HOOKS_INITIALIZER; \f -/* Nonzero if X has the form (PLUS frame-pointer integer). */ - -static bool -fixed_base_plus_p (rtx x) -{ - switch (GET_CODE (x)) - { - case REG: - if (x == frame_pointer_rtx || x == hard_frame_pointer_rtx) - return true; - if (x == arg_pointer_rtx && fixed_regs[ARG_POINTER_REGNUM]) - return true; - return false; - - case PLUS: - if (!CONST_INT_P (XEXP (x, 1))) - return false; - return fixed_base_plus_p (XEXP (x, 0)); - - default: - return false; - } -} /* Dump the expressions in the equivalence class indicated by CLASSP. This function is used only for debugging. */ diff --git a/gcc/ifcvt.c b/gcc/ifcvt.c index 7b2f6e6..d04040e 100644 --- a/gcc/ifcvt.c +++ b/gcc/ifcvt.c @@ -46,6 +46,7 @@ #include "rtl-iter.h" #include "ifcvt.h" #include "params.h" +#include "domwalk.h" #ifndef MAX_CONDITIONAL_EXECUTE #define MAX_CONDITIONAL_EXECUTE \ @@ -76,6 +77,18 @@ static int num_true_changes; /* Whether conditional execution changes were made. */ static int cond_exec_changed_p; +/* REGNO bitmap for defs that may be stack frame address. */ +static bitmap bba_sets_may_be_sfp; + +/* REGNO bitmap for defs that must be stack frame address. */ +static bitmap bba_sets_must_be_sfp; + +/* true if current function doesn't have local address taken. */ +static bool cfun_no_stack_address_taken; + +/* A flag to indicate if a pointer to stack is found. */ +static bool sfp_found; + /* Forward references. 
*/ static int count_bb_insns (const_basic_block); static bool cheap_bb_rtx_cost_p (const_basic_block, profile_probability, int); @@ -99,6 +112,14 @@ static int dead_or_predicable (basic_block, basic_block, basic_block, edge, int); static void noce_emit_move_insn (rtx, rtx); static rtx_insn *block_has_only_trap (basic_block); +static bool noce_mem_is_on_stack (rtx_insn *a_insn, const_rtx x); +static bool noce_valid_for_dominating (basic_block bb, rtx_insn *a_insn, + const_rtx x, bool is_store); +static bool noce_mem_maybe_invalid_p (struct noce_if_info *if_info); +static bool no_need_to_analyze_sfp (const_rtx set); +static void find_all_must_be_sfp_insns (void); +static void find_all_may_be_sfp_insns (void); +static bool no_stack_address_taken (void); \f /* Count the number of non-jump active insns in BB. */ @@ -2029,6 +2050,94 @@ noce_emit_bb (rtx last_insn, basic_block bb, bool simple) return true; } +/* Return true of X, a MEM expression, is on the stack. A_INSN contains + X if A_INSN exists. */ + +static bool +noce_mem_is_on_stack (rtx_insn *a_insn, const_rtx x) +{ + df_ref use; + + gcc_assert (x); + gcc_assert (MEM_P (x)); + + if (!a_insn) + return false; + + /* Early exits if find base is a stack register. */ + rtx a = XEXP (x, 0); + if (fixed_base_plus_p (a)) + return true; + + if (!reg_mentioned_p (x, a_insn)) + return false; + + /* Check if x is on stack. Assume a mem expression using registers + related to stack register is always on stack. */ + FOR_EACH_INSN_USE (use, a_insn) + if (reg_mentioned_p (DF_REF_REG (use), x) + && bitmap_bit_p (bba_sets_must_be_sfp, DF_REF_REGNO (use))) + return true; + + return false; +} + +/* Always return true, if there is a dominating store. + + When there is a dominating load from memory on stack, + 1) if A_INSN is a memory read, return true. + 2) if A_INSN is a memory write, return true if the memory is on stack. + This is to guarantee the memory is *not* readonly. 
*/ + +static bool +noce_valid_for_dominating (basic_block bb, rtx_insn *a_insn, + const_rtx x, bool is_store) +{ + rtx_insn *insn; + rtx set; + + gcc_assert (MEM_P (x)); + + FOR_BB_INSNS (bb, insn) + { + set = single_set (insn); + if (!set) + continue; + + /* Dominating store. */ + if (rtx_equal_p (x, SET_DEST (set))) + return true; + + /* Dominating load. */ + if (rtx_equal_p (x, SET_SRC (set))) + if (is_store && noce_mem_is_on_stack (a_insn, x)) + return true; + } + + return false; +} + +/* Return false if the memory A or B must be valid. + This function must be called before latent swap of A and B. */ + +static bool +noce_mem_maybe_invalid_p (struct noce_if_info *if_info) +{ + if (!if_info->set_b && MEM_P (if_info->orig_x)) + { + if (!if_info->else_bb && MEM_P (if_info->b)) + return !noce_valid_for_dominating (if_info->test_bb, + if_info->insn_a, + if_info->b, true); + } + + /* ??? We could handle this if we knew that a load from A or B could + not trap or fault. This is also true if we've already loaded + from the address along the path from ENTRY. */ + return (may_trap_or_fault_p (if_info->a) + || may_trap_or_fault_p (if_info->b)); +} + /* Try more complex cases involving conditional_move. */ static int @@ -2065,10 +2174,7 @@ noce_try_cmove_arith (struct noce_if_info *if_info) is_mem = 1; } - /* ??? We could handle this if we knew that a load from A or B could - not trap or fault. This is also true if we've already loaded - from the address along the path from ENTRY. */ - else if (may_trap_or_fault_p (a) || may_trap_or_fault_p (b)) + else if (noce_mem_maybe_invalid_p (if_info)) return FALSE; /* if (test) x = a + b; else x = c - d; @@ -2234,7 +2340,7 @@ noce_try_cmove_arith (struct noce_if_info *if_info) /* If insn to set up A clobbers any registers B depends on, try to swap insn that sets up A with the one that sets up B. If even that doesn't help, punt. 
*/ - if (modified_in_a && !modified_in_b) + if (!modified_in_a && modified_in_b) { if (!noce_emit_bb (emit_b, else_bb, b_simple)) goto end_seq_and_fail; @@ -2242,7 +2348,7 @@ noce_try_cmove_arith (struct noce_if_info *if_info) if (!noce_emit_bb (emit_a, then_bb, a_simple)) goto end_seq_and_fail; } - else if (!modified_in_a) + else if (!modified_in_b) { if (!noce_emit_bb (emit_a, then_bb, a_simple)) goto end_seq_and_fail; @@ -3563,12 +3669,27 @@ noce_process_if_block (struct noce_if_info *if_info) } if (!set_b && MEM_P (orig_x)) - /* We want to avoid store speculation to avoid cases like - if (pthread_mutex_trylock(mutex)) - ++global_variable; - Rather than go to much effort here, we rely on the SSA optimizers, - which do a good enough job these days. */ - return FALSE; + { + /* We want to avoid store speculation to avoid cases like + if (pthread_mutex_trylock (mutex)) + ++global_variable; + Tree if conversion cannot handle this case well, and it intends to + help vectorization for loops only. */ + if (!noce_mem_is_on_stack (insn_a, orig_x)) + return FALSE; + + /* For case like, + if (pthread_mutex_trylock (mutex)) + ++local_variable; + If any stack variable address is taken, potentially this local + variable could be modified by other threads and introduce store + speculation. 
*/ + if (!cfun_no_stack_address_taken) + return FALSE; + } + + if_info->set_a = set_a; + if_info->set_b = set_b; if (noce_try_move (if_info)) goto success; @@ -5347,6 +5468,297 @@ dead_or_predicable (basic_block test_bb, basic_block merge_bb, return FALSE; } + +class must_be_sfp_dom_walker : public dom_walker +{ +public: + must_be_sfp_dom_walker (cdi_direction direction) : dom_walker (direction) + {} + + virtual edge before_dom_children (basic_block); +}; + +edge +must_be_sfp_dom_walker::before_dom_children (basic_block bb) +{ + rtx_insn *a_insn; + + FOR_BB_INSNS (bb, a_insn) + { + df_ref def, use; + + rtx sset_a = single_set (a_insn); + if (!sset_a) + continue; + rtx src = SET_SRC (sset_a); + rtx dest = SET_DEST (sset_a); + + if (!REG_P (dest)) + continue; + + /* For the case below, + Control flow: B1->B3, B2->B3 + B1: p = &local_var + B2: p = &global_var + B3: ... = *p + pointer p is an address for either local or global variable. + so we don't treat p as a stack address pointer. To make + algorithm simple, we ignore all non-SSA cases. */ + bool skip_insn = false; + unsigned int dest_regno = 0; + FOR_EACH_INSN_DEF (def, a_insn) + { + dest_regno = DF_REF_REGNO (def); + /* Skip current insn if + 1) it is already marked as a pointer to the stack. + 2) or we see multiple definition points. */ + if (bitmap_bit_p (bba_sets_must_be_sfp, dest_regno) + || REG_N_SETS (dest_regno) > 1) + { + skip_insn = true; + break; + } + } + if (skip_insn) + continue; + + /* Handle case like "x1 = sp + offset". */ + if (fixed_base_plus_p (src)) + { + bitmap_set_bit (bba_sets_must_be_sfp, dest_regno); + sfp_found = true; + continue; + } + + /* Handle case like "x2 = x1 + offset", in which x1 is already + identified as a pointer to the stack. 
*/ + if ((GET_CODE (src) == PLUS || GET_CODE (src) == MINUS) + && CONST_INT_P (XEXP (src, 1))) + { + rtx x1 = XEXP (src, 0); + if (!REG_P (x1)) + continue; + + FOR_EACH_INSN_USE (use, a_insn) + { + if (!rtx_equal_p (x1, DF_REF_REG (use))) + continue; + + if (bitmap_bit_p (bba_sets_must_be_sfp, DF_REF_REGNO (use))) + { + bitmap_set_bit (bba_sets_must_be_sfp, dest_regno); + sfp_found = true; + break; + } + } + } + } + return NULL; +} + +/* Find all insns that must be stack address and store REGNO into + bitmap BBA_SETS_MUST_BE_SFP. */ + +static void +find_all_must_be_sfp_insns (void) +{ + do { + sfp_found = false; + must_be_sfp_dom_walker (CDI_DOMINATORS) + .walk (ENTRY_BLOCK_PTR_FOR_FN (cfun)); + } while (sfp_found); +} + +class may_be_sfp_dom_walker : public dom_walker +{ +public: + may_be_sfp_dom_walker (cdi_direction direction) : dom_walker (direction) + {} + + virtual edge before_dom_children (basic_block); +}; + +/* Return true if we are sure SET doesn't need to be further analyzed + to calcluate BBA_SETS_MAY_BE_SFP. */ + +static bool no_need_to_analyze_sfp (const_rtx set) +{ + rtx src = SET_SRC (set); + rtx dest = SET_DEST (set); + + /* Skip insn that is already identified as a pointer to the stack. */ + unsigned int dest_regno = REGNO (dest); + if (bitmap_bit_p (bba_sets_may_be_sfp, dest_regno)) + return true; + + /* If we see "hard_register = ...", which implies passing out + current frame potentially, we don't collect this case. + can be treated as address taken in no_stack_address_taken. */ + if (HARD_REGISTER_P (dest)) + return true; + + /* Memory load and store are not to generate a pointer to the stack. 
*/ + if (MEM_P (src) || MEM_P (dest)) + return true; + + return false; +} + +edge +may_be_sfp_dom_walker::before_dom_children (basic_block bb) +{ + rtx_insn *a_insn; + const_rtx pat; + + FOR_BB_INSNS (bb, a_insn) + { + if (!INSN_P (a_insn)) + continue; + + pat = PATTERN (a_insn); + if (GET_CODE (pat) == SET) + { + if (no_need_to_analyze_sfp (pat)) + continue; + + /* Collect all latent insns that generate pointers to the stack. */ + df_ref use; + FOR_EACH_INSN_USE (use, a_insn) + { + rtx x = DF_REF_REG (use); + if (x == frame_pointer_rtx || x == hard_frame_pointer_rtx + || bitmap_bit_p (bba_sets_may_be_sfp, DF_REF_REGNO (use))) + { + bitmap_set_bit (bba_sets_may_be_sfp, REGNO (SET_DEST (pat))); + sfp_found = true; + break; + } + } + } + else if (GET_CODE (pat) == PARALLEL) + { + for (int i = 0; i < XVECLEN (pat, 0); i++) + { + rtx sub = XVECEXP (pat, 0, i); + if (GET_CODE (sub) == SET) + { + if (no_need_to_analyze_sfp (sub)) + continue; + + /* Aggressively mark it as a pointer to the stack to + avoid any local stack address to escape out of current + frame. */ + bitmap_set_bit (bba_sets_may_be_sfp, + REGNO (SET_DEST (sub))); + sfp_found = true; + } + } + } + } + + return NULL; +} + +/* Find all insns that may be pointers to the stack and store REGNO into + bitmap BBA_SETS_MAY_BE_SFP. We iterate all insns in current func + until no more latent insns generating stack address are found. The + collection of pointers to the stack BBA_SETS_MAY_BE_SFP will be used + to help analyze local stack variable address taken. Stack variable + address can be passed out of current frame if only any pointer to the + stack is passed to hard register or stored into memory. */ + +static void +find_all_may_be_sfp_insns (void) +{ + do { + sfp_found = false; + may_be_sfp_dom_walker (CDI_DOMINATORS) + .walk (ENTRY_BLOCK_PTR_FOR_FN (cfun)); + } while (sfp_found); +} + +/* Return true if current function doesn't pass stack address out of frame. 
*/ + +static bool +no_stack_address_taken (void) +{ + basic_block bb; + rtx_insn *a_insn; + const_rtx pat; + df_ref use; + rtx src, dest; + + FOR_ALL_BB_FN (bb, cfun) + FOR_BB_INSNS (bb, a_insn) + { + if (!INSN_P (a_insn)) + continue; + + pat = PATTERN (a_insn); + if (GET_CODE (pat) == SET) + { + src = SET_SRC (pat); + dest = SET_DEST (pat); + + /* Skip if it is already identified as pointers to the stack. */ + if (bitmap_bit_p (bba_sets_may_be_sfp, REGNO (dest))) + continue; + + /* Load cannot introduce address taken. */ + if (MEM_P (src)) + continue; + + FOR_EACH_INSN_USE (use, a_insn) + { + /* Skip if it doesn't use any pointers to the stack at all. */ + if (!bitmap_bit_p (bba_sets_may_be_sfp, DF_REF_REGNO (use))) + continue; + + /* Store is safe if src doesn't contain any pointers to the + stack. */ + if (MEM_P (dest) + && !reg_mentioned_p (DF_REF_REG (use), src)) + continue; + + /* All other cases potentially introduce address taken. */ + return false; + } + } + else if (GET_CODE (pat) == PARALLEL) + { + for (int i = 0; i < XVECLEN (pat, 0); i++) + { + rtx sub = XVECEXP (pat, 0, i); + src = SET_SRC (sub); + dest = SET_DEST (sub); + + if (GET_CODE (sub) == SET) + { + /* Skip if it is already identified as pointers to + the stack. */ + if (bitmap_bit_p (bba_sets_may_be_sfp, REGNO (dest))) + continue; + + /* Load cannot introduce address taken. */ + if (MEM_P (src)) + continue; + + /* For all other cases, conservatively treat it as + assdress taken if only a pointer to the stack is + used. */ + FOR_EACH_INSN_USE (use, a_insn) + { + if (bitmap_bit_p (bba_sets_may_be_sfp, + DF_REF_REGNO (use))) + return false; + } + } + } + } + } + return true; +} + \f /* Main entry point for all if-conversion. AFTER_COMBINE is true if we are after combine pass. 
*/ @@ -5381,6 +5793,18 @@ if_convert (bool after_combine) df_set_flags (DF_LR_RUN_DCE); + bba_sets_must_be_sfp = BITMAP_ALLOC (&reg_obstack); + bba_sets_may_be_sfp = BITMAP_ALLOC (&reg_obstack); + + /* Prepare for stack variable analysis. */ + regstat_init_n_sets_and_refs (); + calculate_dominance_info (CDI_DOMINATORS); + find_all_must_be_sfp_insns (); + find_all_may_be_sfp_insns (); + cfun_no_stack_address_taken = no_stack_address_taken (); + free_dominance_info (CDI_DOMINATORS); + regstat_free_n_sets_and_refs (); + /* Go through each of the basic blocks looking for things to convert. If we have conditional execution, we make multiple passes to allow us to handle IF-THEN{-ELSE} blocks within other IF-THEN{-ELSE} blocks. */ @@ -5413,6 +5837,9 @@ if_convert (bool after_combine) } while (cond_exec_changed_p); + BITMAP_FREE (bba_sets_may_be_sfp); + BITMAP_FREE (bba_sets_must_be_sfp); + #ifdef IFCVT_MULTIPLE_DUMPS if (dump_file) fprintf (dump_file, "\n\n========== no more changes\n"); diff --git a/gcc/ifcvt.h b/gcc/ifcvt.h index 153ad96..46a32ad 100644 --- a/gcc/ifcvt.h +++ b/gcc/ifcvt.h @@ -73,6 +73,8 @@ struct noce_if_info /* The SET_SRC of INSN_A and INSN_B. */ rtx a, b; + /* The SET of INSN_A and INSN_B. */ + rtx set_a, set_b; /* The SET_DEST of INSN_A. */ rtx x; diff --git a/gcc/rtl.h b/gcc/rtl.h index 31fba82..5c5e018 100644 --- a/gcc/rtl.h +++ b/gcc/rtl.h @@ -3754,6 +3754,8 @@ extern struct target_rtl *this_target_rtl; #define hard_frame_pointer_rtx (global_rtl[GR_HARD_FRAME_POINTER]) #define arg_pointer_rtx (global_rtl[GR_ARG_POINTER]) +extern bool fixed_base_plus_p (rtx x); + #ifndef GENERATOR_FILE /* Return the attributes of a MEM rtx. */ static inline const struct mem_attrs * diff --git a/gcc/rtlanal.c b/gcc/rtlanal.c index 268a387..48c5c2f 100644 --- a/gcc/rtlanal.c +++ b/gcc/rtlanal.c @@ -341,6 +341,30 @@ rtx_varies_p (const_rtx x, bool for_alias) return 0; } +/* Nonzero if X has the form (PLUS frame-pointer integer). 
*/ + +bool +fixed_base_plus_p (rtx x) +{ + switch (GET_CODE (x)) + { + case REG: + if (x == frame_pointer_rtx || x == hard_frame_pointer_rtx) + return true; + if (x == arg_pointer_rtx && fixed_regs[ARG_POINTER_REGNUM]) + return true; + return false; + + case PLUS: + if (!CONST_INT_P (XEXP (x, 1))) + return false; + return fixed_base_plus_p (XEXP (x, 0)); + + default: + return false; + } +} + /* Compute an approximation for the offset between the register FROM and TO for the current function, as it was at the start of the routine. */ diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 27a522e..c50cbce 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,7 @@ +2019-06-18 Jiangning Liu <jiangning.liu@os.amperecomputing.com> + + * gcc.dg/ifcvt-6.c: New test. + 2019-06-17 Jakub Jelinek <jakub@redhat.com> * gcc.dg/vect/vect-simd-8.c: New test. diff --git a/gcc/testsuite/gcc.dg/ifcvt-6.c b/gcc/testsuite/gcc.dg/ifcvt-6.c new file mode 100644 index 0000000..0c9dac42 --- /dev/null +++ b/gcc/testsuite/gcc.dg/ifcvt-6.c @@ -0,0 +1,12 @@ +/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-options "-fdump-rtl-ce1 -O2" } */ + +unsigned test_ifcvt (unsigned k, unsigned b) { + unsigned a[2]; + if (b < a[k]) { + a[k] = b; + } + return a[0]+a[1]; +} + +/* { dg-final { scan-rtl-dump "if-conversion succeeded through noce_try_cmove_arith" "ce1" } } */ ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) 2019-06-20 9:53 ` JiangNing OS @ 2019-06-20 23:13 ` Jeff Law 2019-06-21 12:24 ` Richard Biener 0 siblings, 1 reply; 15+ messages in thread From: Jeff Law @ 2019-06-20 23:13 UTC (permalink / raw) To: JiangNing OS, gcc-patches On 6/20/19 3:53 AM, JiangNing OS wrote: > Hi Jeff, > > Appreciate your effort to review my patch! I've updated my patch as attached. See my answers below. > >> in current function, so the store speculation can be avoided. >> So at a high level should we be doing this in gimple rather than RTL? >> We're going to have a lot more information about types, better >> infrastructure for looking at uses/defs, access to the alias oracle, we should >> be able to accurately distinguish between potentially shared objects vs those >> which are local to the thread, etc. We lose the low level costing information >> though. >> >> I'm still going to go through the patch and do some level of review, but I do >> think we need to answer the higher level question though. >> > I have the following reasons, > > 1) Following the clue Richard B gave me before about parameter --param allow-store-data-races, > I did check the middle-end pass tree-if-conv, but I think this pass at the moment doesn't work > for the issue I'm trying to solve. Tree-if-conv is to do if conversion for loop, and its final goal is to > help loop vectorization, while my case doesn't have a loop at all. I think the fact that it's focused so much on loops is a historical accident. We certainly have a variety of places in the gimple optimizers that do if-conversion, and they're not all in tree-if-conv :( For example, some are done in tree-ssa-phiopt. 
In the gimple optimizers the testcase from 89430 is going to look something like this: > ; basic block 2, loop depth 0, count 1073741824 (estimated locally), maybe hot > ;; prev block 0, next block 3, flags: (NEW, REACHABLE, VISITED) > ;; pred: ENTRY [always] count:1073741824 (estimated locally) (FALLTHRU,EXECUTABLE) > a.0_1 = a; > _2 = (long unsigned int) k_8(D); > _3 = _2 * 4; > _4 = a.0_1 + _3; > _5 = *_4; > if (_5 > b_9(D)) > goto <bb 3>; [50.00%] > else > goto <bb 4>; [50.00%] > ;; succ: 3 [50.0% (guessed)] count:536870912 (estimated locally) (TRUE_VALUE,EXECUTABLE) > ;; 4 [50.0% (guessed)] count:536870912 (estimated locally) (FALSE_VALUE,EXECUTABLE) > > ;; basic block 3, loop depth 0, count 536870913 (estimated locally), maybe hot > ;; prev block 2, next block 4, flags: (NEW, REACHABLE, VISITED) > ;; pred: 2 [50.0% (guessed)] count:536870912 (estimated locally) (TRUE_VALUE,EXECUTABLE) > *_4 = b_9(D); > ;; succ: 4 [always] count:536870913 (estimated locally) (FALLTHRU,EXECUTABLE) > > ;; basic block 4, loop depth 0, count 1073741824 (estimated locally), maybe hot > ;; prev block 3, next block 1, flags: (NEW, REACHABLE, VISITED) > ;; pred: 3 [always] count:536870913 (estimated locally) (FALLTHRU,EXECUTABLE) > ;; 2 [50.0% (guessed)] count:536870912 (estimated locally) (FALSE_VALUE,EXECUTABLE) > return; That looks like a pretty easy form to analyze. I'd suggest looking through tree-ssa-phiopt.c closely. There's several transformations in there that share similarities with yours. > 2) My current solution fits into current back-end if-conversion pass very well. I don't want to invent > a new framework to solve this relatively small issue. Besides, this back-end patch doesn't only > enhance store speculation detection, but also fix a bug in the original code. Understood, but I still wonder if we're better off addressing this in gimple. 
>> Just from a design standpoint, what are the consequences if this function >> returns true for something that isn't actually in the stack or false for >> something that is on the stack? >> > If noce_mem_is_on_stack returns true for something that isn't actually in the stack, > it could potentially introduce store speculation, then the if-conversion optimization > will be incorrect. If this function returns false for something that is on stack, it doesn't > matter, because the optimization will not be triggered. OK. That's what I expected. > > > > >> >>> + >>> +/* Always return true, if there is a dominating write. >>> + >>> + When there is a dominating read from memory on stack, >>> + 1) if x = a is a memory read, return true. >>> + 2) if x = a is a memory write, return true if the memory is on stack. >>> + This is the guarantee the memory is *not* readonly. */ >>> + >>> +static bool >>> +noce_valid_for_dominating (basic_block bb, rtx_insn *a_insn, >>> + const_rtx x, bool is_store) { >>> + rtx_insn *insn; >>> + rtx set; >>> + >>> + gcc_assert (MEM_P (x)); >>> + >>> + FOR_BB_INSNS (bb, insn) >>> + { >>> + set = single_set (insn); >>> + if (!set) >>> + continue; >>> + >>> + /* Dominating store */ >>> + if (rtx_equal_p (x, SET_DEST (set))) >>> + return true; >>> + >>> + /* Dominating load */ >>> + if (rtx_equal_p (x, SET_SRC (set))) >>> + if (is_store && noce_mem_is_on_stack (a_insn, x)) >>> + return true; >>> + } >>> + >>> + return false; >>> +} >> So what would be the consequences here of returning false when in fact >> there was a dominating read or write? That could easily happen if the >> dominating read or write was not a single_set. > If return false when in fact there is a dominating read or write, the optimization will not > be triggered, because noce_mem_maybe_invalid_p will be true, which is playing the same > role as may_trap_or_fault_p. 
> >> >> I'm guessing that from a design standpoint you're trying to find cases where >> you know an object was written to within the block and does not escape. So >> returning false when there was a dominating write is safe. >> Returning true when there was not would be disastrous. Right? >> > YES! You've completely understood my code! ð So in this context, only handling single_sets is safe. Perfect. > >> >> >> >>> + >>> + /* Iterate all insns until finding all stack pointers, and store >>> + them into bba_sets_must_be_sfp. */ bba_sets_must_be_sfp = >>> + BITMAP_ALLOC (®_obstack); do >>> + { >>> + sfp_found = false; >>> + FOR_ALL_BB_FN (bb, cfun) >>> + FOR_BB_INSNS (bb, a_insn) >>> + { >>> + rtx sset_a = single_set (a_insn); >>> + if (!sset_a) >>> + continue; >>> + rtx src = SET_SRC (sset_a); >>> + rtx dest = SET_DEST (sset_a); >> So again, we can get things that aren't a single_set here and they'll be >> ignored. But I believe that's safe as you're trying to identify insns which store >> stack addresses into a REG. Missing a more complicated case where a stack >> address is stored into a REG is safe. Right? >> > Yes. Missing some pointers to stack is safe here, because we will use it to check if a memory > in if-conversion candidate doesn't have store speculation. If we miss it, the optimization will > not be triggered, so there will not be risky. Thanks for the confirmation. >> >>> + >>> + /* Handle case like "x2 = x1 + offset", in which x1 is already >>> + identified as a stack pointer. 
*/ >>> + if ((GET_CODE (src) == PLUS || GET_CODE (src) == MINUS) >>> + && CONST_INT_P (XEXP (src, 1))) >>> + { >>> + rtx x1 = XEXP (src, 0); >>> + if (!REG_P (x1)) >>> + continue; >>> + >>> + FOR_EACH_INSN_USE (use, a_insn) >>> + { >>> + if (!rtx_equal_p (x1, DF_REF_REG (use))) >>> + continue; >>> + >>> + if (bitmap_bit_p (bba_sets_must_be_sfp, DF_REF_REGNO >> (use))) >>> + { >>> + bitmap_set_bit (bba_sets_must_be_sfp, dest_regno); >>> + sfp_found = true; >>> + break; >>> + } >>> + } >> So how much is there to be gained from going from this kind of localized >> computation to global? It shouldn't be hard to turn this into a truly global >> algorithm, but I'm not sure it's worth the effort. >> >> I _do_ think it's worth visiting blocks in dominator order. It's better than >> what you're doing, but nowhere near as expensive as a global propagation >> engine. >> > Fixed in attached new patch using dom_walk class. Please review again. > >> Is it worth handling simple copies in the code above? > I thought it less likely to have a copy for pointer to stack before register allocation. Right? We certainly try to eliminate as many copies as possible, but experience would tell us that they often sneak through. I guess some quick instrumentation would tell us if it's worth the effort. > >> >>> +/* Return true if current function doesn't pass stack address out of >>> +frame. */ >>> + >>> +static bool >>> +no_stack_address_taken (void) >>> +{ >>> + basic_block bb; >>> + rtx_insn *a_insn; >>> + df_ref use; >>> + >>> + FOR_ALL_BB_FN (bb, cfun) >>> + FOR_BB_INSNS (bb, a_insn) >>> + { >>> + if (!INSN_P (a_insn)) >>> + continue; >>> + >>> + rtx sset_a = single_set (a_insn); >>> + if (!sset_a) >>> + continue; >>> + rtx src = SET_SRC (sset_a); >>> + rtx dest = SET_DEST (sset_a); >>> + >>> + /* Skip insn that is already identified as frame pointers. 
*/ >>> + if (bitmap_bit_p (bba_sets_may_be_sfp, REGNO (dest))) >>> + continue; >>> + >>> + FOR_EACH_INSN_USE (use, a_insn) >>> + { >>> + /* Check if current insn is using any stack pointer. Ignore >>> + if it doesn't use frame pointers at all. */ >>> + if (!bitmap_bit_p (bba_sets_may_be_sfp, DF_REF_REGNO (use))) >>> + continue; >>> + >>> + /* Load cannot introduce address taken. */ >>> + if (MEM_P (src)) >>> + continue; >>> + >>> + /* Store is safe if only the src doesn't contain stack pointer. */ >>> + if (MEM_P (dest) >>> + && !reg_mentioned_p (DF_REF_REG (use), src)) >>> + continue; >>> + >>> + /* All other cases potentially introduce address taken. */ >>> + return false; >>> + } >> So what if you have a PARALLEL where one entry causes an escape of a >> pointer into the stack and another entry does some other operation? If so, >> don't you miss the fact that you've got an escaping pointer into the stack? I >> don't think you can restrict yourself to single_sets here. > Yes. You are right. I have added code to handle PARALLEL case. I handle it in a rather conservative way though. Review again, please! I'll take another look at the patch. I'll also do a little experimentation to see if we're seeing enough copies to make handling them worth the effort. While I'm doing that, if you could walk through tree-ssa-phiopt.c and ponder if your transformation could fit into that framework it'd be appreciated (note the comment at the start of the file only covers one class of transformations, there are others later). jeff ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) 2019-06-20 23:13 ` Jeff Law @ 2019-06-21 12:24 ` Richard Biener 2019-06-21 16:10 ` Jeff Law 0 siblings, 1 reply; 15+ messages in thread From: Richard Biener @ 2019-06-21 12:24 UTC (permalink / raw) To: Jeff Law; +Cc: JiangNing OS, gcc-patches On Fri, Jun 21, 2019 at 1:13 AM Jeff Law <law@redhat.com> wrote: > On 6/20/19 3:53 AM, JiangNing OS wrote: > > Hi Jeff, > > > > Appreciate your effort to review my patch! I've updated my patch as > attached. See my answers below. > > > >> in current function, so the store speculation can be avoided. > >> So at a high level should we be doing this in gimple rather than RTL? > >> We're going to have a lot more information about types, better > >> infrastructure for looking at uses/defs, access to the alias oracle, we > should > >> be able to accurately distinguish between potentially shared objects vs > those > >> which are local to the thread, etc. We lose the low level costing > information > >> though. > >> > >> I'm still going to go through the patch and do some level of review, > but I do > >> think we need to answer the higher level question though. > >> > > I have the following reasons, > > > > 1) Following the clue Richard B gave me before about parameter --param > allow-store-data-races, > > I did check the middle-end pass tree-if-conv, but I think this pass at > the moment doesn't work > > for the issue I'm trying to solve. Tree-if-conv is to do if conversion > for loop, and its final goal is to > > help loop vectorization, while my case doesn't have a loop at all. > I think the fact that it's focused so much on loops is a historical > accident. We certainly have a variety of places in the gimple > optimizers that do if-conversion, and they're not all in tree-if-conv :( > For example, some are done in tree-ssa-phiopt. 
> > In the gimple optimizers the testcase from 89430 is going to look > something like this: > > > > ; basic block 2, loop depth 0, count 1073741824 (estimated locally), > maybe hot > > ;; prev block 0, next block 3, flags: (NEW, REACHABLE, VISITED) > > ;; pred: ENTRY [always] count:1073741824 (estimated locally) > (FALLTHRU,EXECUTABLE) > > a.0_1 = a; > > _2 = (long unsigned int) k_8(D); > > _3 = _2 * 4; > > _4 = a.0_1 + _3; > > _5 = *_4; > > if (_5 > b_9(D)) > > goto <bb 3>; [50.00%] > > else > > goto <bb 4>; [50.00%] > > ;; succ: 3 [50.0% (guessed)] count:536870912 (estimated > locally) (TRUE_VALUE,EXECUTABLE) > > ;; 4 [50.0% (guessed)] count:536870912 (estimated > locally) (FALSE_VALUE,EXECUTABLE) > > > > ;; basic block 3, loop depth 0, count 536870913 (estimated locally), > maybe hot > > ;; prev block 2, next block 4, flags: (NEW, REACHABLE, VISITED) > > ;; pred: 2 [50.0% (guessed)] count:536870912 (estimated > locally) (TRUE_VALUE,EXECUTABLE) > > *_4 = b_9(D); > > ;; succ: 4 [always] count:536870913 (estimated locally) > (FALLTHRU,EXECUTABLE) > > > > ;; basic block 4, loop depth 0, count 1073741824 (estimated locally), > maybe hot > > ;; prev block 3, next block 1, flags: (NEW, REACHABLE, VISITED) > > ;; pred: 3 [always] count:536870913 (estimated locally) > (FALLTHRU,EXECUTABLE) > > ;; 2 [50.0% (guessed)] count:536870912 (estimated > locally) (FALSE_VALUE,EXECUTABLE) > > return; > > That looks like a pretty easy form to analyze. I'd suggest looking > through tree-ssa-phiopt.c closely. There's several transformations in > there that share similarities with yours. > More specifically there is the conditional store elimination (cselim) pass inside this file. > > > > > > > > > > 2) My current solution fits into current back-end if-conversion pass > very well. I don't want to invent > > a new framework to solve this relatively small issue. 
Besides, this > back-end patch doesn't only > > enhance store speculation detection, but also fix a bug in the original > code. > Understood, but I still wonder if we're better off addressing this in > gimple. > Traditionally if-conversions profitability heavily depends on the target and esp. if memory is involved costing on GIMPLE is hard. That's also one reason why we turned off loop if-conversion on GIMPLE for non-vectorized code. phiopt is really about followup optimization opportunities in passes that do not handle conditionals well. cselim is on the border... > >> Just from a design standpoint, what are the consequences if this > function > >> returns true for something that isn't actually in the stack or false for > >> something that is on the stack? > >> > > If noce_mem_is_on_stack returns true for something that isn't actually > in the stack, > > it could potentially introduce store speculation, then the if-conversion > optimization > > will be incorrect. If this function returns false for something that is > on stack, it doesn't > > matter, because the optimization will not be triggered. > OK. That's what I expected. > > > > > > > > > >> > >>> + > >>> +/* Always return true, if there is a dominating write. > >>> + > >>> + When there is a dominating read from memory on stack, > >>> + 1) if x = a is a memory read, return true. > >>> + 2) if x = a is a memory write, return true if the memory is on > stack. > >>> + This is the guarantee the memory is *not* readonly. 
*/ > >>> + > >>> +static bool > >>> +noce_valid_for_dominating (basic_block bb, rtx_insn *a_insn, > >>> + const_rtx x, bool is_store) { > >>> + rtx_insn *insn; > >>> + rtx set; > >>> + > >>> + gcc_assert (MEM_P (x)); > >>> + > >>> + FOR_BB_INSNS (bb, insn) > >>> + { > >>> + set = single_set (insn); > >>> + if (!set) > >>> + continue; > >>> + > >>> + /* Dominating store */ > >>> + if (rtx_equal_p (x, SET_DEST (set))) > >>> + return true; > >>> + > >>> + /* Dominating load */ > >>> + if (rtx_equal_p (x, SET_SRC (set))) > >>> + if (is_store && noce_mem_is_on_stack (a_insn, x)) > >>> + return true; > >>> + } > >>> + > >>> + return false; > >>> +} > >> So what would be the consequences here of returning false when in fact > >> there was a dominating read or write? That could easily happen if the > >> dominating read or write was not a single_set. > > If return false when in fact there is a dominating read or write, the > optimization will not > > be triggered, because noce_mem_maybe_invalid_p will be true, which is > playing the same > > role as may_trap_or_fault_p. > > > >> > >> I'm guessing that from a design standpoint you're trying to find cases > where > >> you know an object was written to within the block and does not > escape. So > >> returning false when there was a dominating write is safe. > >> Returning true when there was not would be disastrous. Right? > >> > > YES! You've completely understood my code! 😊 > So in this context, only handling single_sets is safe. Perfect. > > > > > >> > >> > >> > >>> + > >>> + /* Iterate all insns until finding all stack pointers, and store > >>> + them into bba_sets_must_be_sfp. 
*/ bba_sets_must_be_sfp = > >>> + BITMAP_ALLOC (®_obstack); do > >>> + { > >>> + sfp_found = false; > >>> + FOR_ALL_BB_FN (bb, cfun) > >>> + FOR_BB_INSNS (bb, a_insn) > >>> + { > >>> + rtx sset_a = single_set (a_insn); > >>> + if (!sset_a) > >>> + continue; > >>> + rtx src = SET_SRC (sset_a); > >>> + rtx dest = SET_DEST (sset_a); > >> So again, we can get things that aren't a single_set here and they'll be > >> ignored. But I believe that's safe as you're trying to identify insns > which store > >> stack addresses into a REG. Missing a more complicated case where a > stack > >> address is stored into a REG is safe. Right? > >> > > Yes. Missing some pointers to stack is safe here, because we will use it > to check if a memory > > in if-conversion candidate doesn't have store speculation. If we miss > it, the optimization will > > not be triggered, so there will not be risky. > Thanks for the confirmation. > > >> > >>> + > >>> + /* Handle case like "x2 = x1 + offset", in which x1 is > already > >>> + identified as a stack pointer. */ > >>> + if ((GET_CODE (src) == PLUS || GET_CODE (src) == MINUS) > >>> + && CONST_INT_P (XEXP (src, 1))) > >>> + { > >>> + rtx x1 = XEXP (src, 0); > >>> + if (!REG_P (x1)) > >>> + continue; > >>> + > >>> + FOR_EACH_INSN_USE (use, a_insn) > >>> + { > >>> + if (!rtx_equal_p (x1, DF_REF_REG (use))) > >>> + continue; > >>> + > >>> + if (bitmap_bit_p (bba_sets_must_be_sfp, > DF_REF_REGNO > >> (use))) > >>> + { > >>> + bitmap_set_bit (bba_sets_must_be_sfp, > dest_regno); > >>> + sfp_found = true; > >>> + break; > >>> + } > >>> + } > >> So how much is there to be gained from going from this kind of localized > >> computation to global? It shouldn't be hard to turn this into a truly > global > >> algorithm, but I'm not sure it's worth the effort. > >> > >> I _do_ think it's worth visiting blocks in dominator order. It's > better than > >> what you're doing, but nowhere near as expensive as a global propagation > >> engine. 
> >> > > Fixed in attached new patch using dom_walk class. Please review again. > > > >> Is it worth handling simple copies in the code above? > > I thought it less likely to have a copy for pointer to stack before > register allocation. Right? > We certainly try to eliminate as many copies as possible, but experience > would tell us that they often sneak through. I guess some quick > instrumentation would tell us if it's worth the effort. > > > > > > > >> > >>> +/* Return true if current function doesn't pass stack address out of > >>> +frame. */ > >>> + > >>> +static bool > >>> +no_stack_address_taken (void) > >>> +{ > >>> + basic_block bb; > >>> + rtx_insn *a_insn; > >>> + df_ref use; > >>> + > >>> + FOR_ALL_BB_FN (bb, cfun) > >>> + FOR_BB_INSNS (bb, a_insn) > >>> + { > >>> + if (!INSN_P (a_insn)) > >>> + continue; > >>> + > >>> + rtx sset_a = single_set (a_insn); > >>> + if (!sset_a) > >>> + continue; > >>> + rtx src = SET_SRC (sset_a); > >>> + rtx dest = SET_DEST (sset_a); > >>> + > >>> + /* Skip insn that is already identified as frame pointers. */ > >>> + if (bitmap_bit_p (bba_sets_may_be_sfp, REGNO (dest))) > >>> + continue; > >>> + > >>> + FOR_EACH_INSN_USE (use, a_insn) > >>> + { > >>> + /* Check if current insn is using any stack pointer. > Ignore > >>> + if it doesn't use frame pointers at all. */ > >>> + if (!bitmap_bit_p (bba_sets_may_be_sfp, DF_REF_REGNO > (use))) > >>> + continue; > >>> + > >>> + /* Load cannot introduce address taken. */ > >>> + if (MEM_P (src)) > >>> + continue; > >>> + > >>> + /* Store is safe if only the src doesn't contain stack > pointer. */ > >>> + if (MEM_P (dest) > >>> + && !reg_mentioned_p (DF_REF_REG (use), src)) > >>> + continue; > >>> + > >>> + /* All other cases potentially introduce address taken. */ > >>> + return false; > >>> + } > >> So what if you have a PARALLEL where one entry causes an escape of a > >> pointer into the stack and another entry does some other operation? 
If > so, > >> don't you miss the fact that you've got an escaping pointer into the > stack? I > >> don't think you can restrict yourself to single_sets here. > > Yes. You are right. I have added code to handle PARALLEL case. I handle > it in a rather conservative way though. Review again, please! > I'll take another look at the patch. I'll also do a little > experimentation to see if we're seeing enough copies to make handling > them worth the effort. > > While I'm doing that, if you could walk through tree-ssa-phiopt.c and > ponder if your transformation could fit into that framework it'd be > appreciated (note the comment at the start of the file only covers one > class of transformations, there are others later). > > jeff > ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) 2019-06-21 12:24 ` Richard Biener @ 2019-06-21 16:10 ` Jeff Law 2019-07-08 7:43 ` JiangNing OS 0 siblings, 1 reply; 15+ messages in thread From: Jeff Law @ 2019-06-21 16:10 UTC (permalink / raw) To: Richard Biener; +Cc: JiangNing OS, gcc-patches On 6/21/19 6:23 AM, Richard Biener wrote: > > That looks like a pretty easy form to analyze. I'd suggest looking > through tree-ssa-phiopt.c closely. There's several transformations > in there that share similarities with yours. > > > More specifically there is the conditional store elimination (cselim) > pass inside this file. That's the one I was thinking of. It looks reasonably close to the cases JiangNing is trying to capture. > > > > > > > > > > >> 2) My current solution fits into current back-end if-conversion > pass very well. I don't want to invent >> a new framework to solve this relatively small issue. Besides, > this back-end patch doesn't only >> enhance store speculation detection, but also fix a bug in the > original code. Understood, but I still wonder if we're better off > addressing this in gimple. > > > Traditionally if-conversions profitability heavily depends on the > target and esp. if memory is involved costing on GIMPLE is hard. > That's also one reason why we turned off loop if-conversion on GIMPLE > for non-vectorized code. Yea. That's always the concern for transformations that aren't trivial to show are better. Conditional store elimination as implemented right now in phiopt requires a single store in the then/else clauses. So we're really just sinking the stores through a PHI. We're really not changing the number of loads or stores on any path. In the cases I think JiangNing is trying to handle we are adding a store on a path where it didn't previously exist because there is no else clause. So we likely need to check the edge probabilities to avoid doing something dumb. I don't have a good sense where the cutoff should be. 
tree-ssa-sink might have some nuggets of information to help guide. > > phiopt is really about followup optimization opportunities in passes > that do not handle conditionals well. > > cselim is on the border... ACK. In fact, looking at it it I wonder if it'd be better in tree-ssa-sink.c :-) jeff ^ permalink raw reply [flat|nested] 15+ messages in thread
* RE: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) 2019-06-21 16:10 ` Jeff Law @ 2019-07-08 7:43 ` JiangNing OS 2019-07-12 5:22 ` Andrew Pinski 2019-07-12 16:40 ` Jeff Law 0 siblings, 2 replies; 15+ messages in thread From: JiangNing OS @ 2019-07-08 7:43 UTC (permalink / raw) To: Jeff Law, Richard Biener; +Cc: gcc-patches [-- Attachment #1: Type: text/plain, Size: 2682 bytes --] Hi Jeff and Richard B., Following your tips, I've found a much simpler solution in tree-ssa-phiopt.c. Attached is the new patch. Review again, please! Thanks a lot! -Jiangning > -----Original Message----- > From: Jeff Law <law@redhat.com> > Sent: Saturday, June 22, 2019 12:10 AM > To: Richard Biener <richard.guenther@gmail.com> > Cc: JiangNing OS <jiangning@os.amperecomputing.com>; gcc- > patches@gcc.gnu.org > Subject: Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) > > On 6/21/19 6:23 AM, Richard Biener wrote: > > > > That looks like a pretty easy form to analyze. I'd suggest looking > > through tree-ssa-phiopt.c closely. There's several transformations in > > there that share similarities with yours. > > > > > > More specifically there is the conditional store elimination (cselim) > > pass inside this file. > That's the one I was thinking of. It looks reasonably close to the cases > JiangNing is trying to capture. > > > > > > > > > > > > > > > > > > > > > > > >> 2) My current solution fits into current back-end if-conversion > > pass very well. I don't want to invent > >> a new framework to solve this relatively small issue. Besides, > > this back-end patch doesn't only > >> enhance store speculation detection, but also fix a bug in the > > original code. Understood, but I still wonder if we're better off > > addressing this in gimple. > > > > > > Traditionally if-conversions profitability heavily depends on the > > target and esp. if memory is involved costing on GIMPLE is hard. 
> > That's also one reason why we turned off loop if-conversion on GIMPLE > > for non-vectorized code. > Yea. That's always the concern for transformations that aren't trivial to show > are better. > > Conditional store elimination as implemented right now in phiopt requires a > single store in the then/else clauses. So we're really just sinking the stores > through a PHI. We're really not changing the number of loads or stores on > any path. > > In the cases I think JiangNing is trying to handle we are adding a store on a > path where it didn't previously exist because there is no else clause. So we > likely need to check the edge probabilities to avoid doing something dumb. I > don't have a good sense where the cutoff should be. tree-ssa-sink might > have some nuggets of information to help guide. > > > > > phiopt is really about followup optimization opportunities in passes > > that do not handle conditionals well. > > > > cselim is on the border... > ACK. In fact, looking at it it I wonder if it'd be better in tree-ssa-sink.c :-) > > > jeff [-- Attachment #2: csel5.patch --] [-- Type: application/octet-stream, Size: 6269 bytes --] diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 9f83f75b0bf..1a7acf7f1ba 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,9 @@ +2019-07-08 Jiangning Liu <jiangning.liu@amperecomputing.com> + + PR tree-optimization/89430 + * tree-ssa-phiopt.c (cond_store_replacement): Support conditional + store elimination for local variable without address escape. + 2019-07-07 Jeff Law <law@redhat.com> PR tree-optimization/91090 diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 12e5bc167e0..65a34b43fb5 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,13 @@ +2019-07-08 Jiangning Liu <jiangning.liu@amperecomputing.com> + + PR tree-optimization/89430 + * gcc.dg/tree-ssa/pr89430-1.c: New test. + * gcc.dg/tree-ssa/pr89430-2.c: New test. + * gcc.dg/tree-ssa/pr89430-3.c: New test. 
+ * gcc.dg/tree-ssa/pr89430-4.c: New test. + * gcc.dg/tree-ssa/pr89430-5.c: New test. + * gcc.dg/tree-ssa/pr89430-6.c: New test. + 2019-07-07 Paul Thomas <pault@gcc.gnu.org> PR fortran/91077 diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr89430-1.c b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-1.c new file mode 100644 index 00000000000..8ee1850ac63 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-1.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-cselim-details" } */ + +unsigned test(unsigned k, unsigned b) { + unsigned a[2]; + if (b < a[k]) { + a[k] = b; + } + return a[0]+a[1]; +} + +/* { dg-final { scan-tree-dump "Conditional store replacement" "cselim" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr89430-2.c b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-2.c new file mode 100644 index 00000000000..9b96875ac7a --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-2.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-cselim-details" } */ + +int c; +unsigned test(unsigned k, unsigned b) { + unsigned a[2]; + a[k] = c; + if (b < a[k]) { + a[k] = b; + } + return a[0]+a[1]; +} + +/* { dg-final { scan-tree-dump "Conditional store replacement" "cselim" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr89430-3.c b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-3.c new file mode 100644 index 00000000000..0fac9f9b9c7 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-3.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-cselim-details" } */ + +unsigned a[2]; +unsigned test(unsigned k, unsigned b) { + if (b < a[k]) { + a[k] = b; + } + return a[0]+a[1]; +} + +/* { dg-final { scan-tree-dump-not "Conditional store replacement" "cselim" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr89430-4.c b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-4.c new file mode 100644 index 00000000000..54b8c11a407 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-4.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* 
{ dg-options "-O2 -fdump-tree-cselim-details" } */ + +int *p; +unsigned test(unsigned k, unsigned b) { + unsigned a[2]; + p = a; + if (b < a[k]) { + a[k] = b; + } + return a[0]+a[1]; +} + +/* { dg-final { scan-tree-dump-not "Conditional store replacement" "cselim" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr89430-5.c b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-5.c new file mode 100644 index 00000000000..b2d04119381 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-5.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-cselim-details" } */ + +int test(int b, int k) { + struct { + int data[2]; + } a; + + if (b < a.data[k]) { + a.data[k] = b; + } + + return a.data[0] + a.data[1]; +} + +/* { dg-final { scan-tree-dump "Conditional store replacement" "cselim" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr89430-6.c b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-6.c new file mode 100644 index 00000000000..8d3c4f7cc6a --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-6.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-cselim-details" } */ + +int test(int b, int k) { + typedef struct { + int x; + } SS; + struct { + SS data[2]; + } a; + + if (b < a.data[k].x) { + a.data[k].x = b; + } + + return a.data[0].x + a.data[1].x; +} + +/* { dg-final { scan-tree-dump "Conditional store replacement" "cselim" } } */ diff --git a/gcc/tree-ssa-phiopt.c b/gcc/tree-ssa-phiopt.c index 7088ff91998..2678f58be74 100644 --- a/gcc/tree-ssa-phiopt.c +++ b/gcc/tree-ssa-phiopt.c @@ -2218,8 +2218,9 @@ cond_store_replacement (basic_block middle_bb, basic_block join_bb, locus = gimple_location (assign); lhs = gimple_assign_lhs (assign); rhs = gimple_assign_rhs1 (assign); - if (TREE_CODE (lhs) != MEM_REF - || TREE_CODE (TREE_OPERAND (lhs, 0)) != SSA_NAME + if ((TREE_CODE (lhs) != MEM_REF + && TREE_CODE (lhs) != ARRAY_REF + && TREE_CODE (lhs) != COMPONENT_REF) || !is_gimple_reg_type (TREE_TYPE (lhs))) return false; @@ -2227,7 +2228,13 
@@ cond_store_replacement (basic_block middle_bb, basic_block join_bb, TREE_THIS_NOTRAP here, but in that case we also could move stores, whose value is not available readily, which we want to avoid. */ if (!nontrap->contains (lhs)) - return false; + { + /* If LHS is a local variable without address-taken, we could + always safely move down the store. */ + tree base = get_base_address (lhs); + if (!auto_var_p (base) || TREE_ADDRESSABLE (base)) + return false; + } /* Now we've checked the constraints, so do the transformation: 1) Remove the single store. */ @@ -2280,6 +2287,14 @@ cond_store_replacement (basic_block middle_bb, basic_block join_bb, else gsi_insert_before (&gsi, new_stmt, GSI_NEW_STMT); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "\nConditional store replacement happened!"); + fprintf (dump_file, "\nReplaced the store with a load."); + fprintf (dump_file, "\nInserted a new PHI statement in joint block:\n"); + print_gimple_stmt (dump_file, new_stmt, 0, TDF_VOPS|TDF_MEMSYMS); + } + return true; } ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) 2019-07-08 7:43 ` JiangNing OS @ 2019-07-12 5:22 ` Andrew Pinski 2019-07-12 16:11 ` Jeff Law 2019-07-12 16:40 ` Jeff Law 1 sibling, 1 reply; 15+ messages in thread From: Andrew Pinski @ 2019-07-12 5:22 UTC (permalink / raw) To: JiangNing OS; +Cc: Jeff Law, Richard Biener, gcc-patches On Mon, Jul 8, 2019 at 12:39 AM JiangNing OS <jiangning@os.amperecomputing.com> wrote: > > Hi Jeff and Richard B., > > Following your tips, I've found a much simpler solution in tree-ssa-phiopt.c. Attached is the new patch. Review again, please! /* Prove that we can move the store down. We could also check TREE_THIS_NOTRAP here, but in that case we also could move stores, whose value is not available readily, which we want to avoid. */ I think the comment above the change needs to be changed or extended slightly. Thanks, Andrew Pinski > > Thanks a lot! > -Jiangning > > > -----Original Message----- > > From: Jeff Law <law@redhat.com> > > Sent: Saturday, June 22, 2019 12:10 AM > > To: Richard Biener <richard.guenther@gmail.com> > > Cc: JiangNing OS <jiangning@os.amperecomputing.com>; gcc- > > patches@gcc.gnu.org > > Subject: Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) > > > > On 6/21/19 6:23 AM, Richard Biener wrote: > > > > > > That looks like a pretty easy form to analyze. I'd suggest looking > > > through tree-ssa-phiopt.c closely. There's several transformations in > > > there that share similarities with yours. > > > > > > > > > More specifically there is the conditional store elimination (cselim) > > > pass inside this file. > > That's the one I was thinking of. It looks reasonably close to the cases > > JiangNing is trying to capture. > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > >> 2) My current solution fits into current back-end if-conversion > > > pass very well. I don't want to invent > > >> a new framework to solve this relatively small issue. 
Besides, > > > this back-end patch doesn't only > > >> enhance store speculation detection, but also fix a bug in the > > > original code. Understood, but I still wonder if we're better off > > > addressing this in gimple. > > > > > > > > > Traditionally if-conversions profitability heavily depends on the > > > target and esp. if memory is involved costing on GIMPLE is hard. > > > That's also one reason why we turned off loop if-conversion on GIMPLE > > > for non-vectorized code. > > Yea. That's always the concern for transformations that aren't trivial to show > > are better. > > > > Conditional store elimination as implemented right now in phiopt requires a > > single store in the then/else clauses. So we're really just sinking the stores > > through a PHI. We're really not changing the number of loads or stores on > > any path. > > > > In the cases I think JiangNing is trying to handle we are adding a store on a > > path where it didn't previously exist because there is no else clause. So we > > likely need to check the edge probabilities to avoid doing something dumb. I > > don't have a good sense where the cutoff should be. tree-ssa-sink might > > have some nuggets of information to help guide. > > > > > > > > phiopt is really about followup optimization opportunities in passes > > > that do not handle conditionals well. > > > > > > cselim is on the border... > > ACK. In fact, looking at it it I wonder if it'd be better in tree-ssa-sink.c :-) > > > > > > jeff ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) 2019-07-12 5:22 ` Andrew Pinski @ 2019-07-12 16:11 ` Jeff Law 0 siblings, 0 replies; 15+ messages in thread From: Jeff Law @ 2019-07-12 16:11 UTC (permalink / raw) To: Andrew Pinski, JiangNing OS; +Cc: Richard Biener, gcc-patches On 7/11/19 10:08 PM, Andrew Pinski wrote: > On Mon, Jul 8, 2019 at 12:39 AM JiangNing OS > <jiangning@os.amperecomputing.com> wrote: >> >> Hi Jeff and Richard B., >> >> Following your tips, I've found a much simpler solution in tree-ssa-phiopt.c. Attached is the new patch. Review again, please! > > /* Prove that we can move the store down. We could also check > TREE_THIS_NOTRAP here, but in that case we also could move stores, > whose value is not available readily, which we want to avoid. */ > > I think the comment above the change needs to be changed or extended slightly. Actually, I don't think that comment needs to change. But the one before cond_store_replacement needs minor updating which I'll take care of. My reasoning is that addressables in the local stack are most likely going to already have their lines in the cache and thus are available readily. Jeff ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) 2019-07-08 7:43 ` JiangNing OS 2019-07-12 5:22 ` Andrew Pinski @ 2019-07-12 16:40 ` Jeff Law 1 sibling, 0 replies; 15+ messages in thread From: Jeff Law @ 2019-07-12 16:40 UTC (permalink / raw) To: JiangNing OS, Richard Biener; +Cc: gcc-patches On 7/8/19 1:39 AM, JiangNing OS wrote: > Hi Jeff and Richard B., > > Following your tips, I've found a much simpler solution in tree-ssa-phiopt.c. Attached is the new patch. Review again, please! > > Thanks a lot! > -Jiangning > >> -----Original Message----- >> From: Jeff Law <law@redhat.com> >> Sent: Saturday, June 22, 2019 12:10 AM >> To: Richard Biener <richard.guenther@gmail.com> >> Cc: JiangNing OS <jiangning@os.amperecomputing.com>; gcc- >> patches@gcc.gnu.org >> Subject: Re: [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) >> >> On 6/21/19 6:23 AM, Richard Biener wrote: >>> That looks like a pretty easy form to analyze. I'd suggest looking >>> through tree-ssa-phiopt.c closely. There's several transformations in >>> there that share similarities with yours. >>> >>> >>> More specifically there is the conditional store elimination (cselim) >>> pass inside this file. >> That's the one I was thinking of. It looks reasonably close to the cases >> JiangNing is trying to capture. >> >> >>> >>> >>> >>> >>> >>> >>> >>> >>> >>>> 2) My current solution fits into current back-end if-conversion >>> pass very well. I don't want to invent >>>> a new framework to solve this relatively small issue. Besides, >>> this back-end patch doesn't only >>>> enhance store speculation detection, but also fix a bug in the >>> original code. Understood, but I still wonder if we're better off >>> addressing this in gimple. >>> >>> >>> Traditionally if-conversions profitability heavily depends on the >>> target and esp. if memory is involved costing on GIMPLE is hard. 
>>> That's also one reason why we turned off loop if-conversion on GIMPLE >>> for non-vectorized code. >> Yea. That's always the concern for transformations that aren't trivial to show >> are better. >> >> Conditional store elimination as implemented right now in phiopt requires a >> single store in the then/else clauses. So we're really just sinking the stores >> through a PHI. We're really not changing the number of loads or stores on >> any path. >> >> In the cases I think JiangNing is trying to handle we are adding a store on a >> path where it didn't previously exist because there is no else clause. So we >> likely need to check the edge probabilities to avoid doing something dumb. I >> don't have a good sense where the cutoff should be. tree-ssa-sink might >> have some nuggets of information to help guide. >> >>> phiopt is really about followup optimization opportunities in passes >>> that do not handle conditionals well. >>> >>> cselim is on the border... >> ACK. In fact, looking at it it I wonder if it'd be better in tree-ssa-sink.c :-) >> >> >> jeff > > csel5.patch > > diff --git a/gcc/ChangeLog b/gcc/ChangeLog > index 9f83f75b0bf..1a7acf7f1ba 100644 > --- a/gcc/ChangeLog > +++ b/gcc/ChangeLog > @@ -1,3 +1,9 @@ > +2019-07-08 Jiangning Liu <jiangning.liu@amperecomputing.com> > + > + PR tree-optimization/89430 > + * tree-ssa-phiopt.c (cond_store_replacement): Support conditional > + store elimination for local variable without address escape. > + > 2019-07-07 Jeff Law <law@redhat.com> > > PR tree-optimization/91090 > diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog > index 12e5bc167e0..65a34b43fb5 100644 > --- a/gcc/testsuite/ChangeLog > +++ b/gcc/testsuite/ChangeLog > @@ -1,3 +1,13 @@ > +2019-07-08 Jiangning Liu <jiangning.liu@amperecomputing.com> > + > + PR tree-optimization/89430 > + * gcc.dg/tree-ssa/pr89430-1.c: New test. > + * gcc.dg/tree-ssa/pr89430-2.c: New test. > + * gcc.dg/tree-ssa/pr89430-3.c: New test. 
> + * gcc.dg/tree-ssa/pr89430-4.c: New test. > + * gcc.dg/tree-ssa/pr89430-5.c: New test. > + * gcc.dg/tree-ssa/pr89430-6.c: New test. THanks! I made a trivial update to the comment before cond_store_replacement and installed this patch on the trunk. jeff ^ permalink raw reply [flat|nested] 15+ messages in thread
end of thread, other threads:[~2019-07-12 16:29 UTC | newest] Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2019-03-15 8:22 [PATCH] improve ifcvt optimization (PR rtl-optimization/89430) JiangNing OS 2019-03-18 21:53 ` Jeff Law 2019-04-30 16:00 ` Jeff Law 2019-05-05 0:11 ` JiangNing OS 2019-05-24 14:54 ` Kyrill Tkachov 2019-06-17 0:36 ` JiangNing OS 2019-06-17 21:59 ` Jeff Law 2019-06-20 9:53 ` JiangNing OS 2019-06-20 23:13 ` Jeff Law 2019-06-21 12:24 ` Richard Biener 2019-06-21 16:10 ` Jeff Law 2019-07-08 7:43 ` JiangNing OS 2019-07-12 5:22 ` Andrew Pinski 2019-07-12 16:11 ` Jeff Law 2019-07-12 16:40 ` Jeff Law
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).